From 0b57e367199a4fbbd70cbb2a7f89b929d16d268b Mon Sep 17 00:00:00 2001 From: Davide Angelocola Date: Tue, 23 Jun 2026 21:25:21 +0200 Subject: [PATCH] refactor(array): dedup Lazy{Sparse,Rle} walk into shared helpers (benchmark-gated) Removes the copy-pasted walk loops across the Lazy*Array records using the right sharing mechanism for each, verified by a new JMH microbench (LazyArrayWalkBenchmark) so the hot lazy iteration paths don't regress. - Sparse: LazySparse{Int,Long,Double,Float}Array delegate the patch-walk to the existing SparseArrays.walkPatches (Byte/Short/Bool already did). - RLE: the chunk-boundary loop is hoisted into RleArrays.walkChunks(.., ChunkVisitor), which fires once per chunk (~length/1024); each LazyRle record keeps its typed per-row processChunk/foldChunk inline, so the values[...] read stays a direct, monomorphic array access. A first RLE attempt shared the per-row emit through a sink and regressed multi-value fold ~35% (megamorphic per-element call); the benchmark caught it and drove the switch to the per-chunk visitor. Final numbers (M5, JDK 25, fold over 1M rows, inline main -> shipped): rleFoldIntMulti 3.47 -> 3.41; rleFoldIntConst 49.7 -> 49.9; rleFoldLongMulti 3.37 -> 3.34; sparseFoldInt 16.7 -> 43.4; rleFoldByteMulti 2.99 -> 3.18; sparseFoldLong 41.0 -> 44.9. Lazy* stay records (ADR-0010): only the control-flow walk is shared; a superclass would force the hot value read behind a virtual call. 
Co-Authored-By: Claude Opus 4.8 --- .../performance/LazyArrayWalkBenchmark.java | 205 ++++++++++++++++++ .../vortex/reader/array/LazyRleByteArray.java | 18 +- .../vortex/reader/array/LazyRleIntArray.java | 14 +- .../vortex/reader/array/LazyRleLongArray.java | 14 +- .../reader/array/LazyRleShortArray.java | 18 +- .../reader/array/LazySparseDoubleArray.java | 30 +-- .../reader/array/LazySparseFloatArray.java | 30 +-- .../reader/array/LazySparseIntArray.java | 30 +-- .../reader/array/LazySparseLongArray.java | 30 +-- .../dfa1/vortex/reader/array/RleArrays.java | 41 ++++ 10 files changed, 274 insertions(+), 156 deletions(-) create mode 100644 performance/src/main/java/io/github/dfa1/vortex/performance/LazyArrayWalkBenchmark.java diff --git a/performance/src/main/java/io/github/dfa1/vortex/performance/LazyArrayWalkBenchmark.java b/performance/src/main/java/io/github/dfa1/vortex/performance/LazyArrayWalkBenchmark.java new file mode 100644 index 00000000..835c820a --- /dev/null +++ b/performance/src/main/java/io/github/dfa1/vortex/performance/LazyArrayWalkBenchmark.java @@ -0,0 +1,205 @@ +package io.github.dfa1.vortex.performance; + +import io.github.dfa1.vortex.core.DType; +import io.github.dfa1.vortex.core.PType; +import io.github.dfa1.vortex.reader.array.Array; +import io.github.dfa1.vortex.reader.array.DoubleArray; +import io.github.dfa1.vortex.reader.array.FloatArray; +import io.github.dfa1.vortex.reader.array.IntArray; +import io.github.dfa1.vortex.reader.array.LazyRleByteArray; +import io.github.dfa1.vortex.reader.array.LazyRleIntArray; +import io.github.dfa1.vortex.reader.array.LazyRleLongArray; +import io.github.dfa1.vortex.reader.array.LazySparseDoubleArray; +import io.github.dfa1.vortex.reader.array.LazySparseFloatArray; +import io.github.dfa1.vortex.reader.array.LazySparseIntArray; +import io.github.dfa1.vortex.reader.array.LazySparseLongArray; +import io.github.dfa1.vortex.reader.array.LongArray; +import 
io.github.dfa1.vortex.reader.array.MaterializedDoubleArray; +import io.github.dfa1.vortex.reader.array.MaterializedFloatArray; +import io.github.dfa1.vortex.reader.array.MaterializedIntArray; +import io.github.dfa1.vortex.reader.array.MaterializedLongArray; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; +import java.util.concurrent.TimeUnit; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +/// Microbenchmark for the `Lazy{Sparse,Rle}` `fold` / `forEach` walk paths. +/// +/// Exercises every value-type whose walk was hoisted into the shared +/// `SparseArrays.walkPatches` / `RleArrays.walkChunks` helpers, all in one JVM so +/// the shared walker call sites see multiple lambda implementations — the +/// megamorphic condition that a regression (lost inlining / vectorization on +/// the fill / constant-run loops) would show up under. Compare results against +/// the pre-refactor inlined implementations on `main`. 
+@State(Scope.Benchmark) +@BenchmarkMode(Mode.Throughput) +@OutputTimeUnit(TimeUnit.MILLISECONDS) +@Warmup(iterations = 3, time = 2) +@Measurement(iterations = 5, time = 2) +@Fork(1) +public class LazyArrayWalkBenchmark { + + private static final int ROWS = 1_000_000; + private static final int NUM_CHUNKS = (ROWS + 1023) / 1024; + private static final int SPARSE_PATCHES = 1_000; + + private static final DType I32 = new DType.Primitive(PType.I32, false); + private static final DType I64 = new DType.Primitive(PType.I64, false); + private static final DType I8 = new DType.Primitive(PType.I8, false); + private static final DType F64 = new DType.Primitive(PType.F64, false); + private static final DType F32 = new DType.Primitive(PType.F32, false); + + private Arena arena; + + private LazySparseIntArray sparseInt; + private LazySparseLongArray sparseLong; + private LazySparseDoubleArray sparseDouble; + private LazySparseFloatArray sparseFloat; + + private LazyRleIntArray rleIntConst; + private LazyRleIntArray rleIntMulti; + private LazyRleLongArray rleLongMulti; + private LazyRleByteArray rleByteMulti; + + @Setup + public void setup() { + arena = Arena.ofShared(); + + // Sparse: mostly fill, few patches — stresses the fill-emission walk. + int[] patchIdx = new int[SPARSE_PATCHES]; + for (int k = 0; k < SPARSE_PATCHES; k++) { + patchIdx[k] = k * (ROWS / SPARSE_PATCHES); + } + Array idxArr = intArray(patchIdx); + sparseInt = new LazySparseIntArray(I32, ROWS, 7, intArrayValues(SPARSE_PATCHES), idxArr, 0L); + sparseLong = new LazySparseLongArray(I64, ROWS, 7L, longArrayValues(SPARSE_PATCHES), idxArr, 0L); + sparseDouble = new LazySparseDoubleArray(F64, ROWS, 1.5, doubleArrayValues(SPARSE_PATCHES), idxArr, 0L); + sparseFloat = new LazySparseFloatArray(F32, ROWS, 1.5f, floatArrayValues(SPARSE_PATCHES), idxArr, 0L); + + // RLE constant runs: one distinct value per chunk — constant fast path. 
+ long[] constOffsets = new long[NUM_CHUNKS]; + int[] constValuesI = new int[NUM_CHUNKS]; + for (int c = 0; c < NUM_CHUNKS; c++) { + constOffsets[c] = c; + constValuesI[c] = c; + } + rleIntConst = new LazyRleIntArray(I32, ROWS, constValuesI, new int[NUM_CHUNKS * 1024], + constOffsets, 0L, NUM_CHUNKS, NUM_CHUNKS, 0); + + // RLE multi-value: 4 distinct values per chunk — per-row emit path. + int valsPerChunk = 4; + long[] multiOffsets = new long[NUM_CHUNKS]; + int[] multiValuesI = new int[NUM_CHUNKS * valsPerChunk]; + long[] multiValuesL = new long[NUM_CHUNKS * valsPerChunk]; + byte[] multiValuesB = new byte[NUM_CHUNKS * valsPerChunk]; + for (int c = 0; c < NUM_CHUNKS; c++) { + multiOffsets[c] = (long) c * valsPerChunk; + for (int j = 0; j < valsPerChunk; j++) { + multiValuesI[c * valsPerChunk + j] = c * 10 + j; + multiValuesL[c * valsPerChunk + j] = c * 10L + j; + multiValuesB[c * valsPerChunk + j] = (byte) (c + j); + } + } + int[] multiIndices = new int[NUM_CHUNKS * 1024]; + for (int r = 0; r < multiIndices.length; r++) { + multiIndices[r] = r & (valsPerChunk - 1); + } + long valuesLen = (long) NUM_CHUNKS * valsPerChunk; + rleIntMulti = new LazyRleIntArray(I32, ROWS, multiValuesI, multiIndices, + multiOffsets, 0L, valuesLen, NUM_CHUNKS, 0); + rleLongMulti = new LazyRleLongArray(I64, ROWS, multiValuesL, multiIndices, + multiOffsets, 0L, valuesLen, NUM_CHUNKS, 0); + rleByteMulti = new LazyRleByteArray(I8, ROWS, multiValuesB, multiIndices, + multiOffsets, 0L, valuesLen, NUM_CHUNKS, 0, false); + } + + @Benchmark + public int sparseFoldInt() { + return sparseInt.fold(0, Integer::sum); + } + + @Benchmark + public long sparseFoldLong() { + return sparseLong.fold(0L, Long::sum); + } + + @Benchmark + public double sparseFoldDouble() { + return sparseDouble.fold(0.0, Double::sum); + } + + @Benchmark + public double sparseFoldFloat() { + return sparseFloat.fold(0.0, Double::sum); + } + + @Benchmark + public int rleFoldIntConst() { + return rleIntConst.fold(0, Integer::sum); + 
} + + @Benchmark + public int rleFoldIntMulti() { + return rleIntMulti.fold(0, Integer::sum); + } + + @Benchmark + public long rleFoldLongMulti() { + return rleLongMulti.fold(0L, Long::sum); + } + + @Benchmark + public long rleFoldByteMulti() { + return rleByteMulti.fold(0L, Long::sum); + } + + private IntArray intArrayValues(int n) { + int[] vs = new int[n]; + for (int i = 0; i < n; i++) { + vs[i] = i; + } + return intArray(vs); + } + + private IntArray intArray(int[] vs) { + MemorySegment seg = arena.allocate(vs.length * 4L, 4); + for (int i = 0; i < vs.length; i++) { + seg.setAtIndex(ValueLayout.JAVA_INT, i, vs[i]); + } + return new MaterializedIntArray(I32, vs.length, seg.asReadOnly()); + } + + private LongArray longArrayValues(int n) { + MemorySegment seg = arena.allocate(n * 8L, 8); + for (int i = 0; i < n; i++) { + seg.setAtIndex(ValueLayout.JAVA_LONG, i, i); + } + return new MaterializedLongArray(I64, n, seg.asReadOnly()); + } + + private DoubleArray doubleArrayValues(int n) { + MemorySegment seg = arena.allocate(n * 8L, 8); + for (int i = 0; i < n; i++) { + seg.setAtIndex(ValueLayout.JAVA_DOUBLE, i, i + 0.25); + } + return new MaterializedDoubleArray(F64, n, seg.asReadOnly()); + } + + private FloatArray floatArrayValues(int n) { + MemorySegment seg = arena.allocate(n * 4L, 4); + for (int i = 0; i < n; i++) { + seg.setAtIndex(ValueLayout.JAVA_FLOAT, i, i + 0.25f); + } + return new MaterializedFloatArray(F32, n, seg.asReadOnly()); + } +} diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java index 8535c0e2..6fc81b70 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleByteArray.java @@ -52,20 +52,10 @@ public int getInt(long i) { @Override public long fold(long identity, LongBinaryOperator op) { - long acc = identity; - long n = length; - 
long emitted = 0; - int absRow = offset; - int startChunk = absRow >>> RleArrays.FL_LOG2; - for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) { - int rowInChunk = absRow - chunkIdx * RleArrays.FL_CHUNK_SIZE; - int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted)); - acc = foldChunk(chunkIdx, rowInChunk, end, acc, op); - int count = end - rowInChunk; - emitted += count; - absRow += count; - } - return acc; + long[] acc = {identity}; + RleArrays.walkChunks(length, offset, numChunks, + (chunkIdx, rowInChunk, end) -> acc[0] = foldChunk(chunkIdx, rowInChunk, end, acc[0], op)); + return acc[0]; } private long widen(byte v) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java index 4e23ed7c..1d6d0c76 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleIntArray.java @@ -42,18 +42,8 @@ public int getInt(long i) { @Override public void forEachInt(IntConsumer c) { - long n = length; - long emitted = 0; - int absRow = offset; - int startChunk = absRow >>> RleArrays.FL_LOG2; - for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) { - int rowInChunk = absRow - chunkIdx * RleArrays.FL_CHUNK_SIZE; - int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted)); - processChunk(chunkIdx, rowInChunk, end, c); - int count = end - rowInChunk; - emitted += count; - absRow += count; - } + RleArrays.walkChunks(length, offset, numChunks, + (chunkIdx, rowInChunk, end) -> processChunk(chunkIdx, rowInChunk, end, c)); } private void processChunk(int chunkIdx, int rowInChunk, int end, IntConsumer c) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java index 
5f8c1374..5dfd4870 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleLongArray.java @@ -52,18 +52,8 @@ public long getLong(long i) { @Override public void forEachLong(LongConsumer c) { - long n = length; - long emitted = 0; - int absRow = offset; - int startChunk = absRow >>> RleArrays.FL_LOG2; - for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) { - int rowInChunk = absRow - chunkIdx * RleArrays.FL_CHUNK_SIZE; - int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted)); - processChunk(chunkIdx, rowInChunk, end, c); - int count = end - rowInChunk; - emitted += count; - absRow += count; - } + RleArrays.walkChunks(length, offset, numChunks, + (chunkIdx, rowInChunk, end) -> processChunk(chunkIdx, rowInChunk, end, c)); } private void processChunk(int chunkIdx, int rowInChunk, int end, LongConsumer c) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java index b80f9655..254a0bae 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazyRleShortArray.java @@ -52,20 +52,10 @@ public int getInt(long i) { @Override public long fold(long identity, LongBinaryOperator op) { - long acc = identity; - long n = length; - long emitted = 0; - int absRow = offset; - int startChunk = absRow >>> RleArrays.FL_LOG2; - for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < n; chunkIdx++) { - int rowInChunk = absRow - chunkIdx * RleArrays.FL_CHUNK_SIZE; - int end = Math.min(RleArrays.FL_CHUNK_SIZE, rowInChunk + (int) (n - emitted)); - acc = foldChunk(chunkIdx, rowInChunk, end, acc, op); - int count = end - rowInChunk; - emitted += count; - absRow += count; - } - return acc; + long[] acc = {identity}; + 
RleArrays.walkChunks(length, offset, numChunks, + (chunkIdx, rowInChunk, end) -> acc[0] = foldChunk(chunkIdx, rowInChunk, end, acc[0], op)); + return acc[0]; } private long widen(short v) { diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseDoubleArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseDoubleArray.java index b766d95f..628ec4e2 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseDoubleArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseDoubleArray.java @@ -29,32 +29,10 @@ public double getDouble(long i) { @Override public void forEachDouble(DoubleConsumer c) { - if (patchValues == null) { - for (long r = 0; r < length; r++) { - c.accept(fillValue); - } - return; - } - long numPatches = patchValues.length(); - long absStart = offset; - long absEnd = offset + length; - int p = SparseArrays.findFirstAtOrAfter(patchIndices, numPatches, absStart); - long pos = absStart; - while (pos < absEnd && p < numPatches) { - long patchAbs = SparseArrays.readPatchIdx(patchIndices, p); - if (patchAbs >= absEnd) { - break; - } - for (long r = pos; r < patchAbs; r++) { - c.accept(fillValue); - } - c.accept(patchValues.getDouble(p)); - pos = patchAbs + 1; - p++; - } - for (long r = pos; r < absEnd; r++) { - c.accept(fillValue); - } + long numPatches = patchValues == null ? 
0 : patchValues.length(); + SparseArrays.walkPatches(patchIndices, numPatches, offset, offset + length, + () -> c.accept(fillValue), + p -> c.accept(patchValues.getDouble(p))); } @Override diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseFloatArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseFloatArray.java index 54245d49..40b9e24f 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseFloatArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseFloatArray.java @@ -29,32 +29,10 @@ public float getFloat(long i) { @Override public double fold(double identity, DoubleBinaryOperator op) { double[] acc = {identity}; - if (patchValues == null) { - for (long r = 0; r < length; r++) { - acc[0] = op.applyAsDouble(acc[0], fillValue); - } - return acc[0]; - } - long numPatches = patchValues.length(); - long absStart = offset; - long absEnd = offset + length; - int p = SparseArrays.findFirstAtOrAfter(patchIndices, numPatches, absStart); - long pos = absStart; - while (pos < absEnd && p < numPatches) { - long patchAbs = SparseArrays.readPatchIdx(patchIndices, p); - if (patchAbs >= absEnd) { - break; - } - for (long r = pos; r < patchAbs; r++) { - acc[0] = op.applyAsDouble(acc[0], fillValue); - } - acc[0] = op.applyAsDouble(acc[0], patchValues.getFloat(p)); - pos = patchAbs + 1; - p++; - } - for (long r = pos; r < absEnd; r++) { - acc[0] = op.applyAsDouble(acc[0], fillValue); - } + long numPatches = patchValues == null ? 
0 : patchValues.length(); + SparseArrays.walkPatches(patchIndices, numPatches, offset, offset + length, + () -> acc[0] = op.applyAsDouble(acc[0], fillValue), + p -> acc[0] = op.applyAsDouble(acc[0], patchValues.getFloat(p))); return acc[0]; } } diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseIntArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseIntArray.java index 8d342727..e4e39857 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseIntArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseIntArray.java @@ -29,32 +29,10 @@ public int getInt(long i) { @Override public void forEachInt(IntConsumer c) { - if (patchValues == null) { - for (long r = 0; r < length; r++) { - c.accept(fillValue); - } - return; - } - long numPatches = patchValues.length(); - long absStart = offset; - long absEnd = offset + length; - int p = SparseArrays.findFirstAtOrAfter(patchIndices, numPatches, absStart); - long pos = absStart; - while (pos < absEnd && p < numPatches) { - long patchAbs = SparseArrays.readPatchIdx(patchIndices, p); - if (patchAbs >= absEnd) { - break; - } - for (long r = pos; r < patchAbs; r++) { - c.accept(fillValue); - } - c.accept(patchValues.getInt(p)); - pos = patchAbs + 1; - p++; - } - for (long r = pos; r < absEnd; r++) { - c.accept(fillValue); - } + long numPatches = patchValues == null ? 
0 : patchValues.length(); + SparseArrays.walkPatches(patchIndices, numPatches, offset, offset + length, + () -> c.accept(fillValue), + p -> c.accept(patchValues.getInt(p))); } @Override diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseLongArray.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseLongArray.java index 5247bed1..1725925b 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseLongArray.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/LazySparseLongArray.java @@ -37,32 +37,10 @@ public long getLong(long i) { @Override public void forEachLong(LongConsumer c) { - if (patchValues == null) { - for (long r = 0; r < length; r++) { - c.accept(fillValue); - } - return; - } - long numPatches = patchValues.length(); - long absStart = offset; - long absEnd = offset + length; - int p = SparseArrays.findFirstAtOrAfter(patchIndices, numPatches, absStart); - long pos = absStart; - while (pos < absEnd && p < numPatches) { - long patchAbs = SparseArrays.readPatchIdx(patchIndices, p); - if (patchAbs >= absEnd) { - break; - } - for (long r = pos; r < patchAbs; r++) { - c.accept(fillValue); - } - c.accept(patchValues.getLong(p)); - pos = patchAbs + 1; - p++; - } - for (long r = pos; r < absEnd; r++) { - c.accept(fillValue); - } + long numPatches = patchValues == null ? 
0 : patchValues.length(); + SparseArrays.walkPatches(patchIndices, numPatches, offset, offset + length, + () -> c.accept(fillValue), + p -> c.accept(patchValues.getLong(p))); } @Override diff --git a/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java b/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java index 3709baee..27c23e38 100644 --- a/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java +++ b/reader/src/main/java/io/github/dfa1/vortex/reader/array/RleArrays.java @@ -39,4 +39,45 @@ static int chunkValueCount(int chunkIdx, int numChunks, long[] valuesIdxOffsets, : valuesLen; return (int) (end - start); } + + /// Visitor for [#walkChunks]. Invoked once per covered chunk with the + /// chunk's local row span `[rowInChunk, end)`; the implementation runs its + /// own typed inner loop over that span. + @FunctionalInterface + interface ChunkVisitor { + + /// Processes one chunk's row span. + /// + /// @param chunkIdx chunk index in `[0, numChunks)` + /// @param rowInChunk first local row to emit (inclusive) + /// @param end one past the last local row to emit + void visit(int chunkIdx, int rowInChunk, int end); + } + + /// Walks the logical range `[offset, offset + length)` chunk by chunk, + /// calling `visitor` once per chunk with the local `[rowInChunk, end)` span. + /// + /// Centralises the FastLanes chunk-boundary arithmetic shared by every + /// `LazyRleXxxArray` record's `forEach` / `fold`. The visitor — not this + /// method — owns the per-row loop, so the typed `values[...]` read stays a + /// direct, monomorphic array access; this call fires once per chunk + /// (≈ `length / 1024` times), never per row. 
+ /// + /// @param length logical row count to emit + /// @param offset starting absolute position + /// @param numChunks total chunk count + /// @param visitor receives each chunk's local row span + static void walkChunks(long length, int offset, int numChunks, ChunkVisitor visitor) { + long emitted = 0; + int absRow = offset; + int startChunk = absRow >>> FL_LOG2; + for (int chunkIdx = startChunk; chunkIdx < numChunks && emitted < length; chunkIdx++) { + int rowInChunk = absRow - chunkIdx * FL_CHUNK_SIZE; + int end = Math.min(FL_CHUNK_SIZE, rowInChunk + (int) (length - emitted)); + visitor.visit(chunkIdx, rowInChunk, end); + int count = end - rowInChunk; + emitted += count; + absRow += count; + } + } }