diff --git a/README.md b/README.md index 0246789..5888603 100644 --- a/README.md +++ b/README.md @@ -101,8 +101,9 @@ Most others do not. The Java Team at Intel (R) introduced the vector implementation for FastPFOR based on the Java Vector API that showed significant gains over the non-vectorized implementation. For an example usage, see -examples/vector/Example.java. The feature requires JDK 19+ and is currently for -advanced users. +examples/vector/Example.java. On aarch64 (e.g. Graviton3), use JDK 24 or +later: earlier releases lack the Vector API SVE intrinsics and run a fallback +that is slower than the scalar codec. JavaFastPFOR as a dependency ------------------------ diff --git a/pom.xml b/pom.xml index 180a49d..5594a43 100644 --- a/pom.xml +++ b/pom.xml @@ -90,33 +90,17 @@ 21 21 + + --add-modules + jdk.incubator.vector + + + + + maven-surefire-plugin + + --add-modules jdk.incubator.vector - - - default-compile - compile - - compile - - - - me/lemire/integercompression/vector/* - module-info.java - - - - - - org.apache.felix diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java index 9b2e1ca..1b0df04 100644 --- a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java +++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker.java @@ -15,77 +15,42 @@ * classes. 
* */ -public class VectorBitPacker { +public class VectorBitPacker implements VectorBitPackerKernels { private static final VectorSpecies SPECIES_512 = IntVector.SPECIES_512; - private static final VectorSpecies SPECIES_256 = - IntVector.SPECIES_256; private static final int VLEN_512 = 16; - private static final int VLEN_256 = 8; private static final int BLOCK_SIZE = 256; - private static final IntVector MASK_1 = - IntVector.broadcast(SPECIES_256, (1 << 1) - 1); private static final IntVector MASK_2 = IntVector.broadcast(SPECIES_512, (1 << 2) - 1); - private static final IntVector MASK_3 = - IntVector.broadcast(SPECIES_256, (1 << 3) - 1); private static final IntVector MASK_4 = IntVector.broadcast(SPECIES_512, (1 << 4) - 1); - private static final IntVector MASK_5 = - IntVector.broadcast(SPECIES_256, (1 << 5) - 1); private static final IntVector MASK_6 = IntVector.broadcast(SPECIES_512, (1 << 6) - 1); - private static final IntVector MASK_7 = - IntVector.broadcast(SPECIES_256, (1 << 7) - 1); private static final IntVector MASK_8 = IntVector.broadcast(SPECIES_512, (1 << 8) - 1); - private static final IntVector MASK_9 = - IntVector.broadcast(SPECIES_256, (1 << 9) - 1); private static final IntVector MASK_10 = IntVector.broadcast(SPECIES_512, (1 << 10) - 1); - private static final IntVector MASK_11 = - IntVector.broadcast(SPECIES_256, (1 << 11) - 1); private static final IntVector MASK_12 = IntVector.broadcast(SPECIES_512, (1 << 12) - 1); - private static final IntVector MASK_13 = - IntVector.broadcast(SPECIES_256, (1 << 13) - 1); private static final IntVector MASK_14 = IntVector.broadcast(SPECIES_512, (1 << 14) - 1); - private static final IntVector MASK_15 = - IntVector.broadcast(SPECIES_256, (1 << 15) - 1); private static final IntVector MASK_16 = IntVector.broadcast(SPECIES_512, (1 << 16) - 1); - private static final IntVector MASK_17 = - IntVector.broadcast(SPECIES_256, (1 << 17) - 1); private static final IntVector MASK_18 = IntVector.broadcast(SPECIES_512, (1 
<< 18) - 1); - private static final IntVector MASK_19 = - IntVector.broadcast(SPECIES_256, (1 << 19) - 1); private static final IntVector MASK_20 = IntVector.broadcast(SPECIES_512, (1 << 20) - 1); - private static final IntVector MASK_21 = - IntVector.broadcast(SPECIES_256, (1 << 21) - 1); private static final IntVector MASK_22 = IntVector.broadcast(SPECIES_512, (1 << 22) - 1); - private static final IntVector MASK_23 = - IntVector.broadcast(SPECIES_256, (1 << 23) - 1); private static final IntVector MASK_24 = IntVector.broadcast(SPECIES_512, (1 << 24) - 1); - private static final IntVector MASK_25 = - IntVector.broadcast(SPECIES_256, (1 << 25) - 1); private static final IntVector MASK_26 = IntVector.broadcast(SPECIES_512, (1 << 26) - 1); - private static final IntVector MASK_27 = - IntVector.broadcast(SPECIES_256, (1 << 27) - 1); private static final IntVector MASK_28 = IntVector.broadcast(SPECIES_512, (1 << 28) - 1); - private static final IntVector MASK_29 = - IntVector.broadcast(SPECIES_256, (1 << 29) - 1); private static final IntVector MASK_30 = IntVector.broadcast(SPECIES_512, (1 << 30) - 1); - private static final IntVector MASK_31 = - IntVector.broadcast(SPECIES_256, (1 << 31) - 1); /** * Pack 32 integers @@ -101,103 +66,104 @@ public class VectorBitPacker { * @param b * number of bits to use per integer */ - public static void fastpack(final int[] in, int inpos, final int[] out, - int outpos, int b) { + @Override + public void fastpack(final int[] in, int inpos, final int[] out, + int outpos, int b) { switch (b) { case 0: break; case 1: - fastpack1(in, inpos, out, outpos); + VectorBitPacker256.fastpack1(in, inpos, out, outpos); break; case 2: fastpack2(in, inpos, out, outpos); break; case 3: - fastpack3(in, inpos, out, outpos); + VectorBitPacker256.fastpack3(in, inpos, out, outpos); break; case 4: fastpack4(in, inpos, out, outpos); break; case 5: - fastpack5(in, inpos, out, outpos); + VectorBitPacker256.fastpack5(in, inpos, out, outpos); break; case 6: 
fastpack6(in, inpos, out, outpos); break; case 7: - fastpack7(in, inpos, out, outpos); + VectorBitPacker256.fastpack7(in, inpos, out, outpos); break; case 8: fastpack8(in, inpos, out, outpos); break; case 9: - fastpack9(in, inpos, out, outpos); + VectorBitPacker256.fastpack9(in, inpos, out, outpos); break; case 10: fastpack10(in, inpos, out, outpos); break; case 11: - fastpack11(in, inpos, out, outpos); + VectorBitPacker256.fastpack11(in, inpos, out, outpos); break; case 12: fastpack12(in, inpos, out, outpos); break; case 13: - fastpack13(in, inpos, out, outpos); + VectorBitPacker256.fastpack13(in, inpos, out, outpos); break; case 14: fastpack14(in, inpos, out, outpos); break; case 15: - fastpack15(in, inpos, out, outpos); + VectorBitPacker256.fastpack15(in, inpos, out, outpos); break; case 16: fastpack16(in, inpos, out, outpos); break; case 17: - fastpack17(in, inpos, out, outpos); + VectorBitPacker256.fastpack17(in, inpos, out, outpos); break; case 18: fastpack18(in, inpos, out, outpos); break; case 19: - fastpack19(in, inpos, out, outpos); + VectorBitPacker256.fastpack19(in, inpos, out, outpos); break; case 20: fastpack20(in, inpos, out, outpos); break; case 21: - fastpack21(in, inpos, out, outpos); + VectorBitPacker256.fastpack21(in, inpos, out, outpos); break; case 22: fastpack22(in, inpos, out, outpos); break; case 23: - fastpack23(in, inpos, out, outpos); + VectorBitPacker256.fastpack23(in, inpos, out, outpos); break; case 24: fastpack24(in, inpos, out, outpos); break; case 25: - fastpack25(in, inpos, out, outpos); + VectorBitPacker256.fastpack25(in, inpos, out, outpos); break; case 26: fastpack26(in, inpos, out, outpos); break; case 27: - fastpack27(in, inpos, out, outpos); + VectorBitPacker256.fastpack27(in, inpos, out, outpos); break; case 28: fastpack28(in, inpos, out, outpos); break; case 29: - fastpack29(in, inpos, out, outpos); + VectorBitPacker256.fastpack29(in, inpos, out, outpos); break; case 30: fastpack30(in, inpos, out, outpos); break; case 31: 
- fastpack31(in, inpos, out, outpos); + VectorBitPacker256.fastpack31(in, inpos, out, outpos); break; case 32: System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); @@ -205,103 +171,104 @@ public static void fastpack(final int[] in, int inpos, final int[] out, } } - static void fastpackNoMask(final int[] in, int inpos, final int[] out, + @Override + public void fastpackNoMask(final int[] in, int inpos, final int[] out, int outpos, int b) { switch (b) { case 0: break; case 1: - fastpackNoMask1(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask1(in, inpos, out, outpos); break; case 2: fastpackNoMask2(in, inpos, out, outpos); break; case 3: - fastpackNoMask3(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask3(in, inpos, out, outpos); break; case 4: fastpackNoMask4(in, inpos, out, outpos); break; case 5: - fastpackNoMask5(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask5(in, inpos, out, outpos); break; case 6: fastpackNoMask6(in, inpos, out, outpos); break; case 7: - fastpackNoMask7(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask7(in, inpos, out, outpos); break; case 8: fastpackNoMask8(in, inpos, out, outpos); break; case 9: - fastpackNoMask9(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask9(in, inpos, out, outpos); break; case 10: fastpackNoMask10(in, inpos, out, outpos); break; case 11: - fastpackNoMask11(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask11(in, inpos, out, outpos); break; case 12: fastpackNoMask12(in, inpos, out, outpos); break; case 13: - fastpackNoMask13(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask13(in, inpos, out, outpos); break; case 14: fastpackNoMask14(in, inpos, out, outpos); break; case 15: - fastpackNoMask15(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask15(in, inpos, out, outpos); break; case 16: fastpackNoMask16(in, inpos, out, outpos); break; case 17: - fastpackNoMask17(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask17(in, inpos, 
out, outpos); break; case 18: fastpackNoMask18(in, inpos, out, outpos); break; case 19: - fastpackNoMask19(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask19(in, inpos, out, outpos); break; case 20: fastpackNoMask20(in, inpos, out, outpos); break; case 21: - fastpackNoMask21(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask21(in, inpos, out, outpos); break; case 22: fastpackNoMask22(in, inpos, out, outpos); break; case 23: - fastpackNoMask23(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask23(in, inpos, out, outpos); break; case 24: fastpackNoMask24(in, inpos, out, outpos); break; case 25: - fastpackNoMask25(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask25(in, inpos, out, outpos); break; case 26: fastpackNoMask26(in, inpos, out, outpos); break; case 27: - fastpackNoMask27(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask27(in, inpos, out, outpos); break; case 28: fastpackNoMask28(in, inpos, out, outpos); break; case 29: - fastpackNoMask29(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask29(in, inpos, out, outpos); break; case 30: fastpackNoMask30(in, inpos, out, outpos); break; case 31: - fastpackNoMask31(in, inpos, out, outpos); + VectorBitPacker256.fastpackNoMask31(in, inpos, out, outpos); break; case 32: System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); @@ -323,104 +290,105 @@ static void fastpackNoMask(final int[] in, int inpos, final int[] out, * @param b * number of bits to use per integer */ - public static void fastunpack(final int[] in, int inpos, final int[] out, - int outpos, int b) { + @Override + public void fastunpack(final int[] in, int inpos, final int[] out, + int outpos, int b) { switch (b) { case 0: Arrays.fill(out, outpos, outpos + 256, 0); break; case 1: - fastunpack1(in, inpos, out, outpos); + VectorBitPacker256.fastunpack1(in, inpos, out, outpos); break; case 2: fastunpack2(in, inpos, out, outpos); break; case 3: - fastunpack3(in, inpos, out, outpos); + 
VectorBitPacker256.fastunpack3(in, inpos, out, outpos); break; case 4: fastunpack4(in, inpos, out, outpos); break; case 5: - fastunpack5(in, inpos, out, outpos); + VectorBitPacker256.fastunpack5(in, inpos, out, outpos); break; case 6: fastunpack6(in, inpos, out, outpos); break; case 7: - fastunpack7(in, inpos, out, outpos); + VectorBitPacker256.fastunpack7(in, inpos, out, outpos); break; case 8: fastunpack8(in, inpos, out, outpos); break; case 9: - fastunpack9(in, inpos, out, outpos); + VectorBitPacker256.fastunpack9(in, inpos, out, outpos); break; case 10: fastunpack10(in, inpos, out, outpos); break; case 11: - fastunpack11(in, inpos, out, outpos); + VectorBitPacker256.fastunpack11(in, inpos, out, outpos); break; case 12: fastunpack12(in, inpos, out, outpos); break; case 13: - fastunpack13(in, inpos, out, outpos); + VectorBitPacker256.fastunpack13(in, inpos, out, outpos); break; case 14: fastunpack14(in, inpos, out, outpos); break; case 15: - fastunpack15(in, inpos, out, outpos); + VectorBitPacker256.fastunpack15(in, inpos, out, outpos); break; case 16: fastunpack16(in, inpos, out, outpos); break; case 17: - fastunpack17(in, inpos, out, outpos); + VectorBitPacker256.fastunpack17(in, inpos, out, outpos); break; case 18: fastunpack18(in, inpos, out, outpos); break; case 19: - fastunpack19(in, inpos, out, outpos); + VectorBitPacker256.fastunpack19(in, inpos, out, outpos); break; case 20: fastunpack20(in, inpos, out, outpos); break; case 21: - fastunpack21(in, inpos, out, outpos); + VectorBitPacker256.fastunpack21(in, inpos, out, outpos); break; case 22: fastunpack22(in, inpos, out, outpos); break; case 23: - fastunpack23(in, inpos, out, outpos); + VectorBitPacker256.fastunpack23(in, inpos, out, outpos); break; case 24: fastunpack24(in, inpos, out, outpos); break; case 25: - fastunpack25(in, inpos, out, outpos); + VectorBitPacker256.fastunpack25(in, inpos, out, outpos); break; case 26: fastunpack26(in, inpos, out, outpos); break; case 27: - fastunpack27(in, inpos, 
out, outpos); + VectorBitPacker256.fastunpack27(in, inpos, out, outpos); break; case 28: fastunpack28(in, inpos, out, outpos); break; case 29: - fastunpack29(in, inpos, out, outpos); + VectorBitPacker256.fastunpack29(in, inpos, out, outpos); break; case 30: fastunpack30(in, inpos, out, outpos); break; case 31: - fastunpack31(in, inpos, out, outpos); + VectorBitPacker256.fastunpack31(in, inpos, out, outpos); break; case 32: System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); @@ -428,176 +396,6 @@ public static void fastunpack(final int[] in, int inpos, final int[] out, } } - public static int slowpack(final int[] in, int inpos, int inlen, - final int[] out, int outpos, int b) { - if (inlen == 0) - return outpos; - if (b == 32) { - System.arraycopy(in, inpos, out, outpos, inlen); - return outpos + inlen; - } - int mask = (1 << b) - 1; - int c = 0; - int l = 0; - int r = 0; - int val = 0; - for (int i = 0; i < inlen; i++) { - val = in[inpos + i] & mask; - out[outpos] |= val << (c + r); - c += b; - l = (32 - r) % b; - if (c + r >= 32) { - if (i < inlen - 1 || l != 0) - outpos++; - r = l == 0 ? 0 : b - l; - if (l != 0) - out[outpos] = val >> (b - r); - c = 0; - } - } - return outpos; - } - - public static int slowunpack(final int[] in, int inpos, final int[] out, - int outpos, int outlen, int b) { - if (outlen == 0) { - return inpos; - } - if (b == 32) { - System.arraycopy(in, inpos, out, outpos, outlen); - return inpos + outlen; - } - int mask = (1 << b) - 1; - int limit = outpos + outlen; - int r = 0; - int val = 0; - int i = 0; - for (; outpos < limit; i++) { - if (r > 0) - out[outpos++] = - (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask); - val = in[inpos + i]; - int j = 0; - int l = 32 - r; - int ll = l % b == 0 ? l : l - b; - while (j < ll && outpos < limit) { - out[outpos++] = (val >> (j + r)) & mask; - j += b; - } - r = l % b == 0 ? 
0 : b - (l % b); - } - return inpos + i; - } - - public static int numCompressedInts(int n, int b) { - int width = b % 2 == 0 ? VLEN_512 : VLEN_256; - if (n <= width) - return n; - int intsPerVec = (32 / b) * width; - int q = (n + intsPerVec - 1) / intsPerVec; - return q * width; - } - - private static void fastpack1(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_1); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - 
oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - 
oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack2(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -650,116 +448,6 @@ private static void fastpack2(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack3(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_3); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 27).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = 
iV.and(MASK_3).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 28).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = 
iV.and(MASK_3).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack4(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -816,126 +504,6 @@ private static void fastpack4(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack5(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_5); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = 
iV.and(MASK_5).lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = 
iV.and(MASK_5).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack6(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -997,136 +565,6 @@ private static void 
fastpack6(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack7(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_7); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 208); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack8(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -1193,146 +631,6 @@ private static void fastpack8(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack9(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_9); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = 
iV.and(MASK_9).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 7); - - 
iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 248); - oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack10(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -1404,169 +702,19 @@ private static void fastpack10(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack11(final int[] in, int inpos, final int[] out, + private static void fastpack12(final int[] in, int inpos, final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_11); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV); + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_12); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV 
= iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = 
iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = 
iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpack12(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_12); - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + outpos += VLEN_512; + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); @@ -1630,166 +778,6 @@ private static void fastpack12(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack13(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_13); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; 
- - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = 
iV.and(MASK_13).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack14(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -1871,176 +859,6 @@ private static void fastpack14(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack15(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_15); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6); - - iV 
= IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - 
oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = 
iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - } - private static void fastpack16(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -2127,510 +945,564 @@ private static void fastpack16(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); } - private static void fastpack17(final int[] in, int inpos, final int[] out, + private static void fastpack18(final int[] in, int inpos, final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_17); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 17).or(oV); + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_18); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 15); + outpos += VLEN_512; + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 2).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 19).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); 
oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 13); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 21).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 11); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 6).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 9); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = 
iV.and(MASK_17).lanewise(VectorOperators.LSHR, 7); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 27).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 5); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 29).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 3); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - 
outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 1); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 16).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + } + + private static void fastpack20(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_20); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 16); + oV.intoArray(out, outpos); + outpos += VLEN_512; + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 14); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 3).or(oV); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, 
outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 12); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 10); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 7).or(oV); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 8); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(MASK_20).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); - oV = 
iV.and(MASK_17).lanewise(VectorOperators.LSHR, 6); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 11).or(oV); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 4); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 13).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 2); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); } - private static void fastpack18(final int[] in, int inpos, final 
int[] out, + private static void fastpack22(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_18); + var oV = iV.and(MASK_22); iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); 
iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + oV = 
iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); } - private static void fastpack19(final int[] in, int inpos, final int[] out, + private static void fastpack24(final int[] in, int inpos, final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_19); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 19).or(oV); + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_24); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 13); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 7); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(MASK_24).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 1); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 14); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 5).or(oV); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 8); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 
11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(MASK_24).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 2); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 15); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 9); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(MASK_24).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 29).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += 
VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 3); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 16).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 16); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 3).or(oV); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + } - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 10); + private static void fastpack26(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_26); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 9).or(oV); + oV.intoArray(out, outpos); + outpos += VLEN_512; + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 4); + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 17); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 21).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 11); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 27).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 5); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, 
outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 18); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 1).or(oV); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 12); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 7).or(oV); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 6); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 13).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - } + outpos += VLEN_512; - private static void fastpack20(final int[] in, int inpos, final int[] out, + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpack28(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_20); + var oV = iV.and(MASK_28); iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, 
inpos + 64); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; @@ -2638,611 +1510,522 @@ private static void fastpack20(final int[] in, int inpos, final int[] out, oV = oV.zero(SPECIES_512); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_20).or(oV); + oV = iV.and(MASK_28).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); } - private static void fastpack21(final int[] in, int inpos, final int[] out, + private static void fastpack30(final int[] in, int inpos, final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_21); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = 
iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV); + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV.and(MASK_30); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1); + outpos += VLEN_512; + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8); + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV); + private static void fastpackNoMask2(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, 
inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 
176); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); } - private static void fastpack22(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastpackNoMask4(final int[] in, int inpos, + final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_22); + var oV = iV; iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = 
iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + oV = oV.zero(SPECIES_512); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + oV = iV.or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); 
iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); } - private static void fastpack23(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_23); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV); + private static void fastpackNoMask6(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18); + iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 2); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22); + oV = iV.lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 
176); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV); + } - oV.intoArray(out, outpos); - outpos += VLEN_256; + private static void fastpackNoMask8(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - 
- oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV); + outpos += VLEN_512; + oV = oV.zero(SPECIES_512); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = oV.zero(SPECIES_512); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV 
= iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV); + } - oV.intoArray(out, outpos); - outpos += VLEN_256; + private static void fastpackNoMask10(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, 
in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 2); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV); + oV = iV.lanewise(VectorOperators.LSHR, 4); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 6); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV); + oV = iV.lanewise(VectorOperators.LSHR, 8); - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); } - private static void fastpack24(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastpackNoMask12(final int[] in, int inpos, + final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_24); + var oV = iV; iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(MASK_24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; @@ -3250,7535 +2033,1600 @@ private static void fastpack24(final int[] in, int inpos, final int[] out, oV = oV.zero(SPECIES_512); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_24).or(oV); + oV = iV.or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + oV = 
iV.lanewise(VectorOperators.LSHL, 12).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); } - private static void fastpack25(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_25); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7); + 
private static void fastpackNoMask14(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 4); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 12); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_512, 
in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10); + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 6); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24); + oV = iV.lanewise(VectorOperators.LSHR, 10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6); + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV); + private static void fastpackNoMask16(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2); + oV = oV.zero(SPECIES_512); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); 
+ oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12); + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV); + private static void fastpackNoMask18(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19); + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 14); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 10); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = 
iV.lanewise(VectorOperators.LSHL, 12).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15); + oV = iV.lanewise(VectorOperators.LSHR, 2); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 12); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11); + oV = 
iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18); + oV = iV.lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); } - private static void fastpack26(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastpackNoMask20(final int[] in, int inpos, + final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_26); + var oV = iV; iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + oV = iV.lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 
18); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + oV = iV.lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + oV = oV.zero(SPECIES_512); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + oV = iV.or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + oV = iV.lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); } - private static void fastpack27(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_27); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV); + private static void fastpackNoMask22(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5); + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; + oV = iV.lanewise(VectorOperators.LSHR, 20); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25); + oV = iV.lanewise(VectorOperators.LSHR, 18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3); + oV = iV.lanewise(VectorOperators.LSHR, 6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + oV = iV.lanewise(VectorOperators.LSHR, 16); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18); + oV = iV.lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23); + oV = iV.lanewise(VectorOperators.LSHR, 14); - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1); + oV = iV.lanewise(VectorOperators.LSHR, 2); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6); + oV = iV.lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11); + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV); + private static void fastpackNoMask24(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16); + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = 
iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += 
VLEN_256; + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); } - private static void fastpack28(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastpackNoMask26(final int[] in, int inpos, + final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_28); + var oV = iV; iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 6); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = 
iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHR, 18); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHR, 24); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = oV.zero(SPECIES_512); + oV = iV.lanewise(VectorOperators.LSHR, 10); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_28).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHR, 2); iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHR, 14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + oV = iV.lanewise(VectorOperators.LSHR, 20); 
iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); } - private static void fastpack29(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_29); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 29).or(oV); + private static void fastpackNoMask28(final int[] in, int inpos, + final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + var oV = iV; + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 3); + outpos += VLEN_512; + oV = iV.lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 26).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 6); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 23).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 9); + oV = iV.lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 20).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 
16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 15); + oV = iV.lanewise(VectorOperators.LSHR, 20); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 18); + oV = iV.lanewise(VectorOperators.LSHR, 24); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 8).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + oV = oV.zero(SPECIES_512); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 24); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.or(oV); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHR, 27); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.lanewise(VectorOperators.LSHR, 4); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 31).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 1); + oV = iV.lanewise(VectorOperators.LSHR, 8); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 28).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 25).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 7); + oV = iV.lanewise(VectorOperators.LSHR, 16); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 10); + oV = iV.lanewise(VectorOperators.LSHR, 20); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 
19).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 13); + oV = iV.lanewise(VectorOperators.LSHR, 24); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 16).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 13).or(oV); + } - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 28); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 2); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpack30(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastpackNoMask30(final int[] in, int inpos, + final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV.and(MASK_30); + var oV = iV; iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + oV = iV.lanewise(VectorOperators.LSHR, 2); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + oV = iV.lanewise(VectorOperators.LSHR, 4); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + oV = iV.lanewise(VectorOperators.LSHR, 6); iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + oV = iV.lanewise(VectorOperators.LSHR, 8); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + oV = 
iV.lanewise(VectorOperators.LSHR, 10); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + oV = iV.lanewise(VectorOperators.LSHR, 12); iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + oV = iV.lanewise(VectorOperators.LSHR, 14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + oV = iV.lanewise(VectorOperators.LSHR, 16); iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + oV = iV.lanewise(VectorOperators.LSHR, 18); iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + oV = iV.lanewise(VectorOperators.LSHR, 20); iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + oV = iV.lanewise(VectorOperators.LSHR, 22); 
iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + oV = iV.lanewise(VectorOperators.LSHR, 24); iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + oV = iV.lanewise(VectorOperators.LSHR, 26); iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + oV = iV.lanewise(VectorOperators.LSHR, 28); iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); } - private static void fastpack31(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV.and(MASK_31); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 1); + private static void fastunpack2(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 30).or(oV); + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos); 
+ outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 2); + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 29).or(oV); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 3); + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 28).or(oV); + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 4); + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 27).or(oV); + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 5); + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHL, 26).or(oV); + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 6); + iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_512; + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 25).or(oV); + private static void fastunpack4(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 7); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 24).or(oV); + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 8); + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 23).or(oV); + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += 
VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 9); + var oV = iV.and(MASK_4); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xf).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 10); + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 21).or(oV); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 11); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 20).or(oV); + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 12); + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_512; + } - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 19).or(oV); + private static void fastunpack6(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += 
VLEN_256; + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 13); + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 18).or(oV); + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 14); + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 16).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 16); + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 15).or(oV); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 17); + oV = 
iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 18); + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 13).or(oV); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 19); + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 12).or(oV); + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_512; + } - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 9).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 28); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 30); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask1(final int[] in, 
int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 
17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask2(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = 
iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask3(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = 
iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 
160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask4(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - 
oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask5(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask6(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask7(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask8(final int[] in, int inpos, - final int[] out, int 
outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask9(final int[] in, 
int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, 
outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += 
VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask10(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = 
iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask11(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 
24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask12(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 
48); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask13(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 
112); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = 
iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask14(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask15(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; 
- - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 
31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - 
outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask16(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, 
outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask17(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = 
iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - 
- oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask18(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = 
IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 
24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask19(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - 
oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV 
= iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask20(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = 
iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask21(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, 
outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = 
iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask22(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = 
iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask23(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, 
outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = 
iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask24(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = 
iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask25(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - 
outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 
27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - 
- oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask26(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask27(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = 
iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23); - - iV = IntVector.fromArray(SPECIES_256, 
in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 
18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask28(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = 
IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.or(oV); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = 
iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask29(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += 
VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = 
iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask30(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_512, 
in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastpackNoMask31(final int[] in, int inpos, - final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - var oV = iV; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 1); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.lanewise(VectorOperators.LSHL, 
27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 14); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); - oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - } - - private static void fastunpack1(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 
2).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 
22).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack2(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - 
iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack3(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 27).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 2).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 25).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 28).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_3); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - 
iV.lanewise(VectorOperators.LSHR, 23).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 26).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 29).and(MASK_3).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack4(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.and(MASK_4); - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xf).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); - outpos += 
VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack5(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - 
iV.lanewise(VectorOperators.LSHR, 11).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void 
fastunpack6(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void 
fastunpack7(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - 
iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_7); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - - 
iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack8(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.and(MASK_8); - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xff).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0xff).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(0xff).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 
24).and(MASK_8).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack9(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 30).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_9); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_9); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack10(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - 
- iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack11(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos); - outpos += 
VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack12(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - 
iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(0xfff).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack13(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - 
- iV.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 22).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 31).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_13); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack14(final int[] in, int inpos, final int[] out, - int outpos) { + private static void fastunpack8(final int[] in, int inpos, final int[] out, + int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = 
iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + iV.and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - } - - private static void fastunpack15(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = 
iV.and(0x1fff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 56); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, outpos); - outpos += 
VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_15); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, outpos); - outpos += VLEN_256; - } - - private static void fastunpack16(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - var oV = iV.and(MASK_16); + var oV = iV.and(MASK_8); oV = oV.zero(SPECIES_512); iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xffff).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); - outpos += VLEN_512; - - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0xffff).or(oV); + oV = iV.and(0xff).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = 
IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(0xffff).or(oV); - - oV.intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); - oV = iV.and(0xffff).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(0xff).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(0xffff).or(oV); - - oV.intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(0xffff).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(0xff).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - oV = oV.zero(SPECIES_512); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(0xffff).or(oV); - - oV.intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 
24).and(MASK_8).intoArray(out, outpos); outpos += VLEN_512; } - private static void fastunpack17(final int[] in, int inpos, final int[] out, + private static void fastunpack10(final int[] in, int inpos, final int[] out, int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 15).or(oV); + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_17); + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 13).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 11).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + 
iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 7).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - 
iV.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_17); + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 3).or(oV); + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_512; + } - oV.intoArray(out, outpos); - outpos += VLEN_256; + private static void fastunpack12(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_17); + var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 1).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 16).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; 
- oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 12).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_17); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(0xfff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); 
- outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 6).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_17); + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_17); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 2).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_512; } - private static void fastunpack18(final int[] in, int inpos, final int[] out, + private static void fastunpack14(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); - iV.and(MASK_18).intoArray(out, 
outpos); + iV.and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - var oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_512; + + var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 14).or(oV); + oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 10).or(oV); + oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 6).or(oV); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 
64); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 2).or(oV); + oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 12).or(oV); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 8).or(oV); + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_512; + } - oV.intoArray(out, outpos); + private static void fastunpack16(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_16).intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); outpos 
+= VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + var oV = iV.and(MASK_16); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); outpos += VLEN_512; - } - - private static void fastunpack19(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 13).or(oV); - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 7).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 1).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19); + oV = 
oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 8).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 2).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 15).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 64); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 9).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19); + oV = oV.zero(SPECIES_512); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 3).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(0xffff).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV); + outpos += VLEN_512; - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_512; + } - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + private static void fastunpack18(final int[] in, int inpos, final int[] out, + int outpos) { + var iV = IntVector.fromArray(SPECIES_512, in, inpos); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19); + var oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 16); + oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + 
iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 4).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 32); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 10).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 48); + oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 6).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 11).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 64); + oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = 
iV.and(0x3fff).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 80); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19); + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 96); + oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19); + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 6).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += 
VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_512; } private static void fastunpack20(final int[] in, int inpos, final int[] out, @@ -10878,206 +3726,6 @@ private static void fastunpack20(final int[] in, int inpos, final int[] out, outpos += VLEN_512; } - private static void fastunpack21(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 13).or(oV); - - 
oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = 
iV.and(0x7fff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 
20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos); - outpos += VLEN_256; - } - private static void fastunpack22(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -11138,256 +3786,46 @@ private static void fastunpack22(final int[] in, int inpos, final int[] out, oV.intoArray(out, outpos); outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 
10).and(MASK_22).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack23(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 112); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23); + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 10).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 112); + oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 4).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23); + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = 
iV.and(0xf).lanewise(VectorOperators.LSHL, 19).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 128); + oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23); + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 5).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 144); + oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23); + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 14).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 12).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_512; } private static void fastunpack24(final int[] in, int inpos, final int[] out, @@ -11497,226 +3935,6 @@ private static void fastunpack24(final int[] in, int inpos, final int[] out, outpos += VLEN_512; } - private static void fastunpack25(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); 
- outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 
128); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos); - outpos += VLEN_256; - } - private static void fastunpack26(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -11799,264 +4017,34 @@ private static void fastunpack26(final int[] in, int inpos, final int[] out, oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 2).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); - - iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_512; - - iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); - outpos += VLEN_512; - } - - private static void fastunpack27(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = 
iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27); + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 12).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 160); + oV = 
iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 8).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27); + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 17).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 176); + oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 14).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27); + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 22).or(oV); + iV = IntVector.fromArray(SPECIES_512, in, inpos + 192); + oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 20).or(oV); oV.intoArray(out, outpos); - outpos += VLEN_256; + outpos += VLEN_512; - iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos); - outpos += VLEN_256; + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_512; } private static void fastunpack28(final int[] in, int inpos, final int[] out, @@ -12176,246 +4164,6 @@ private static void fastunpack28(final int[] in, int inpos, final int[] out, outpos += VLEN_512; } - private static void fastunpack29(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_29).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = 
iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = 
iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - 
outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, 
outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos); - outpos += VLEN_256; - } - private static void fastunpack30(final int[] in, int inpos, final int[] out, int outpos) { var iV = IntVector.fromArray(SPECIES_512, in, inpos); @@ -12538,253 +4286,4 @@ private static void fastunpack30(final int[] in, int inpos, final int[] out, outpos += VLEN_512; } - private static void fastunpack31(final int[] in, int inpos, final int[] out, - int outpos) { - var iV = IntVector.fromArray(SPECIES_256, in, inpos); - iV.and(MASK_31).intoArray(out, outpos); - outpos += VLEN_256; - - var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); - oV = iV.and(0x3fffffff).lanewise(VectorOperators.LSHL, 1).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); - oV = iV.and(0x1fffffff).lanewise(VectorOperators.LSHL, 2).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); - oV = iV.and(0xfffffff).lanewise(VectorOperators.LSHL, 3).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); - oV = iV.and(0x7ffffff).lanewise(VectorOperators.LSHL, 
4).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); - oV = iV.and(0x3ffffff).lanewise(VectorOperators.LSHL, 5).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); - oV = iV.and(0x1ffffff).lanewise(VectorOperators.LSHL, 6).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); - oV = iV.and(0xffffff).lanewise(VectorOperators.LSHL, 7).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); - oV = iV.and(0x7fffff).lanewise(VectorOperators.LSHL, 8).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); - oV = iV.and(0x3fffff).lanewise(VectorOperators.LSHL, 9).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); - oV = iV.and(0x1fffff).lanewise(VectorOperators.LSHL, 10).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); - oV = iV.and(0xfffff).lanewise(VectorOperators.LSHL, 11).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); - oV = iV.and(0x7ffff).lanewise(VectorOperators.LSHL, 12).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = 
iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); - oV = iV.and(0x3ffff).lanewise(VectorOperators.LSHL, 13).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); - oV = iV.and(0x1ffff).lanewise(VectorOperators.LSHL, 14).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); - oV = iV.and(0xffff).lanewise(VectorOperators.LSHL, 15).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); - oV = iV.and(0x7fff).lanewise(VectorOperators.LSHL, 16).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); - oV = iV.and(0x3fff).lanewise(VectorOperators.LSHL, 17).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); - oV = iV.and(0x1fff).lanewise(VectorOperators.LSHL, 18).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); - oV = iV.and(0xfff).lanewise(VectorOperators.LSHL, 19).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); - oV = iV.and(0x7ff).lanewise(VectorOperators.LSHL, 20).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31); - - iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 168); - oV = iV.and(0x3ff).lanewise(VectorOperators.LSHL, 21).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); - oV = iV.and(0x1ff).lanewise(VectorOperators.LSHL, 22).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); - oV = iV.and(0xff).lanewise(VectorOperators.LSHL, 23).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); - oV = iV.and(0x7f).lanewise(VectorOperators.LSHL, 24).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); - oV = iV.and(0x3f).lanewise(VectorOperators.LSHL, 25).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); - oV = iV.and(0x1f).lanewise(VectorOperators.LSHL, 26).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); - oV = iV.and(0xf).lanewise(VectorOperators.LSHL, 27).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); - oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); - oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV); - 
- oV.intoArray(out, outpos); - outpos += VLEN_256; - - oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31); - - iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); - oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV); - - oV.intoArray(out, outpos); - outpos += VLEN_256; - - iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos); - outpos += VLEN_256; - } } diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker128.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker128.java new file mode 100644 index 0000000..6e08546 --- /dev/null +++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker128.java @@ -0,0 +1,31953 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import java.util.Arrays; +import jdk.incubator.vector.*; + +/** + * Vectorized bit-packing routines using 128-bit (4 x int32) vectors. + * + * A 256-integer block is packed across 4 SIMD lanes, each lane packing 64 + * values into 2*b 32-bit words. Selected at runtime by VectorBitPackerKernels + * when the preferred hardware vector width is 128 bits (e.g. Arm NEON, Graviton). 
+ */ +public class VectorBitPacker128 implements VectorBitPackerKernels { + private static final VectorSpecies SPECIES_128 = + IntVector.SPECIES_128; + private static final int VLEN_128 = 4; + private static final int BLOCK_SIZE = 256; + + private static final IntVector MASK_1 = + IntVector.broadcast(SPECIES_128, (1 << 1) - 1); + private static final IntVector MASK_2 = + IntVector.broadcast(SPECIES_128, (1 << 2) - 1); + private static final IntVector MASK_3 = + IntVector.broadcast(SPECIES_128, (1 << 3) - 1); + private static final IntVector MASK_4 = + IntVector.broadcast(SPECIES_128, (1 << 4) - 1); + private static final IntVector MASK_5 = + IntVector.broadcast(SPECIES_128, (1 << 5) - 1); + private static final IntVector MASK_6 = + IntVector.broadcast(SPECIES_128, (1 << 6) - 1); + private static final IntVector MASK_7 = + IntVector.broadcast(SPECIES_128, (1 << 7) - 1); + private static final IntVector MASK_8 = + IntVector.broadcast(SPECIES_128, (1 << 8) - 1); + private static final IntVector MASK_9 = + IntVector.broadcast(SPECIES_128, (1 << 9) - 1); + private static final IntVector MASK_10 = + IntVector.broadcast(SPECIES_128, (1 << 10) - 1); + private static final IntVector MASK_11 = + IntVector.broadcast(SPECIES_128, (1 << 11) - 1); + private static final IntVector MASK_12 = + IntVector.broadcast(SPECIES_128, (1 << 12) - 1); + private static final IntVector MASK_13 = + IntVector.broadcast(SPECIES_128, (1 << 13) - 1); + private static final IntVector MASK_14 = + IntVector.broadcast(SPECIES_128, (1 << 14) - 1); + private static final IntVector MASK_15 = + IntVector.broadcast(SPECIES_128, (1 << 15) - 1); + private static final IntVector MASK_16 = + IntVector.broadcast(SPECIES_128, (1 << 16) - 1); + private static final IntVector MASK_17 = + IntVector.broadcast(SPECIES_128, (1 << 17) - 1); + private static final IntVector MASK_18 = + IntVector.broadcast(SPECIES_128, (1 << 18) - 1); + private static final IntVector MASK_19 = + IntVector.broadcast(SPECIES_128, (1 << 19) 
- 1); + private static final IntVector MASK_20 = + IntVector.broadcast(SPECIES_128, (1 << 20) - 1); + private static final IntVector MASK_21 = + IntVector.broadcast(SPECIES_128, (1 << 21) - 1); + private static final IntVector MASK_22 = + IntVector.broadcast(SPECIES_128, (1 << 22) - 1); + private static final IntVector MASK_23 = + IntVector.broadcast(SPECIES_128, (1 << 23) - 1); + private static final IntVector MASK_24 = + IntVector.broadcast(SPECIES_128, (1 << 24) - 1); + private static final IntVector MASK_25 = + IntVector.broadcast(SPECIES_128, (1 << 25) - 1); + private static final IntVector MASK_26 = + IntVector.broadcast(SPECIES_128, (1 << 26) - 1); + private static final IntVector MASK_27 = + IntVector.broadcast(SPECIES_128, (1 << 27) - 1); + private static final IntVector MASK_28 = + IntVector.broadcast(SPECIES_128, (1 << 28) - 1); + private static final IntVector MASK_29 = + IntVector.broadcast(SPECIES_128, (1 << 29) - 1); + private static final IntVector MASK_30 = + IntVector.broadcast(SPECIES_128, (1 << 30) - 1); + private static final IntVector MASK_31 = + IntVector.broadcast(SPECIES_128, (1 << 31) - 1); + + @Override + public void fastpack(final int[] in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + break; + case 1: + fastpack1(in, inpos, out, outpos); + break; + case 2: + fastpack2(in, inpos, out, outpos); + break; + case 3: + fastpack3(in, inpos, out, outpos); + break; + case 4: + fastpack4(in, inpos, out, outpos); + break; + case 5: + fastpack5(in, inpos, out, outpos); + break; + case 6: + fastpack6(in, inpos, out, outpos); + break; + case 7: + fastpack7(in, inpos, out, outpos); + break; + case 8: + fastpack8(in, inpos, out, outpos); + break; + case 9: + fastpack9(in, inpos, out, outpos); + break; + case 10: + fastpack10(in, inpos, out, outpos); + break; + case 11: + fastpack11(in, inpos, out, outpos); + break; + case 12: + fastpack12(in, inpos, out, outpos); + break; + case 13: + fastpack13(in, inpos, out, outpos); 
+ break; + case 14: + fastpack14(in, inpos, out, outpos); + break; + case 15: + fastpack15(in, inpos, out, outpos); + break; + case 16: + fastpack16(in, inpos, out, outpos); + break; + case 17: + fastpack17(in, inpos, out, outpos); + break; + case 18: + fastpack18(in, inpos, out, outpos); + break; + case 19: + fastpack19(in, inpos, out, outpos); + break; + case 20: + fastpack20(in, inpos, out, outpos); + break; + case 21: + fastpack21(in, inpos, out, outpos); + break; + case 22: + fastpack22(in, inpos, out, outpos); + break; + case 23: + fastpack23(in, inpos, out, outpos); + break; + case 24: + fastpack24(in, inpos, out, outpos); + break; + case 25: + fastpack25(in, inpos, out, outpos); + break; + case 26: + fastpack26(in, inpos, out, outpos); + break; + case 27: + fastpack27(in, inpos, out, outpos); + break; + case 28: + fastpack28(in, inpos, out, outpos); + break; + case 29: + fastpack29(in, inpos, out, outpos); + break; + case 30: + fastpack30(in, inpos, out, outpos); + break; + case 31: + fastpack31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + @Override + public void fastpackNoMask(final int[] in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + break; + case 1: + fastpackNoMask1(in, inpos, out, outpos); + break; + case 2: + fastpackNoMask2(in, inpos, out, outpos); + break; + case 3: + fastpackNoMask3(in, inpos, out, outpos); + break; + case 4: + fastpackNoMask4(in, inpos, out, outpos); + break; + case 5: + fastpackNoMask5(in, inpos, out, outpos); + break; + case 6: + fastpackNoMask6(in, inpos, out, outpos); + break; + case 7: + fastpackNoMask7(in, inpos, out, outpos); + break; + case 8: + fastpackNoMask8(in, inpos, out, outpos); + break; + case 9: + fastpackNoMask9(in, inpos, out, outpos); + break; + case 10: + fastpackNoMask10(in, inpos, out, outpos); + break; + case 11: + fastpackNoMask11(in, inpos, out, outpos); + break; + case 12: + 
fastpackNoMask12(in, inpos, out, outpos); + break; + case 13: + fastpackNoMask13(in, inpos, out, outpos); + break; + case 14: + fastpackNoMask14(in, inpos, out, outpos); + break; + case 15: + fastpackNoMask15(in, inpos, out, outpos); + break; + case 16: + fastpackNoMask16(in, inpos, out, outpos); + break; + case 17: + fastpackNoMask17(in, inpos, out, outpos); + break; + case 18: + fastpackNoMask18(in, inpos, out, outpos); + break; + case 19: + fastpackNoMask19(in, inpos, out, outpos); + break; + case 20: + fastpackNoMask20(in, inpos, out, outpos); + break; + case 21: + fastpackNoMask21(in, inpos, out, outpos); + break; + case 22: + fastpackNoMask22(in, inpos, out, outpos); + break; + case 23: + fastpackNoMask23(in, inpos, out, outpos); + break; + case 24: + fastpackNoMask24(in, inpos, out, outpos); + break; + case 25: + fastpackNoMask25(in, inpos, out, outpos); + break; + case 26: + fastpackNoMask26(in, inpos, out, outpos); + break; + case 27: + fastpackNoMask27(in, inpos, out, outpos); + break; + case 28: + fastpackNoMask28(in, inpos, out, outpos); + break; + case 29: + fastpackNoMask29(in, inpos, out, outpos); + break; + case 30: + fastpackNoMask30(in, inpos, out, outpos); + break; + case 31: + fastpackNoMask31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + @Override + public void fastunpack(final int[] in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + Arrays.fill(out, outpos, outpos + 256, 0); + break; + case 1: + fastunpack1(in, inpos, out, outpos); + break; + case 2: + fastunpack2(in, inpos, out, outpos); + break; + case 3: + fastunpack3(in, inpos, out, outpos); + break; + case 4: + fastunpack4(in, inpos, out, outpos); + break; + case 5: + fastunpack5(in, inpos, out, outpos); + break; + case 6: + fastunpack6(in, inpos, out, outpos); + break; + case 7: + fastunpack7(in, inpos, out, outpos); + break; + case 8: + fastunpack8(in, inpos, out, outpos); + break; 
+ case 9: + fastunpack9(in, inpos, out, outpos); + break; + case 10: + fastunpack10(in, inpos, out, outpos); + break; + case 11: + fastunpack11(in, inpos, out, outpos); + break; + case 12: + fastunpack12(in, inpos, out, outpos); + break; + case 13: + fastunpack13(in, inpos, out, outpos); + break; + case 14: + fastunpack14(in, inpos, out, outpos); + break; + case 15: + fastunpack15(in, inpos, out, outpos); + break; + case 16: + fastunpack16(in, inpos, out, outpos); + break; + case 17: + fastunpack17(in, inpos, out, outpos); + break; + case 18: + fastunpack18(in, inpos, out, outpos); + break; + case 19: + fastunpack19(in, inpos, out, outpos); + break; + case 20: + fastunpack20(in, inpos, out, outpos); + break; + case 21: + fastunpack21(in, inpos, out, outpos); + break; + case 22: + fastunpack22(in, inpos, out, outpos); + break; + case 23: + fastunpack23(in, inpos, out, outpos); + break; + case 24: + fastunpack24(in, inpos, out, outpos); + break; + case 25: + fastunpack25(in, inpos, out, outpos); + break; + case 26: + fastunpack26(in, inpos, out, outpos); + break; + case 27: + fastunpack27(in, inpos, out, outpos); + break; + case 28: + fastunpack28(in, inpos, out, outpos); + break; + case 29: + fastunpack29(in, inpos, out, outpos); + break; + case 30: + fastunpack30(in, inpos, out, outpos); + break; + case 31: + fastunpack31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + private static void fastpack1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 
3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 3).or(oV); + + 
iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = 
iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
84); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = 
iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + iV.and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos); + outpos += 
VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos); + } + + private static void fastpack2(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = 
iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = 
iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + 
oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
208); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask2(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = 
iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + } +
+    /**
+     * Expands four packed vectors into 64 output vectors, one two-bit field per output lane.
+     *
+     * <p>The packed vectors are loaded from {@code in[inpos]}, {@code in[inpos + 4]},
+     * {@code in[inpos + 8]} and {@code in[inpos + 12]}. Each one is shifted right by
+     * 0, 2, ..., 30 bits, ANDed with {@code MASK_2} and stored; consecutive stores are
+     * {@code VLEN_128} ints apart, starting at {@code out[outpos]}.
+     *
+     * <p>NOTE(review): {@code SPECIES_128}, {@code VLEN_128} and {@code MASK_2} are declared
+     * outside this chunk; presumably a 4-lane int species, 4, and a broadcast of 0b11 - confirm.
+     *
+     * @param in     array holding the packed words
+     * @param inpos  index in {@code in} of the first packed vector
+     * @param out    array receiving the unpacked values
+     * @param outpos index in {@code out} of the first unpacked vector
+     */
+    private static void fastunpack2(final int[] in, int inpos, final int[] out, int outpos) {
+        for (int word = 0; word < 4; word++) {
+            final IntVector packed = IntVector.fromArray(SPECIES_128, in, inpos + 4 * word);
+            for (int shift = 0; shift < 32; shift += 2) {
+                packed.lanewise(VectorOperators.LSHR, shift).and(MASK_2).intoArray(out, outpos);
+                outpos += VLEN_128;
+            }
+        }
+    }
+
+    /**
+     * Packs 64 input vectors into six output vectors, three bits per field.
+     *
+     * <p>Input vector {@code i} is loaded from {@code in[inpos + 4 * i]} and ANDed with
+     * {@code MASK_3}. Fields are placed at increasing bit offsets of the output vector being
+     * assembled; a field that does not fit entirely has its low bits kept in the current
+     * output vector and its remaining high bits carried into the next one. Output vectors are
+     * stored {@code VLEN_128} ints apart, starting at {@code out[outpos]}.
+     *
+     * <p>NOTE(review): {@code MASK_3} is declared outside this chunk; presumably a broadcast
+     * of 0b111 - confirm.
+     *
+     * @param in     array holding the values to pack
+     * @param inpos  index in {@code in} of the first input vector
+     * @param out    array receiving the packed words
+     * @param outpos index in {@code out} of the first packed vector
+     */
+    private static void fastpack3(final int[] in, int inpos, final int[] out, int outpos) {
+        int bit = 0; // bit offset of the next field inside the vector being assembled
+        IntVector word = null; // always assigned on the first iteration (bit == 0)
+        for (int i = 0; i < 64; i++) {
+            final IntVector value =
+                IntVector.fromArray(SPECIES_128, in, inpos + 4 * i).and(MASK_3);
+            word = (bit == 0) ? value : value.lanewise(VectorOperators.LSHL, bit).or(word);
+            bit += 3;
+            if (bit >= 32) {
+                word.intoArray(out, outpos);
+                outpos += VLEN_128;
+                bit -= 32;
+                if (bit > 0) {
+                    // The field straddled the boundary: its top `bit` bits open the next word.
+                    word = value.lanewise(VectorOperators.LSHR, 3 - bit);
+                }
+            }
+        }
+    }
+
+    /**
+     * Packs 64 input vectors into six output vectors at three bits per field, without
+     * masking the inputs first.
+     *
+     * <p>Uses the same bit layout as {@code fastpack3}, but each vector loaded from
+     * {@code in[inpos + 4 * i]} is shifted and ORed as-is. NOTE(review): callers presumably
+     * guarantee that every input already fits in three bits, since wider values would
+     * overlap neighbouring fields - confirm against the call sites.
+     *
+     * @param in     array holding the values to pack
+     * @param inpos  index in {@code in} of the first input vector
+     * @param out    array receiving the packed words
+     * @param outpos index in {@code out} of the first packed vector
+     */
+    private static void fastpackNoMask3(final int[] in, int inpos, final int[] out, int outpos) {
+        int bit = 0; // bit offset of the next field inside the vector being assembled
+        IntVector word = null; // always assigned on the first iteration (bit == 0)
+        for (int i = 0; i < 64; i++) {
+            final IntVector value = IntVector.fromArray(SPECIES_128, in, inpos + 4 * i);
+            word = (bit == 0) ? value : value.lanewise(VectorOperators.LSHL, bit).or(word);
+            bit += 3;
+            if (bit >= 32) {
+                word.intoArray(out, outpos);
+                outpos += VLEN_128;
+                bit -= 32;
+                if (bit > 0) {
+                    // The field straddled the boundary: its top `bit` bits open the next word.
+                    word = value.lanewise(VectorOperators.LSHR, 3 - bit);
+                }
+            }
+        }
+    }
+
+    /**
+     * Expands six packed vectors into 64 output vectors, one three-bit field per output lane.
+     *
+     * <p>Packed vectors are loaded from {@code in[inpos]}, {@code in[inpos + 4]}, ...,
+     * {@code in[inpos + 20]}. Fields are read at increasing bit offsets; a field that
+     * straddles two packed vectors is rebuilt from the top bits of the current vector and
+     * the low bits of the next one. Outputs are stored {@code VLEN_128} ints apart, starting
+     * at {@code out[outpos]}.
+     *
+     * <p>NOTE(review): {@code MASK_3} is declared outside this chunk; presumably a broadcast
+     * of 0b111 - confirm.
+     *
+     * @param in     array holding the packed words
+     * @param inpos  index in {@code in} of the first packed vector
+     * @param out    array receiving the unpacked values
+     * @param outpos index in {@code out} of the first unpacked vector
+     */
+    private static void fastunpack3(final int[] in, int inpos, final int[] out, int outpos) {
+        int next = inpos; // index of the packed vector currently held in `packed`
+        IntVector packed = IntVector.fromArray(SPECIES_128, in, next);
+        int bit = 0; // bit offset of the next field inside `packed`
+        for (int i = 0; i < 64; i++) {
+            IntVector value = packed.lanewise(VectorOperators.LSHR, bit).and(MASK_3);
+            bit += 3;
+            if (bit > 32) {
+                // Straddling field: fetch the next vector and splice in its low `bit` bits.
+                bit -= 32;
+                next += 4;
+                packed = IntVector.fromArray(SPECIES_128, in, next);
+                value = packed.and((1 << bit) - 1)
+                    .lanewise(VectorOperators.LSHL, 3 - bit)
+                    .or(value);
+            }
+            value.intoArray(out, outpos);
+            outpos += VLEN_128;
+            if (bit == 32 && i < 63) {
+                // Field ended exactly on the boundary: advance only after the store above.
+                bit = 0;
+                next += 4;
+                packed = IntVector.fromArray(SPECIES_128, in, next);
+            }
+        }
+    }
+
+ private static void fastpack4(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_4); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_4); + + iV 
= IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = 
iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, 
inpos + 224); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask4(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos 
+ 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); 
+ oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = 
iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack4(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); 
+ outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + } + + private static void fastpack5(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 2); + + iV 
= IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos 
+ 84); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + 
oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = 
iV.and(MASK_5).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask5(final int[] in, int 
inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 
11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = 
iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack5(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
11).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + 
iV.and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos); + 
outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos); + } + + private static void fastpack6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 8); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = 
iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = 
iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 
22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = 
iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += 
VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = 
iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + } + + /** Bit-packs the low 7 bits of every input lane. Loads 64 vectors from {@code in} at {@code inpos}, {@code inpos + 4}, ... {@code inpos + 252}; each is ANDed with MASK_7, shifted left to the running bit offset and ORed into the accumulator {@code oV}. Whenever the accumulator's 32 bits are used up it is stored to {@code out} at {@code outpos} and {@code outpos} advances by VLEN_128 (14 stores in total); the bits of the last value that did not fit are carried into the next accumulator with a right shift. All operations are lane-wise, so every lane packs its own stream of values. NOTE(review): the stride of 4 between loads suggests SPECIES_128 has 4 int lanes and VLEN_128 is 4 (256 ints in, 56 ints out); both constants are declared outside this view, so confirm. */ private static void fastpack7(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* only 4 bits of the value from inpos + 16 fit above bit 28; its remaining 3 high bits start the next word */ oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 1); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV
= iV.and(MASK_7).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the previous store ended exactly on a word boundary, so the shift schedule restarts at bit 0 for the loads from inpos + 128 onward */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV =
iV.and(MASK_7).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.and(MASK_7).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + } + + /** Variant of the 7-bit packer that does not AND its inputs with a mask. It loads 64 vectors from {@code in} at {@code inpos} ... {@code inpos + 252} in steps of 4, shifts each to the running bit offset, ORs it into the accumulator {@code oV}, and stores the accumulator to {@code out} at {@code outpos} (advancing by VLEN_128) each time 32 bits are filled, 14 stores in total. Because nothing is masked, any input bit above bit 6 is ORed into the neighbouring fields. NOTE(review): presumably callers only pass values that already fit in 7 bits; verify at the call sites. */ private static void fastpackNoMask7(final int[] in, int inpos,
final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* carry the high bits of the last value, which were shifted out of the word just stored */ oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL,
27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 7-bit fields back into one int per lane. Reads 14 vectors from {@code in} at {@code inpos}, {@code inpos + 4}, ... {@code inpos + 52} and performs 64 stores to {@code out}, advancing {@code outpos} by VLEN_128 after each one except the last. A field that lies inside one input word is extracted with a logical right shift followed by AND with MASK_7; a field that straddles two words is rebuilt by ORing the top bits of the current word with the low bits of the next word shifted into position. The shift schedule repeats after 7 input words (from {@code inpos + 28}). NOTE(review): the offsets assume 4 int lanes per SPECIES_128 vector and VLEN_128 == 4; both are declared outside this view, so confirm. */ private static void fastunpack7(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + /* straddling field: bits 28..31 of this word are its low 4 bits; the low 3 bits of the next word supply the rest */ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR,
13).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR,
29).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + +
iV.lanewise(VectorOperators.LSHR, 13).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.lanewise(VectorOperators.LSHR, 29).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos); + } + + /** Bit-packs the low 8 bits of every input lane, four values per 32-bit word. Loads 64 vectors from {@code in} at {@code inpos}, {@code inpos + 4}, ... {@code inpos + 252}; within each group of four, the vectors are ANDed with MASK_8 and placed at bit offsets 0, 8, 16 and 24, then the combined word is stored to {@code out} at {@code outpos} and {@code outpos} advances by VLEN_128 (16 stores in total). Four 8-bit fields fill a word exactly, so no value is split across words and there is no carry step. NOTE(review): assumes SPECIES_128 has 4 int lanes and VLEN_128 == 4 (256 ints in, 64 ints out); both are declared outside this view, so confirm. */ private static void fastpack8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* word is full after four fields, so the next group starts a fresh accumulator at bit 0 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV =
iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV =
iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_8); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV
= IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + 
oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 52); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + } + + private static void fastpack9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = 
iV.and(MASK_9).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 
25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, 
inpos + 120); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = 
iV.and(MASK_9).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = 
iV.and(MASK_9).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = 
iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 
11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 
18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks fixed-width 9-bit fields, treating each vector lane as an independent bit stream. Loads 18 SPECIES_128 vectors from {@code in} (offsets inpos, inpos + 4, ..., inpos + 68) and performs 64 vector stores to {@code out}: the first at {@code outpos}, each later one VLEN_128 ints further on. Within a loaded word, fields are taken from the least-significant bits upward; a field that straddles two words is rebuilt from the top bits of one word and the bottom bits of the next. No explicit bounds checks are performed here. NOTE(review): assumes MASK_9 keeps the low 9 bits and VLEN_128 equals the lane count of SPECIES_128 (both are defined outside this view) - confirm. @param in packed source array @param inpos index of the first packed int to read @param out destination array for the unpacked values @param outpos index of the first unpacked int to write */ private static void fastunpack9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_9); /* top 5 bits of this word: low part of a field that continues in the next word */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 5).or(oV); /* low 4 bits of the next word become bits 5..8 of that field */ + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
21).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 28).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + /* 9 words consumed and 32 fields emitted so far; the same shift sequence now repeats for the words at inpos + 36 .. inpos + 68 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 6).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, outpos); + } + + /** Packs values into fixed-width 10-bit fields, treating each vector lane as an independent bit stream. Loads 64 SPECIES_128 vectors from {@code in} (offsets inpos, inpos + 4, ..., inpos + 252), ANDs each with MASK_10, and ORs it into the current output word at a bit offset that advances by 10 per value. Each time a word is full it is stored to {@code out}: 20 stores in total, the first at {@code outpos} and each later one VLEN_128 ints further on. A value that crosses a word boundary is split: its low bits are placed with LSHL and its remaining high bits seed the next word via LSHR. No explicit bounds checks are performed here. NOTE(review): assumes MASK_10 keeps the low 10 bits and VLEN_128 equals the lane count of SPECIES_128 (both are defined outside this view) - confirm. @param in source array of unpacked values @param inpos index of the first value to read @param out destination array for the packed words @param outpos index of the first packed int to write */ private static void fastpack10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); /* only bits 30..31 are left in this word, so just the low 2 bits of the value land here */ + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); /* the bits of that value that did not fit start the next output word */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 28); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* 16 values filled exactly 5 words, so the bit offset is back to 0 and the same schedule repeats */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = 
iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = 
iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = 
iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = 
iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs values into 10-bit fields without masking the inputs, treating each vector lane as an independent bit stream. Loads 64 SPECIES_128 vectors from {@code in} (offsets inpos, inpos + 4, ..., inpos + 252) and ORs each into the current output word at a bit offset that advances by 10 per value. Each full word is stored to {@code out}: 20 stores in total, the first at {@code outpos} and each later one VLEN_128 ints further on. A value that crosses a word boundary is split with an LSHL into the current word and an LSHR into the next. Because no AND is applied, any input bit above bit 9 is ORed over the neighbouring field; presumably callers guarantee every value fits in 10 bits - TODO confirm against callers. No explicit bounds checks are performed here. NOTE(review): assumes VLEN_128 equals the lane count of SPECIES_128 (defined outside this view) - confirm. @param in source array of unpacked values @param inpos index of the first value to read @param out destination array for the packed words @param outpos index of the first packed int to write */ private static void fastpackNoMask10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); /* only the low 2 bits of this value fit in bits 30..31 of the word */ + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); /* carry: everything above the low 2 bits of that value opens the next word (unmasked, so bits above bit 9 would be carried as well) */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* 16 values filled exactly 5 words, so the bit offset is back to 0 and the same schedule repeats */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + 
+ iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
22).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, 
outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = 
iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + } + + /** Packs 11-bit fields lane-wise. Loads 64 SPECIES_128 vectors from {@code in} at offsets {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 252}, ANDs each with MASK_11, shifts it left to the running bit position and ORs it into an accumulator vector. Whenever the running position passes bit 31 the accumulator is stored to {@code out} and the bits that did not fit (obtained with LSHR) start the next accumulator. 22 vectors are stored in total, beginning at {@code outpos} and advancing by VLEN_128 after every store except the last. NOTE(review): MASK_11 and VLEN_128 are declared outside this view; presumably MASK_11 keeps the low 11 bits of each lane and VLEN_128 is 4 (the input offsets advance by 4) - confirm against the field declarations. No bounds checks are performed here beyond those of fromArray/intoArray. @param in source values @param inpos index of the first value read from {@code in} @param out destination for the packed words @param outpos index of the first word written to {@code out} */ private static void fastpack11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* at shift 22 only the low 10 bits of the third field fit in the 32-bit lane; its remaining high bit seeds the next output word */ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 24).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + 
+ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = 
iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = 
iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = 
iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + } + + /** Variant of the 11-bit packer that does not mask its inputs. Uses the same load offsets ({@code inpos} through {@code inpos + 252} in steps of 4), the same left-shift schedule and the same 22 stores as fastpack11, but the loaded lanes are shifted and OR-ed as they are, without an AND against MASK_11. Because nothing is masked, any bit above bit 10 of an input lane is OR-ed into the neighbouring fields of the output. NOTE(review): presumably callers only use this when every value is already known to fit in 11 bits - confirm at the call site, which is outside this view. @param in source values @param inpos index of the first value read from {@code in} @param out destination for the packed words @param outpos index of the first word written to {@code out} */ private static void fastpackNoMask11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the logical right shift drops the 10 bits already stored at shift 22 and keeps the rest of the third input as the start of the next word */ oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR,
9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV 
= iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 11-bit fields lane-wise. Loads 22 SPECIES_128 vectors from {@code in} at offsets {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 84} and writes 64 vectors to {@code out}, beginning at {@code outpos} and advancing by VLEN_128 after every store except the last. A field that lies entirely inside one input word is extracted with a logical right shift followed by an AND with MASK_11. A field that straddles two input words is rebuilt by OR-ing the high bits of the current word with the low bits of the next word, which are isolated with a small literal mask (1, 3, 7, ..., 1023) and shifted left into place. The shift schedule mirrors the one used by fastpack11. NOTE(review): MASK_11 and VLEN_128 are declared outside this view; presumably MASK_11 keeps the low 11 bits of each lane and VLEN_128 is 4 - confirm against the field declarations. @param in packed source words @param inpos index of the first word read from {@code in} @param out destination for the unpacked values @param outpos index of the first value written to {@code out} */ private static void fastunpack11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + /* bits 22..31 of this word are the low 10 bits of the third field; its top bit is bit 0 of the next input word and is OR-ed in at position 10 below */ var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out,
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + iV.and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11); + + iV 
= IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos); + } + + /** Packs 12-bit fields: loads 64 SPECIES_128 vectors from {@code in} at offsets inpos + 0, 4, ..., 252, masks each with MASK_12, and stores 24 packed vectors to {@code out} starting at {@code outpos}, advancing by VLEN_128 after each store. Packing is lane-wise: every 8 input vectors fill 3 output vectors at bit offsets 0/12/24, then 4/16/28, then 8/20, and the bits of a field that do not fit in the current output vector are carried into the next one. NOTE(review): assumes VLEN_128 equals the SPECIES_128 lane count (4 ints) - both are declared outside this view, confirm. @param in source array @param inpos index of the first value to read @param out destination array @param outpos index of the first packed word to write */ private static void fastpack12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 8); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* only the low 8 bits of the field shifted to bit 24 fit in the stored word; its top 4 bits start the next word */ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* only the low 4 bits of the field shifted to bit 28 fit; its top 8 bits start the next word */ oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = 
iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = 
iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV 
= iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 12-bit fields without masking the inputs: loads 64 SPECIES_128 vectors from {@code in} at offsets inpos + 0, 4, ..., 252 and stores 24 packed vectors to {@code out} starting at {@code outpos}, advancing by VLEN_128 after each store. Every 8 input vectors fill 3 output vectors lane-wise at bit offsets 0/12/24, then 4/16/28, then 8/20, with the overflowing bits carried into the next output vector. Because no mask is applied, any bits above the low 12 of an input value are OR-ed into the neighbouring fields, so the result is only a valid 12-bit packing when every input already fits in 12 bits. NOTE(review): assumes VLEN_128 equals the SPECIES_128 lane count (4 ints) - both are declared outside this view, confirm. @param in source array @param inpos index of the first value to read @param out destination array @param outpos index of the first packed word to write */ private static void fastpackNoMask12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* carry: the part of the field shifted to bit 24 that did not fit in the stored word starts the next word */ oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 
16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 12-bit fields: loads 24 SPECIES_128 vectors from {@code in} at offsets inpos + 0, 4, ..., 92 and stores 64 vectors of 12-bit values to {@code out} starting at {@code outpos}, advancing by VLEN_128 after each store. Every 3 packed vectors yield 8 output vectors lane-wise: fields are read at bit offsets 0/12/24, then 4/16/28, then 8/20, and a field that straddles two packed vectors is rebuilt by OR-ing its low part from the current vector with its high part from the next one. NOTE(review): assumes VLEN_128 equals the SPECIES_128 lane count (4 ints) - both are declared outside this view, confirm. @param in packed source array @param inpos index of the first packed word to read @param out destination array @param outpos index of the first unpacked value to write */ private static void fastunpack12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + /* bits 24..31 gave the low 8 bits of this field; the low 4 bits of the next word are its top 4 bits */ oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + 
iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, 
outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
56); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + } + + private static void fastpack13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = 
iV.and(MASK_13).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 9).or(oV); + + 
iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV 
= iV.and(MASK_13).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = 
iV.and(MASK_13).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = 
iV.and(MASK_13).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 
12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 
21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
144); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = 
iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); 
+ + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, outpos); + outpos 
+= VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 24).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + iV.and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, outpos); + } + + private static void fastpack14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 44); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 
10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs the ints read from {@code in}, starting at {@code inpos}, into {@code out}, starting at {@code outpos}, at 14 bits per value and without masking the inputs. The input is read as 64 SPECIES_128 vectors (offsets 0 to 252 in steps of 4) and 28 vector stores are made to {@code out}; {@code outpos} advances by VLEN_128 after every store except the last. Lanes are packed independently: lane k of each input vector ends up in lane k of the output vectors. Because no mask is applied, any bits set above bit 13 of an input would be OR-ed into neighbouring packed values; presumably callers guarantee that every value fits in 14 bits (TODO confirm against the call site). NOTE(review): assumes SPECIES_128 holds 4 int lanes and VLEN_128 is 4, matching the input stride; both are defined outside this view. */ private static void fastpackNoMask14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* Only the low 4 bits of the third input fit below bit 32 after the shift by 28; its remaining bits (4 and up) start the next output word. */ oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = 
iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 
10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos 
+= VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); 
+ oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 14-bit values from {@code in}, starting at {@code inpos}, into {@code out}, starting at {@code outpos}. Reads 28 SPECIES_128 vectors (offsets 0 to 108 in steps of 4) and performs 64 vector stores, each carrying one 14-bit value per lane; {@code outpos} advances by VLEN_128 after every store except the last. A value that straddles two packed words is rebuilt by OR-ing the leftover high bits of the current word with the low bits of the next word, which are selected with the literal masks (1023, 63, 3, 4095, 255, 15) and shifted into position. The bit positions read here (0, 14, 28, then 10, 24, and so on) are the same ones the 14-bit pack routines write. NOTE(review): assumes SPECIES_128 holds 4 int lanes, VLEN_128 is 4 and MASK_14 is (1 << 14) - 1; all three are defined outside this view. */ private static void fastunpack14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + /* Bits 28..31 hold only the low 4 bits of the third value; the low 10 bits of the next word (mask 1023) supply the rest. */ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 
8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 
6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + } + + /** Packs the ints read from {@code in}, starting at {@code inpos}, into {@code out}, starting at {@code outpos}, at 15 bits per value. Every input vector is AND-ed with MASK_15 before it is shifted into place, so bits outside the mask never reach the output. The input is read as 64 SPECIES_128 vectors (offsets 0 to 252 in steps of 4) and 30 vector stores are made to {@code out}; {@code outpos} advances by VLEN_128 after every store except the last. Lanes are packed independently: lane k of each input vector ends up in lane k of the output vectors. NOTE(review): assumes SPECIES_128 holds 4 int lanes, VLEN_128 is 4 and MASK_15 is (1 << 15) - 1; all three are defined outside this view. */ private static void fastpack15(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* Only the low 2 bits of the third input fit below bit 32 after the shift by 30; its remaining masked bits start the next output word. */ oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = 
iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
212); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask15(final int[] in, int inpos, 
final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); 
+ + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, 
inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = 
iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack15(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = 
iV.and(255).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = 
iV.and(2047).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 31).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, outpos); + } + + private static void fastpack16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = 
iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); 
+ oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_16); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
100); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 16-bit fields with fully unrolled SPECIES_128 vector code. Loads 32 vectors from {@code in} at offsets {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 124}; for each one it stores the lanes ANDed with MASK_16 and then the lanes shifted right by 16 (also ANDed with MASK_16), giving 64 stores into {@code out} that start at {@code outpos} and are spaced VLEN_128 apart. The parameters {@code inpos} and {@code outpos} are only modified locally. NOTE(review): MASK_16 and VLEN_128 are declared outside this hunk; presumably 0xFFFF per lane and 4 - confirm. @param in array holding the packed words @param inpos index of the first packed word to read @param out array receiving the unpacked values @param outpos index of the first unpacked value to write */ private static void fastunpack16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + /* low 16 bits of every lane form one output vector */ iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + /* high 16 bits of the same lanes form the next output vector */ iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + iV.and(MASK_16).intoArray(out, outpos); + outpos 
+= VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + } + + /** Packs 17-bit fields with fully unrolled SPECIES_128 vector code. Loads 64 vectors from {@code in} at offsets {@code inpos} through {@code inpos + 252} in steps of 4, ANDs each with MASK_17, and ORs them at successive 17-bit positions into 32-bit output lanes. When a field does not fit in the bits left in a lane, the completed vector is stored and the field's remaining high bits (obtained with LSHR) seed the next output vector. The shift schedule restarts at input offset 128, so every 32 input vectors produce 17 stores (34 stores in total), written from {@code outpos} and spaced VLEN_128 apart. The parameters {@code inpos} and {@code outpos} are only modified locally. NOTE(review): MASK_17 and VLEN_128 are declared outside this hunk; presumably 0x1FFFF per lane and 4 - confirm. @param in array holding the values to pack @param inpos index of the first value to read @param out array receiving the packed words @param outpos index of the first packed word to write */ private static void fastpack17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = 
IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 
27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); 
+ oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = 
iV.and(MASK_17).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 5); + + 
iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; 
+ + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + } + + /** Variant of {@code fastpack17} that follows the same load, shift and store schedule (64 input vectors at offsets {@code inpos} through {@code inpos + 252}, 34 stores from {@code outpos}) but omits the AND with MASK_17, so the input lanes are OR-ed in exactly as read. Any input bit above bit 16 would therefore land in a neighbouring field. NOTE(review): callers presumably guarantee that every value already fits in 17 bits - confirm. The parameters {@code inpos} and {@code outpos} are only modified locally. @param in array holding the values to pack @param inpos index of the first value to read @param out array receiving the packed words @param outpos index of the first packed word to write */ private static void fastpackNoMask17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + /* second field starts at bit 17 of each lane; bits shifted past bit 31 are dropped here */ oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
200); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = 
iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = 
iV.and(1023).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + iV.and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_17); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_17); + 
+ iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = 
iV.and(511).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos); + } + + /** Packs 18-bit fields lane-wise. Loads 64 SPECIES_128 vectors from {@code in} at offsets inpos+0 through inpos+252 (step 4), ANDs each with MASK_18, and performs 36 vector stores into {@code out}, advancing outpos by VLEN_128 after every store except the last. Each output vector is built by OR-ing masked inputs shifted left (LSHL) to their bit position; when a field does not fit in the current output vector, its remaining high bits (LSHR of the masked input) seed the next one. The shift pattern repeats every 16 input vectors (9 stores). Assumes MASK_18 is the low-18-bits mask, by analogy with the MASK_n definitions at the top of the file, and that VLEN_128 equals the SPECIES_128 lane count; both are defined outside this chunk - TODO confirm. No explicit bounds checks are made here. */ private static void fastpack18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 
2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = 
iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = 
iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 18-bit fields lane-wise without masking the inputs. Loads 64 SPECIES_128 vectors from {@code in} at offsets inpos+0 through inpos+252 (step 4) and performs 36 vector stores into {@code out}, advancing outpos by VLEN_128 after every store except the last. Inputs are shifted (LSHL into position, LSHR for the carry into the next output vector) and OR-ed exactly as loaded, so any bits above bit 17 of an input would be OR-ed into neighbouring fields. Callers presumably guarantee every value already fits in 18 bits - verify against callers. The shift pattern repeats every 16 input vectors (9 stores). Assumes VLEN_128 equals the SPECIES_128 lane count, defined outside this chunk - TODO confirm. No explicit bounds checks are made here. */ private static void fastpackNoMask18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 
10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = 
iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 
16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + iV.and(MASK_18).intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + }
+
+  /**
+   * Packs 64 {@code SPECIES_128} vectors read from {@code in} into 19-bit fields stored in
+   * {@code out}. Each loaded vector is ANDed with {@code MASK_19} before it is packed.
+   *
+   * <p>A bit cursor tracks how many low bits of the pending output vector are occupied. Every
+   * input vector is shifted left to the cursor and OR-ed in; once 32 bits are occupied the
+   * pending vector is stored and the bits that did not fit start the next one. The cursor
+   * returns to zero after every 32 inputs (32 * 19 bits = 19 words), so 38 vectors are stored.
+   *
+   * <p>NOTE(review): the load stride of 4 implies four int lanes per vector, i.e. 256 values
+   * per call, and {@code MASK_19} is presumably {@code (1 << 19) - 1}; both are defined outside
+   * this method, so confirm there.
+   *
+   * @param in source array; vector k is loaded from {@code in[inpos + 4 * k]}, k = 0..63
+   * @param inpos index of the first value to pack
+   * @param out destination array
+   * @param outpos index of the first packed word; advanced by {@code VLEN_128} after a store
+   */
+  private static void fastpack19(final int[] in, int inpos, final int[] out, int outpos) {
+    IntVector pending = null; // output vector being assembled; null when nothing is pending
+    int filled = 0; // occupied low bits in each lane of the pending vector
+    for (int k = 0; k < 64; k++) {
+      final IntVector value = IntVector.fromArray(SPECIES_128, in, inpos + 4 * k).and(MASK_19);
+      if (filled == 0) {
+        pending = value;
+      } else {
+        pending = value.lanewise(VectorOperators.LSHL, filled).or(pending);
+      }
+      filled += 19;
+      if (filled >= 32) {
+        pending.intoArray(out, outpos);
+        outpos += VLEN_128;
+        filled -= 32;
+        // The high bits of this value that spilled past bit 31 seed the next output vector.
+        pending = (filled == 0) ? null : value.lanewise(VectorOperators.LSHR, 19 - filled);
+      }
+    }
+  }
+
+  /**
+   * Same packing schedule as {@code fastpack19}, but the loaded vectors are used as they are:
+   * no mask is applied before shifting, so any bits above the low 19 of an input are OR-ed
+   * into the neighbouring fields unchanged.
+   *
+   * @param in source array; vector k is loaded from {@code in[inpos + 4 * k]}, k = 0..63
+   * @param inpos index of the first value to pack
+   * @param out destination array
+   * @param outpos index of the first packed word; advanced by {@code VLEN_128} after a store
+   */
+  private static void fastpackNoMask19(final int[] in, int inpos, final int[] out, int outpos) {
+    IntVector pending = null; // output vector being assembled; null when nothing is pending
+    int filled = 0; // occupied low bits in each lane of the pending vector
+    for (int k = 0; k < 64; k++) {
+      final IntVector value = IntVector.fromArray(SPECIES_128, in, inpos + 4 * k);
+      if (filled == 0) {
+        pending = value;
+      } else {
+        pending = value.lanewise(VectorOperators.LSHL, filled).or(pending);
+      }
+      filled += 19;
+      if (filled >= 32) {
+        pending.intoArray(out, outpos);
+        outpos += VLEN_128;
+        filled -= 32;
+        // Logical shift keeps exactly the part of this value that did not fit in the word.
+        pending = (filled == 0) ? null : value.lanewise(VectorOperators.LSHR, 19 - filled);
+      }
+    }
+  }
+
+ private static void fastunpack19(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); +
iV.and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + 
oV = iV.and(15).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos); 
+ outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + iV.and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
6).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); 
+ oV = iV.and(1023).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos); + } + + private static void fastpack20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = 
iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + 
+ oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV 
= iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_20).intoArray(out, outpos); + outpos += 
VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + } + + /** Bit-packs 64 input vectors, loaded from in at inpos, inpos + 4, ..., inpos + 252, into 42 output vectors stored to out starting at outpos (outpos advances by VLEN_128 after each store). Every input is ANDed with MASK_21 before use. Within each lane, 32 consecutive masked values are concatenated least-significant-bit first into 21 output words; a value that straddles a word boundary is split across the two words. NOTE(review): assumes SPECIES_128 has 4 int lanes, VLEN_128 == 4 and MASK_21 == (1 << 21) - 1 (so 256 ints in, 168 ints out); those constants are declared outside this excerpt - confirm. @param in source array @param inpos index in in of the first value read @param out destination array @param outpos index in out of the first word written */ private static void fastpack21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the shift by 21 kept only the low 11 bits of that value in the stored word; its remaining high bits open the next word */ oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* 32 inputs have filled exactly 21 output vectors; the identical shift sequence now repeats for inputs 128..252 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos
+ 144); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15);
+ + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); +
oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + } + + /** Bit-packs 64 input vectors, loaded from in at inpos, inpos + 4, ..., inpos + 252, into 42 output vectors stored to out starting at outpos (outpos advances by VLEN_128 after each store), using a 21-bits-per-value shift sequence in which the inputs are NOT ANDed with any mask. Within each lane, 32 consecutive values are concatenated least-significant-bit first into 21 output words. Because nothing is masked, any input bits above bit 20 are ORed into the neighbouring packed fields; presumably callers guarantee every value already fits in 21 bits - TODO confirm against callers. NOTE(review): assumes SPECIES_128 has 4 int lanes and VLEN_128 == 4 (so 256 ints in, 168 ints out); those constants are declared outside this excerpt - confirm. @param in source array @param inpos index in in of the first value read @param out destination array @param outpos index in out of the first word written */ private static void fastpackNoMask21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* unmasked carry: every bit of this value from bit 11 upward moves into the next word */ oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV =
iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out,
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* 32 inputs have filled exactly 21 output vectors; the identical shift sequence now repeats for inputs 128..252 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL,
20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos +=
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV);
+ + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out,
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
2).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; 
+ + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
2).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos); + } + + private static void fastpack22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV 
= IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
140); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 
30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 
8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = 
iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, 
inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = 
iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 
12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + 
+ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = 
iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = 
iV.and(1023).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + } + + private static void fastpack23(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = 
iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); 
+ oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV 
= IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = 
iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask23(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = 
iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 
15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack23(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
21).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + iV.and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = 
iV.and(15).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos); + } + + private static void fastpack24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = 
iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; 
+ + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks one block of 24-bit packed values with SPECIES_128 vectors. Loads 48 vectors from in (offsets inpos, inpos + 4, ... inpos + 188) and stores 64 vectors to out, advancing outpos by VLEN_128 after every store except the last. Each group of three packed words yields four values per lane: the low 24 bits of word 0; the top 8 bits of word 0 joined with the low 16 bits of word 1; the top 16 bits of word 1 joined with the low 8 bits of word 2; and the top 24 bits of word 2. The sixteen groups are written out as straight-line code. NOTE(review): SPECIES_128, VLEN_128 and MASK_24 are declared outside this chunk; presumably a 4-lane int species, 4, and (1 << 24) - 1, which would make the touched ranges in[inpos .. inpos + 191] and out[outpos .. outpos + 255] - confirm. */ private static void fastunpack24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + /* out 0: low 24 bits of word 0 */ iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + /* out 1: top 8 bits of word 0, then the low 16 bits of word 1 placed above them */ var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* out 2: top 16 bits of word 1, then the low 8 bits of word 2 placed above them */ oV =
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* out 3: top 24 bits of word 2; the next group starts on a freshly loaded word */ iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV =
iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR,
24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); +
iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out,
outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos);
+ outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + } + + /** Packs 25 bits per value, lane by lane, with SPECIES_128 vectors. Loads 64 vectors from in (offsets inpos, inpos + 4, ... inpos + 252), masks each with MASK_25, and stores 50 vectors to out, advancing outpos by VLEN_128 after every store except the last. A value that straddles a 32-bit word boundary is split: LSHL ORs its low bits into the word being completed, and after the store LSHR seeds the next word with the bits that did not fit. The shift schedule returns to zero after 32 values, so the second half (from inpos + 128) repeats the first half's shifts. NOTE(review): SPECIES_128, VLEN_128 and MASK_25 are declared outside this chunk; presumably a 4-lane int species, 4, and (1 << 25) - 1, which would make the touched ranges in[inpos .. inpos + 255] and out[outpos .. outpos + 199] - confirm. */ private static void fastpack25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV); + + /* word 0 is full: value 0 in bits 0-24, low 7 bits of value 1 in bits 25-31 */ oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the 18 remaining high bits of value 1 start the next word */ oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV); + + /* the previous value fits whole in bits 4-28, so the word is stored only after the next value tops it up */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + +
oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV =
iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* second half: no bits are carried over, so the shift schedule restarts on a fresh output word */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7); +
+ iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV =
iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 25-bit fields back into full ints using SPECIES_128 vectors. Reads 50 vectors from in at inpos, inpos + 4, ..., inpos + 196 and performs 64 stores to out starting at outpos, advancing outpos by VLEN_128 after every store except the last. Each lane is decoded as its own bit stream: a field that straddles two input words is rebuilt by OR-ing the high bits of the previous word (LSHR) with the masked low bits of the next word (LSHL). NOTE(review): assumes VLEN_128 is 4 and MASK_25 is (1 shl 25) - 1 broadcast over SPECIES_128, which matches the 4-int stride of the loads; confirm against the field declarations. @param in array holding the packed words @param inpos index of the first packed word @param out array receiving the unpacked values @param outpos index of the first unpacked value */ private static void fastunpack25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + /* 262143 = 2^18 - 1: the low 18 bits of this word are placed above the 7 bits carried from the previous word */ oV = iV.and(262143).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25); + + 
iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = 
iV.and(1023).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + /* bit offset is back to 0 here: the same 25-load schedule repeats for the second half of the output */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + iV.and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = 
iV.and(15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + 
iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
31).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos); + } + + /** Packs ints into 26-bit fields using SPECIES_128 vectors. Reads 64 vectors from in at inpos, inpos + 4, ..., inpos + 252, masks every one of them with MASK_26, and performs 52 stores to out starting at outpos, advancing outpos by VLEN_128 after every store except the last. The shift schedule repeats every 16 loads (13 stores): LSHL places a masked value at its bit offset in the word being built, and when the value does not fit, LSHR of the same masked value seeds the next word with the bits that spilled over. NOTE(review): assumes VLEN_128 is 4 and MASK_26 is (1 shl 26) - 1 broadcast over SPECIES_128, which matches the 4-int stride of the loads; confirm against the field declarations. @param in array holding the values to pack @param inpos index of the first value @param out array receiving the packed words @param outpos index of the first packed word */ private static void fastpack26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_26); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* bit offset is back to 0 here: the 16-load schedule starts over (again at inpos + 128 and inpos + 192) */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 
28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + 
+ iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = 
iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); 
+ oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 52 SPECIES_128 vectors read from {@code in} (offsets 0, 4, ..., 204 relative to {@code inpos}) into 64 vectors of 26-bit values written to {@code out} starting at {@code outpos}; the bit layout mirrors the one written by fastpack26 and fastpackNoMask26. Each output is either a shifted and masked slice of a single input word, or the remaining high bits of the previous word (LSHR) OR-ed with the low bits of the next word (and-ed with a literal of the form 2^k - 1, then LSHL). The body is four identical groups of 13 loads and 16 stores; outpos advances by VLEN_128 after every store except the last one. NOTE(review): MASK_26, SPECIES_128 and VLEN_128 are declared outside this excerpt; MASK_26 is presumably (1 << 26) - 1 in every lane and the load stride of 4 suggests 4 int lanes per vector; confirm. @param in source array of packed words @param inpos index in {@code in} of the first packed word read @param out destination array for the unpacked values @param outpos index in {@code out} of the first value written */ private static void fastunpack26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 4); + /* 1048575 = 2^20 - 1: the low 20 bits of this word are shifted up by 6 to sit above the 6 bits left over from the previous word */ oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos); +
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + /* group 2 of 4: the 13 input vectors at offsets 52..100 start on a word boundary, so nothing is carried over from the previous group */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); +
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + /* group 3 of 4: the 13 input vectors at offsets 104..152 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos +=
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + /* group 4 of 4: the 13 input vectors at offsets 156..204 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + +
oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV =
iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + } + + private static void fastpack27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = 
iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = 
iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 27-bit values without masking the inputs. Performs 64 vector loads from {@code in} at {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 252} and 54 vector stores to {@code out} starting at {@code outpos}, advancing by {@code VLEN_128} between stores. Every lane is packed independently: each value is OR-ed in at the next free bit offset of the 32-bit output word (0, 27, 22, 17, ...), and the bits that do not fit are carried into the next word with a logical right shift. After 32 loads (27 stores) the bit offset is back at 0 and the same pattern repeats for the remaining 32 loads. Because nothing is masked, any input bit above bit 26 is OR-ed into a neighbouring value's field, so callers must pass values that already fit in 27 bits. NOTE(review): the stride-4 offsets imply that SPECIES_128 has 4 int lanes and VLEN_128 == 4 (i.e. 256 ints read, 216 ints written); both are declared outside this chunk - confirm. @param in source array holding the values to pack @param inpos index in {@code in} of the first value read @param out destination array receiving the packed words @param outpos index in {@code out} of the first word written */ private static void fastpackNoMask27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; /* value 0 fills bits 0..26 of the first output word */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); /* low 5 bits of value 1 land in bits 27..31 */ + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); /* remaining 22 bits of value 1 start the next word */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out,
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); 
+ + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 27-bit values, reversing the bit layout written by the 27-bit pack kernels. Performs 54 vector loads from {@code in} at {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 212} and 64 vector stores to {@code out} starting at {@code outpos}, advancing by {@code VLEN_128} between stores. Every lane is decoded independently: a value that lies entirely inside one 32-bit word is recovered with a logical right shift followed by {@code MASK_27}; a value that straddles two words takes its low bits from the top of the current word (logical right shift) and its high bits from the bottom of the next word, which is first masked with a literal 2^k - 1 constant (e.g. 4194303 = 2^22 - 1) and then shifted left by the number of low bits already taken. The shift pattern repeats after 27 loads (32 stores). NOTE(review): the stride-4 offsets imply that SPECIES_128 has 4 int lanes and VLEN_128 == 4 (i.e. 216 ints read, 256 ints written), and MASK_27 is presumably a broadcast of (1 << 27) - 1; all three are declared outside this chunk - confirm. @param in source array holding the packed words @param inpos index in {@code in} of the first word read @param out destination array receiving the unpacked values @param outpos index in {@code out} of the first value written */ private static void fastunpack27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_27).intoArray(out, outpos); /* value 0 sits in the low bits of word 0 */ + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27); /* top 5 bits of word 0 are the low 5 bits of value 1 */ + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV =
iV.and(4194303).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = 
iV.and(511).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
28).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
15).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + iV.and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27); + 
+ iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
16).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 
3).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos); + } + + /** Packs 28-bit values lane-wise: reads 64 SPECIES_128 vectors from {@code in} at offsets inpos + 0 .. inpos + 252 (step 4), ANDs each with MASK_28, and merges every run of 8 input vectors into 7 output vectors by shift/OR, storing 56 vectors to {@code out} with outpos advanced by VLEN_128 between stores. NOTE(review): SPECIES_128, VLEN_128 and MASK_28 are declared outside this view - presumably 4 int lanes, 4, and a broadcast of (1 << 28) - 1, which would make this 256 ints in and 224 ints out; confirm. @param in source values @param inpos index of the first value read from in @param out destination for the packed words @param outpos index of the first word written to out */ private static void fastpack28(final int[] in, int inpos, final int[] out, int outpos) { + /* first group of 8 input vectors: inpos + 0 .. inpos + 28 */ var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* only 4 bits of that value fit in the word just stored; its remaining upper bits start the next word */ oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 32 .. inpos + 60 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV =
iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 64 .. inpos + 92 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 96 .. inpos + 124 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 128 .. inpos + 156 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 160 .. inpos + 188 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV =
iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 192 .. inpos + 220 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128,
in, inpos + 200); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* last group of 8 input vectors: inpos + 224 .. inpos + 252 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV =
iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + } + + /** Lane-wise 28-bit packing with the same shift/OR schedule as {@code fastpack28} (every run of 8 input vectors read at inpos + 0 .. inpos + 252 becomes 7 stored vectors, outpos advancing by VLEN_128 between stores) but with no AND against MASK_28. Because nothing is masked, any bit set above bit 27 of an input is shifted/OR-ed into the neighbouring packed fields; callers presumably guarantee that every value already fits in 28 bits - confirm at the call site. NOTE(review): SPECIES_128 and VLEN_128 are declared outside this view (presumably 4 int lanes and 4). @param in source values @param inpos index of the first value read from in @param out destination for the packed words @param outpos index of the first word written to out */ private static void fastpackNoMask28(final int[] in, int inpos, final int[] out, int outpos) { + /* first group of 8 input vectors: inpos + 0 .. inpos + 28 */ var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the bits of that value that did not fit in the word just stored start the next word */ oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos);
+ outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 32 .. inpos + 60 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR,
24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 64 .. inpos + 92 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 96 .. inpos + 124 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV =
IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 128 .. inpos + 156 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV
= iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 160 .. inpos + 188 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* next group of 8 input vectors: inpos + 192 .. inpos + 220 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* last group of 8 input vectors: inpos + 224 .. inpos + 252 */ iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); +
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack28(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV =
iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = 
iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 112); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, 
in, inpos + 180); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + } + + /** Packs 29-bit values lane by lane using SPECIES_128 vectors. Reads 64 vectors from {@code in} at offsets inpos, inpos + 4, ..., inpos + 252, ANDs each with MASK_29, and concatenates them, low bits first within each lane, into 58 vectors stored to {@code out} starting at outpos (outpos advances by VLEN_128 after every store except the last). A value that straddles a 32-bit word boundary is split in two: the LSHL places its low part at the top of the current output word and the following LSHR carries its remaining high bits into the next word. Where only 2 (or 1) bits of the current word are filled, two inputs are merged before the store (shift pairs 2/31 and 1/30). The bit cursor is back at 0 after 32 inputs, so the same shift schedule runs twice: once for offsets 0..124 and again for 128..252. NOTE(review): MASK_29 is presumably a broadcast of (1 << 29) - 1 and SPECIES_128 presumably holds 4 ints, by analogy with the other constants in this class; their definitions are outside this view. */ private static void fastpack29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + 
oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 10); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_29); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 19); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + } + + /** Same 29-bit lane-wise packing schedule as fastpack29, but the inputs are not ANDed with any mask before being shifted and OR-ed together. Reads 64 vectors from {@code in} at offsets inpos, inpos + 4, ..., inpos + 252 and performs 58 vector stores to {@code out} starting at outpos (outpos advances by VLEN_128 after every store except the last). Because nothing clears bits 29..31 of an input, any such bits are OR-ed into the bits belonging to the next packed value. NOTE(review): callers are therefore presumably required to pass values that already fit in 29 bits; confirm against the call sites, which are outside this view. */ private static void fastpackNoMask29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); 
+ + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 
21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 
16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 
11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + iV.and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
23).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 
28).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = 
iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + 
oV = iV.and(63).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos); + } + + /** Packs values from {@code in} into {@code out} at 30 bits per value, masking every input lane with {@code MASK_30} first. Performs 64 vector loads at {@code inpos}, {@code inpos + 4}, ..., {@code inpos + 252} and 60 vector stores beginning at {@code outpos}; {@code outpos} is advanced by {@code VLEN_128} after every store except the last. The work is four identical groups whose loads start at {@code inpos + 0}, {@code + 64}, {@code + 128} and {@code + 192}; each group folds 16 input vectors into 15 output vectors. Within a group the newly loaded vector is shifted left by 30, 28, ..., 2 to fill the top of the current output word, and the bits that did not fit (the same vector shifted right by 2, 4, ..., 28) seed the next output word. Every operation is lanewise, so each lane forms an independent bit stream. The method itself does no range validation of {@code inpos} or {@code outpos}. NOTE(review): assumes SPECIES_128 is the 4-lane int species, VLEN_128 == 4 and MASK_30 holds (1 << 30) - 1 in every lane; those definitions are outside this chunk, please confirm. Under that assumption the method consumes 256 ints and produces 240. */ private static void fastpack30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + /* LSHL 30 keeps only the low 2 bits of the second value, placing them in bits 30-31 of the first output word */ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + /* the bits of that value that did not fit carry into the next output word */ oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + 
+ oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = 
iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = 
iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = 
iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV 
= iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, 
inpos + 232); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + } + + private static void fastpack31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV.and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 14); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); 
+ outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastpackNoMask31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = 
iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 116); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = 
iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = 
iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 252); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + } + + private static void fastunpack31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_128, in, inpos); + iV.and(MASK_31).intoArray(out, outpos); + outpos += VLEN_128; + + var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 4); + oV = iV.and(1073741823).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 8); + oV = iV.and(536870911).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 12); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31); + + iV = 
IntVector.fromArray(SPECIES_128, in, inpos + 16); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 20); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 24); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 28); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 32); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 36); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 40); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 44); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 48); + oV = 
iV.and(524287).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 52); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 56); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 60); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 64); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 68); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 72); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 76); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 80); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 84); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 88); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 92); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 96); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 100); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 104); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 108); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 112); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 
116); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 120); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos); + outpos += VLEN_128; + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 124); + iV.and(MASK_31).intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 128); + oV = iV.and(1073741823).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 132); + oV = iV.and(536870911).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 136); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 140); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 144); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 148); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); 
+ outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 152); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 156); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 160); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 164); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 168); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 172); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 176); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 180); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31); + 
+ iV = IntVector.fromArray(SPECIES_128, in, inpos + 184); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 188); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 192); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 196); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 200); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 204); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 208); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 212); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 216); + oV = 
iV.and(255).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 220); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 224); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 228); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 232); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 236); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 240); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_128, in, inpos + 244); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_128; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos); + } + +} diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPacker256.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker256.java new 
file mode 100644 index 0000000..02596f5 --- /dev/null +++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPacker256.java @@ -0,0 +1,16226 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import java.util.Arrays; +import jdk.incubator.vector.*; + +/** + * Vectorized bit-packing routines using 256-bit (8 x int32) vectors. + * + * A 256-integer block is packed across 8 SIMD lanes, each lane packing 32 + * values into b 32-bit words. Selected at runtime by VectorBitPackerKernels + * when the preferred hardware vector width is 256 bits (e.g. AVX2, Graviton3), + * where the 512-bit paths of VectorBitPacker fall back to slow emulation. + */ +public class VectorBitPacker256 implements VectorBitPackerKernels { + private static final VectorSpecies SPECIES_256 = + IntVector.SPECIES_256; + private static final int VLEN_256 = 8; + private static final int BLOCK_SIZE = 256; + + private static final IntVector MASK_1 = + IntVector.broadcast(SPECIES_256, (1 << 1) - 1); + private static final IntVector MASK_2 = + IntVector.broadcast(SPECIES_256, (1 << 2) - 1); + private static final IntVector MASK_3 = + IntVector.broadcast(SPECIES_256, (1 << 3) - 1); + private static final IntVector MASK_4 = + IntVector.broadcast(SPECIES_256, (1 << 4) - 1); + private static final IntVector MASK_5 = + IntVector.broadcast(SPECIES_256, (1 << 5) - 1); + private static final IntVector MASK_6 = + IntVector.broadcast(SPECIES_256, (1 << 6) - 1); + private static final IntVector MASK_7 = + IntVector.broadcast(SPECIES_256, (1 << 7) - 1); + private static final IntVector MASK_8 = + IntVector.broadcast(SPECIES_256, (1 << 8) - 1); + private static final IntVector MASK_9 = + IntVector.broadcast(SPECIES_256, (1 << 9) - 1); + private static final IntVector MASK_10 = + IntVector.broadcast(SPECIES_256, (1 << 10) - 1); + private static final 
IntVector MASK_11 = + IntVector.broadcast(SPECIES_256, (1 << 11) - 1); + private static final IntVector MASK_12 = + IntVector.broadcast(SPECIES_256, (1 << 12) - 1); + private static final IntVector MASK_13 = + IntVector.broadcast(SPECIES_256, (1 << 13) - 1); + private static final IntVector MASK_14 = + IntVector.broadcast(SPECIES_256, (1 << 14) - 1); + private static final IntVector MASK_15 = + IntVector.broadcast(SPECIES_256, (1 << 15) - 1); + private static final IntVector MASK_16 = + IntVector.broadcast(SPECIES_256, (1 << 16) - 1); + private static final IntVector MASK_17 = + IntVector.broadcast(SPECIES_256, (1 << 17) - 1); + private static final IntVector MASK_18 = + IntVector.broadcast(SPECIES_256, (1 << 18) - 1); + private static final IntVector MASK_19 = + IntVector.broadcast(SPECIES_256, (1 << 19) - 1); + private static final IntVector MASK_20 = + IntVector.broadcast(SPECIES_256, (1 << 20) - 1); + private static final IntVector MASK_21 = + IntVector.broadcast(SPECIES_256, (1 << 21) - 1); + private static final IntVector MASK_22 = + IntVector.broadcast(SPECIES_256, (1 << 22) - 1); + private static final IntVector MASK_23 = + IntVector.broadcast(SPECIES_256, (1 << 23) - 1); + private static final IntVector MASK_24 = + IntVector.broadcast(SPECIES_256, (1 << 24) - 1); + private static final IntVector MASK_25 = + IntVector.broadcast(SPECIES_256, (1 << 25) - 1); + private static final IntVector MASK_26 = + IntVector.broadcast(SPECIES_256, (1 << 26) - 1); + private static final IntVector MASK_27 = + IntVector.broadcast(SPECIES_256, (1 << 27) - 1); + private static final IntVector MASK_28 = + IntVector.broadcast(SPECIES_256, (1 << 28) - 1); + private static final IntVector MASK_29 = + IntVector.broadcast(SPECIES_256, (1 << 29) - 1); + private static final IntVector MASK_30 = + IntVector.broadcast(SPECIES_256, (1 << 30) - 1); + private static final IntVector MASK_31 = + IntVector.broadcast(SPECIES_256, (1 << 31) - 1); + + @Override + public void fastpack(final int[] 
in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + break; + case 1: + fastpack1(in, inpos, out, outpos); + break; + case 2: + fastpack2(in, inpos, out, outpos); + break; + case 3: + fastpack3(in, inpos, out, outpos); + break; + case 4: + fastpack4(in, inpos, out, outpos); + break; + case 5: + fastpack5(in, inpos, out, outpos); + break; + case 6: + fastpack6(in, inpos, out, outpos); + break; + case 7: + fastpack7(in, inpos, out, outpos); + break; + case 8: + fastpack8(in, inpos, out, outpos); + break; + case 9: + fastpack9(in, inpos, out, outpos); + break; + case 10: + fastpack10(in, inpos, out, outpos); + break; + case 11: + fastpack11(in, inpos, out, outpos); + break; + case 12: + fastpack12(in, inpos, out, outpos); + break; + case 13: + fastpack13(in, inpos, out, outpos); + break; + case 14: + fastpack14(in, inpos, out, outpos); + break; + case 15: + fastpack15(in, inpos, out, outpos); + break; + case 16: + fastpack16(in, inpos, out, outpos); + break; + case 17: + fastpack17(in, inpos, out, outpos); + break; + case 18: + fastpack18(in, inpos, out, outpos); + break; + case 19: + fastpack19(in, inpos, out, outpos); + break; + case 20: + fastpack20(in, inpos, out, outpos); + break; + case 21: + fastpack21(in, inpos, out, outpos); + break; + case 22: + fastpack22(in, inpos, out, outpos); + break; + case 23: + fastpack23(in, inpos, out, outpos); + break; + case 24: + fastpack24(in, inpos, out, outpos); + break; + case 25: + fastpack25(in, inpos, out, outpos); + break; + case 26: + fastpack26(in, inpos, out, outpos); + break; + case 27: + fastpack27(in, inpos, out, outpos); + break; + case 28: + fastpack28(in, inpos, out, outpos); + break; + case 29: + fastpack29(in, inpos, out, outpos); + break; + case 30: + fastpack30(in, inpos, out, outpos); + break; + case 31: + fastpack31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + @Override + public void 
fastpackNoMask(final int[] in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + break; + case 1: + fastpackNoMask1(in, inpos, out, outpos); + break; + case 2: + fastpackNoMask2(in, inpos, out, outpos); + break; + case 3: + fastpackNoMask3(in, inpos, out, outpos); + break; + case 4: + fastpackNoMask4(in, inpos, out, outpos); + break; + case 5: + fastpackNoMask5(in, inpos, out, outpos); + break; + case 6: + fastpackNoMask6(in, inpos, out, outpos); + break; + case 7: + fastpackNoMask7(in, inpos, out, outpos); + break; + case 8: + fastpackNoMask8(in, inpos, out, outpos); + break; + case 9: + fastpackNoMask9(in, inpos, out, outpos); + break; + case 10: + fastpackNoMask10(in, inpos, out, outpos); + break; + case 11: + fastpackNoMask11(in, inpos, out, outpos); + break; + case 12: + fastpackNoMask12(in, inpos, out, outpos); + break; + case 13: + fastpackNoMask13(in, inpos, out, outpos); + break; + case 14: + fastpackNoMask14(in, inpos, out, outpos); + break; + case 15: + fastpackNoMask15(in, inpos, out, outpos); + break; + case 16: + fastpackNoMask16(in, inpos, out, outpos); + break; + case 17: + fastpackNoMask17(in, inpos, out, outpos); + break; + case 18: + fastpackNoMask18(in, inpos, out, outpos); + break; + case 19: + fastpackNoMask19(in, inpos, out, outpos); + break; + case 20: + fastpackNoMask20(in, inpos, out, outpos); + break; + case 21: + fastpackNoMask21(in, inpos, out, outpos); + break; + case 22: + fastpackNoMask22(in, inpos, out, outpos); + break; + case 23: + fastpackNoMask23(in, inpos, out, outpos); + break; + case 24: + fastpackNoMask24(in, inpos, out, outpos); + break; + case 25: + fastpackNoMask25(in, inpos, out, outpos); + break; + case 26: + fastpackNoMask26(in, inpos, out, outpos); + break; + case 27: + fastpackNoMask27(in, inpos, out, outpos); + break; + case 28: + fastpackNoMask28(in, inpos, out, outpos); + break; + case 29: + fastpackNoMask29(in, inpos, out, outpos); + break; + case 30: + fastpackNoMask30(in, inpos, out, 
outpos); + break; + case 31: + fastpackNoMask31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + @Override + public void fastunpack(final int[] in, int inpos, final int[] out, + int outpos, int b) { + switch (b) { + case 0: + Arrays.fill(out, outpos, outpos + 256, 0); + break; + case 1: + fastunpack1(in, inpos, out, outpos); + break; + case 2: + fastunpack2(in, inpos, out, outpos); + break; + case 3: + fastunpack3(in, inpos, out, outpos); + break; + case 4: + fastunpack4(in, inpos, out, outpos); + break; + case 5: + fastunpack5(in, inpos, out, outpos); + break; + case 6: + fastunpack6(in, inpos, out, outpos); + break; + case 7: + fastunpack7(in, inpos, out, outpos); + break; + case 8: + fastunpack8(in, inpos, out, outpos); + break; + case 9: + fastunpack9(in, inpos, out, outpos); + break; + case 10: + fastunpack10(in, inpos, out, outpos); + break; + case 11: + fastunpack11(in, inpos, out, outpos); + break; + case 12: + fastunpack12(in, inpos, out, outpos); + break; + case 13: + fastunpack13(in, inpos, out, outpos); + break; + case 14: + fastunpack14(in, inpos, out, outpos); + break; + case 15: + fastunpack15(in, inpos, out, outpos); + break; + case 16: + fastunpack16(in, inpos, out, outpos); + break; + case 17: + fastunpack17(in, inpos, out, outpos); + break; + case 18: + fastunpack18(in, inpos, out, outpos); + break; + case 19: + fastunpack19(in, inpos, out, outpos); + break; + case 20: + fastunpack20(in, inpos, out, outpos); + break; + case 21: + fastunpack21(in, inpos, out, outpos); + break; + case 22: + fastunpack22(in, inpos, out, outpos); + break; + case 23: + fastunpack23(in, inpos, out, outpos); + break; + case 24: + fastunpack24(in, inpos, out, outpos); + break; + case 25: + fastunpack25(in, inpos, out, outpos); + break; + case 26: + fastunpack26(in, inpos, out, outpos); + break; + case 27: + fastunpack27(in, inpos, out, outpos); + break; + case 28: + fastunpack28(in, inpos, out, 
outpos); + break; + case 29: + fastunpack29(in, inpos, out, outpos); + break; + case 30: + fastunpack30(in, inpos, out, outpos); + break; + case 31: + fastunpack31(in, inpos, out, outpos); + break; + case 32: + System.arraycopy(in, inpos, out, outpos, BLOCK_SIZE); + break; + } + } + + static void fastpack1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = 
iV.and(MASK_1).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = 
iV.and(MASK_1).lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_1).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + 
oV.intoArray(out, outpos); + } + + static void fastunpack1(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_1).intoArray(out, 
outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 29).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 30).and(MASK_1).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 31).and(MASK_1).intoArray(out, outpos); + } + + static void fastpack2(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_2).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask2(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 
8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = 
iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack2(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; 
+ + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + iV.and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_2).intoArray(out, outpos); + 
outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_2).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 30).and(MASK_2).intoArray(out, outpos); + } + + static void fastpack3(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = 
iV.and(MASK_3).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = 
iV.and(MASK_3).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_3).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask3(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + 
iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 3-bit fields. Loads three SPECIES_256 vectors from {@code in} (at inpos, inpos + 8 and inpos + 16) and performs 32 vector stores into {@code out}, advancing outpos by VLEN_256 between stores. Fields are extracted with a lane-wise LSHR followed by an AND with MASK_3; the two fields that straddle a word boundary are stitched together from both words. MASK_3 and VLEN_256 are declared outside this chunk (presumably the 3-bit lane mask and 8; confirm). @param in array holding the packed words @param inpos index in {@code in} of the first packed word @param out array receiving the unpacked values @param outpos index in {@code out} of the first unpacked value */ static void fastunpack3(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + /* this field straddles two words: bits 30-31 of the current word are its low 2 bits, bit 0 of the next word (and(1), LSHL 2) is its high bit */ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_3).intoArray(out,
outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + /* second straddling field: bit 31 of the current word is its low bit, the next word's 2 low bits (and(3), LSHL 1) are its high bits */ oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR,
26).and(MASK_3).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 29).and(MASK_3).intoArray(out, outpos); + } + + /** Packs 4-bit fields. Loads 32 SPECIES_256 vectors from {@code in} (inpos through inpos + 248 in steps of 8), ANDs each with MASK_4, and ORs them together eight at a time at lane-wise left shifts 0, 4, ..., 28; the four resulting vectors are stored to {@code out} with outpos advanced by VLEN_256 between stores. Because eight 4-bit fields fill a 32-bit lane exactly, no field is carried across output words. MASK_4 and VLEN_256 are declared outside this chunk. @param in array holding the values to pack @param inpos index in {@code in} of the first value @param out array receiving the packed words @param outpos index in {@code out} of the first packed word */ static void fastpack4(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* each output word starts from a fresh load at shift 0 */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV =
iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in,
inpos + 240); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_4).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 4-bit fields without masking the inputs. Loads 32 SPECIES_256 vectors from {@code in} (inpos through inpos + 248 in steps of 8) and ORs them together eight at a time at lane-wise left shifts 0, 4, ..., 28; the four resulting vectors are stored to {@code out} with outpos advanced by VLEN_256 between stores. Inputs are ORed in as loaded, so any set bit above the low 4 bits of a value would overlap the neighbouring fields. VLEN_256 is declared outside this chunk. @param in array holding the values to pack @param inpos index in {@code in} of the first value @param out array receiving the packed words @param outpos index in {@code out} of the first packed word */ static void fastpackNoMask4(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* each output word starts from a fresh load at shift 0 */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV =
IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 4-bit fields. Loads four SPECIES_256 vectors from {@code in} (at inpos, inpos + 8, inpos + 16 and inpos + 24); for each one it stores eight vectors obtained by a lane-wise LSHR of 0, 4, ..., 28 followed by an AND with MASK_4, for 32 stores into {@code out} in total, with outpos advanced by VLEN_256 between stores. No field straddles two packed words. MASK_4 and VLEN_256 are declared outside this chunk. @param in array holding the packed words @param inpos index in {@code in} of the first packed word @param out array receiving the unpacked values @param outpos index in {@code out} of the first unpacked value */ static void fastunpack4(final
int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + /* all eight fields of the first packed word are out; move on to the next packed word */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos +=
VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_4).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 28).and(MASK_4).intoArray(out, outpos); + } + + /** Packs 5-bit fields. Loads 32 SPECIES_256 vectors from {@code in} (inpos through inpos + 248 in steps of 8), ANDs each with MASK_5, and ORs them at increasing lane-wise left shifts into five output vectors that are stored to {@code out}, with outpos advanced by VLEN_256 between stores. When a value does not fully fit in the current output word, its remaining high bits are carried into the next word with an LSHR. MASK_5 and VLEN_256 are declared outside this chunk. @param in array holding the values to pack @param inpos index in {@code in} of the first value @param out array receiving the packed words @param outpos index in {@code out} of the first packed word */ static void fastpack5(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV =
IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* the LSHL 30 above kept only the 2 low bits of the inpos + 48 value in the stored word; LSHR 2 carries its remaining bits into the next output word */ oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in,
inpos + 152); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_5).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 5-bit fields without masking the inputs. Loads 32 SPECIES_256 vectors from {@code in} (inpos through inpos + 248 in steps of 8) and ORs them at increasing lane-wise left shifts into five output vectors that are stored to {@code out}, with outpos advanced by VLEN_256 between stores. When a value does not fully fit in the current output word, the loaded vector shifted right (LSHR) seeds the next word. Inputs are ORed in as loaded, so any set bit above the low 5 bits of a value would overlap the neighbouring fields. VLEN_256 is declared outside this chunk. @param in array holding the values to pack @param inpos index in {@code in} of the first value @param out array receiving the packed words @param outpos index in {@code out} of the first packed word */ static void fastpackNoMask5(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + +
iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* the LSHL 30 above kept only the 2 low bits of the inpos + 48 value in the stored word; LSHR 2 carries its remaining bits into the next output word */ oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV =
IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 5-bit fields. Loads five SPECIES_256 vectors from {@code in} (at inpos, inpos + 8, inpos + 16, inpos + 24 and inpos + 32) and performs 32 vector stores into {@code out}, advancing outpos by VLEN_256 between stores. Fields are extracted with a lane-wise LSHR followed by an AND with MASK_5; the four fields that straddle a word boundary are stitched together from the top bits of one word and the low bits of the next. MASK_5 and VLEN_256 are declared outside this chunk. @param in array holding the packed words @param inpos index in {@code in} of the first packed word @param out array receiving the unpacked values @param outpos index in {@code out} of the first unpacked value */ static void fastunpack5(final int[] in, int
outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + /* straddling field: bits 30-31 of the current word are its low 2 bits, the next word's 3 low bits (and(7), LSHL 2) are its high bits */ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + /* straddling field: bits 28-31 of the current word are its low 4 bits, bit 0 of the next word (and(1), LSHL 4) is its high bit */ oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + +
iV.lanewise(VectorOperators.LSHR, 21).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_5).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 27).and(MASK_5).intoArray(out, outpos); + } + + /** Packs 6-bit fields. Loads 32 SPECIES_256 vectors from {@code in} (inpos through inpos + 248 in steps of 8), ANDs each with MASK_6, and ORs them at increasing lane-wise left shifts into six output vectors that are stored to {@code out}, with outpos advanced by VLEN_256 between stores. When a value does not fully fit in the current output word, its remaining high bits are carried into the next word with an LSHR; after three output words the layout realigns and the load at inpos + 128 starts again at shift 0. MASK_6 and VLEN_256 are declared outside this chunk. @param in array holding the values to pack @param inpos index in {@code in} of the first value @param out array receiving the packed words @param outpos index in {@code out} of the first packed word */ static void fastpack6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV =
iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* the LSHL 30 above kept only the 2 low bits of the inpos + 40 value in the stored word; LSHR 2 carries its remaining bits into the next output word */ oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV =
iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* 26 + 6 = 32: the previous word ended exactly on a field boundary, so this load starts a new word at shift 0 with nothing carried over */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV =
iV.and(MASK_6).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_6).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = 
iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack6(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += 
VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 14).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_6).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 26).and(MASK_6).intoArray(out, outpos); + } + + static void fastpack7(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = 
iV.and(MASK_7).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = 
iV.and(MASK_7).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_7).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask7(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 
10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + 
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack7(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = 
iV.and(7).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 12).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_7).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 25).and(MASK_7).intoArray(out, outpos); + } + + static void fastpack8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_8).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack8(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 48); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + iV.and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_8).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 24).and(MASK_8).intoArray(out, outpos); + } + + static void fastpack9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos 
+= VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = 
iV.and(MASK_9).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_9).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + } + + /** Same 9-bit packing schedule as {@code fastpack9} (32 SPECIES_256 loads at offsets 0, 8, ..., 248 and nine stores) but the inputs are not AND-ed with a mask before shifting, so any set bit above bit 8 of an input would be OR-ed into neighbouring fields. NOTE(review): callers presumably guarantee every value already fits in 9 bits - confirm. */ static void fastpackNoMask9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* carry the (unmasked) high bits of the straddling value into the next word */ oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, 
in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + 
oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 256 9-bit values: reads nine SPECIES_256 vectors from {@code in} (offsets 0, 8, ..., 64) and writes 32 vectors to {@code out}, advancing {@code outpos} by VLEN_256 after every store but the last. Each field is extracted with LSHR followed by MASK_9; a field split across two input words is rebuilt from the top bits of the current word OR-ed with the low bits of the next word shifted left. */ static void fastunpack9(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + /* after LSHR 27 only the low 5 bits of this field remain; the other 4 are the low bits of the next word */ var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_9).intoArray(out, outpos); + outpos 
+= VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 
10).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_9).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 23).and(MASK_9).intoArray(out, outpos); + } + + /** Packs 256 ints from {@code in[inpos..inpos+255]} at 10 bits each: 32 SPECIES_256 loads (offsets 0, 8, ..., 248) are AND-ed with MASK_10 and OR-ed into an accumulator vector that is stored to {@code out} ten times, with {@code outpos} advanced by VLEN_256 between stores. Sixteen 10-bit fields fill exactly five 32-bit words, so the shift schedule restarts from bit 0 at input offset 128. NOTE(review): MASK_10 and VLEN_256 are defined outside this view; presumably (1 << 10) - 1 per lane and 8 - confirm. */ static void fastpack10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
64); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* 22 + 10 == 32: the previous word ended exactly on a field boundary, so start a fresh accumulator with no carry */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = 
iV.and(MASK_10).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_10).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + } + + /** Same 10-bit packing schedule as {@code fastpack10} (32 SPECIES_256 loads at offsets 0, 8, ..., 248 and ten stores, restarting at offset 128) but the inputs are not AND-ed with a mask before shifting, so any set bit above bit 9 of an input would be OR-ed into neighbouring fields. NOTE(review): callers presumably guarantee every value already fits in 10 bits - confirm. */ static void fastpackNoMask10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos 
+= VLEN_256; + + /* the previous word ended exactly on a field boundary: start a fresh accumulator with no carry */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 256 10-bit values: reads ten SPECIES_256 vectors from {@code in} (offsets 0, 8, ..., 72) and writes 32 vectors to {@code out}, advancing {@code outpos} by VLEN_256 after every store but the last. Each field is extracted with LSHR followed by MASK_10; a field split across two input words is rebuilt from the top bits of the current word OR-ed with the low bits of the next word shifted left. The extraction pattern repeats after five input words. */ static void fastunpack10(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + /* after LSHR 30 only the low 2 bits of this field remain; the other 8 are the low bits of the next word */ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + iV.and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 14).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_10).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 22).and(MASK_10).intoArray(out, outpos); + } + + /** Packs 256 ints from {@code in[inpos..inpos+255]} at 11 bits each: 32 SPECIES_256 loads (offsets 0, 8, ..., 248) are AND-ed with MASK_11 and OR-ed at increasing bit offsets into an accumulator vector that is stored to {@code out} eleven times, with {@code outpos} advanced by VLEN_256 between stores. NOTE(review): MASK_11 and VLEN_256 are defined outside this view; presumably (1 << 11) - 1 per lane and 8 - confirm. */ static void fastpack11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* 22 + 11 > 32: the top bit of the value just packed did not fit, so it starts the next accumulator */ oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); 
+ outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_11).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + } + + /** Same 11-bit packing schedule as {@code fastpack11} (32 SPECIES_256 loads at offsets 0, 8, ..., 248 and eleven stores) but the inputs are not AND-ed with a mask before shifting, so any set bit above bit 10 of an input would be OR-ed into neighbouring fields. NOTE(review): callers presumably guarantee every value already fits in 11 bits - confirm. */ static void fastpackNoMask11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = 
iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* carry the (unmasked) bits of the straddling value that did not fit into the next word */ oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack11(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 
3).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_11); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_11).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 21).and(MASK_11).intoArray(out, outpos); + } + + static void fastpack12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); 
+ + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHR, 4); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_12).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack12(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + iV.and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_12).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 20).and(MASK_12).intoArray(out, outpos); + } + + static void fastpack13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = 
iV.and(MASK_13).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 3).or(oV); 
+ + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_13).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack13(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 
26).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 
10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 1).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_13).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 19).and(MASK_13).intoArray(out, outpos); + } + + /** Bit-packs 32 input vectors into 14 output vectors using one 14-bit field per value. Reads SPECIES_256 vectors from {@code in} at {@code inpos}, {@code inpos} plus 8, and so on up to {@code inpos} plus 248, ANDs each with MASK_14, shifts it to its field position and ORs it into the output vector being built; every completed vector is stored to {@code out} beginning at {@code outpos}, with the write position advanced by VLEN_256 between stores. A value that straddles a 32-bit lane boundary is emitted in two parts: LSHL puts its low bits at the top of the current word and LSHR carries its remaining high bits into the next word. The second half (inputs from {@code inpos} plus 128) repeats the shift schedule of the first half. NOTE(review): SPECIES_256, VLEN_256 and MASK_14 are declared outside this chunk; the input stride of 8 suggests 8 int lanes and MASK_14 is presumably the low-14-bit lane mask, to be confirmed. */ static void fastpack14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + /* shifted by 28, so only the low 4 bits of this value land in the current 32-bit word */ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* carry: the bits of the same input that did not fit above start the next output word */ oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos
+ 64); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = 
iV.and(MASK_14).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_14).lanewise(VectorOperators.LSHL, 
18).or(oV); + + oV.intoArray(out, outpos); + } + + /** Bit-packs 32 input vectors into 14 output vectors with the same 14-bit shift-and-OR schedule as the masked 14-bit packer, but without ANDing the inputs with a mask first. Reads SPECIES_256 vectors from {@code in} at {@code inpos} through {@code inpos} plus 248 (stride 8) and stores each completed vector to {@code out} beginning at {@code outpos}, advancing the write position by VLEN_256 between stores. Because nothing is masked, any input bits above the 14-bit field are ORed into neighbouring fields, so the packing is only lossless when every input value already fits in 14 bits. NOTE(review): SPECIES_256 and VLEN_256 are declared outside this chunk; the input stride of 8 suggests 8 int lanes, to be confirmed. */ static void fastpackNoMask14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* carry: bits of the same input that were shifted out past bit 31 above start the next output word */ oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); +
oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 14 input vectors of 14-bit fields into 32 output vectors. Reads SPECIES_256 vectors from {@code in} at {@code inpos} through {@code inpos} plus 104 (stride 8) and stores each extracted vector to {@code out} beginning at {@code outpos}, advancing the write position by VLEN_256 between stores. A field that lies inside one input word is extracted with LSHR followed by AND MASK_14; a field that straddles two input words is rebuilt by ORing the high bits left over from the previous word with the masked low bits of the next word shifted into place. The second half (inputs from {@code inpos} plus 56) repeats the schedule of the first half. NOTE(review): SPECIES_256, VLEN_256 and MASK_14 are declared outside this chunk; MASK_14 is presumably the low-14-bit lane mask, to be confirmed. */ static void fastunpack14(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + /* only the top 4 bits of this word remain; they are the low part of a straddling field */ var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + /* 1023 keeps the low 10 bits of the next word, which become bits 4..13 of the straddling field */ oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV =
IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + iV.and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_14).intoArray(out, 
outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_14).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 18).and(MASK_14).intoArray(out, outpos); + } + + /** Bit-packs 32 input vectors into 15 output vectors using one 15-bit field per value. Reads SPECIES_256 vectors from {@code in} at {@code inpos} through {@code inpos} plus 248 (stride 8), ANDs each with MASK_15, shifts it to its field position and ORs it into the output vector being built; every completed vector is stored to {@code out} beginning at {@code outpos}, with the write position advanced by VLEN_256 between stores. A value that straddles a 32-bit lane boundary is emitted in two parts: LSHL puts its low bits at the top of the current word and LSHR carries its remaining high bits into the next word. NOTE(review): SPECIES_256, VLEN_256 and MASK_15 are declared outside this chunk; the input stride of 8 suggests 8 int lanes and MASK_15 is presumably the low-15-bit lane mask, to be confirmed. */ static void fastpack15(final int[] in, int inpos, final int[] out, int outpos) { + var iV =
IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 
5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + } + + /** Bit-packs 32 input vectors into 15 output vectors with the same 15-bit shift-and-OR schedule as the masked 15-bit packer, but without ANDing the inputs with a mask first. Reads SPECIES_256 vectors from {@code in} at {@code inpos} through {@code inpos} plus 248 (stride 8) and stores each completed vector to {@code out} beginning at {@code outpos}, advancing the write position by VLEN_256 between stores. Because nothing is masked, any input bits above the 15-bit field are ORed into neighbouring fields, so the packing is only lossless when every input value already fits in 15 bits. NOTE(review): SPECIES_256 and VLEN_256 are declared outside this chunk; the input stride of 8 suggests 8 int lanes, to be confirmed. */ static void fastpackNoMask15(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + +
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV 
= iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 
21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 15 input vectors of 15-bit fields into 32 output vectors. Reads SPECIES_256 vectors from {@code in} at {@code inpos} through {@code inpos} plus 112 (stride 8) and stores each extracted vector to {@code out} beginning at {@code outpos}, advancing the write position by VLEN_256 between stores. A field that lies inside one input word is extracted with LSHR followed by AND MASK_15; a field that straddles two input words is rebuilt by ORing the high bits left over from the previous word with the masked low bits of the next word shifted into place. NOTE(review): SPECIES_256, VLEN_256 and MASK_15 are declared outside this chunk; MASK_15 is presumably the low-15-bit lane mask, to be confirmed. */ static void fastunpack15(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + /* only the top 2 bits of this word remain; they are the low part of a straddling field */ var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + /* 8191 keeps the low 13 bits of the next word, which become bits 2..14 of the straddling field */ oV = iV.and(8191).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_15).intoArray(out, outpos); +
outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_15).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 17).and(MASK_15).intoArray(out, outpos); + } + + static void fastpack16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = 
IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = 
iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 240); + oV = iV.and(MASK_16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_16).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + 
oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = 
iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack16(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, 
in, inpos + 64); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + iV.and(MASK_16).intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 16).and(MASK_16).intoArray(out, outpos); + } + + static void fastpack17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = 
iV.and(MASK_17).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 5); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_17).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 
3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack17(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
24); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 4).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_17).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 15).and(MASK_17).intoArray(out, outpos); + } + + static void fastpack18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 152); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 
10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_18).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack18(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 
18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = 
iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + iV.and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_18); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_18).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 14).and(MASK_18).intoArray(out, outpos); + } + + static void fastpack19(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_19).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 9); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = 
iV.and(MASK_19).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_19).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask19(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = 
iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV 
= iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack19(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = 
iV.and(262143).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_19); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 1).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_19).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 13).and(MASK_19).intoArray(out, outpos); + } + + static void fastpack20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 
24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
224); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_20).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
56); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 
12).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack20(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 
28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 112); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + iV.and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_20).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 12).and(MASK_20).intoArray(out, outpos); + } + + static void fastpack21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 11); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = 
iV.and(MASK_21).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 7); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_21).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + } + + static void 
fastpackNoMask21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = 
iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack21(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_21); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 28).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 
14).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_21).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 11).and(MASK_21).intoArray(out, outpos); + } + + static void fastpack22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = 
iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); 
+ oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHR, 12); + + iV 
= IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_22).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 32 vectors loaded from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}) into 22 vectors stored to {@code out}, giving every value a 22-bit slot in its lane. Inputs are shifted and OR-ed together without any masking, so bits above bit 21 of an input would overlap the neighbouring slot; presumably callers guarantee that every value already fits in 22 bits (TODO confirm against call sites). The shift sequence restarts with the load at {@code inpos + 128}, so the two halves of the input are packed independently. {@code outpos} is advanced by {@code VLEN_256} after every store except the final one. @param in source values @param inpos index of the first value read from {@code in} @param out destination of the packed words @param outpos index of the first word written to {@code out} */ static void fastpackNoMask22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; /* first value lands in the low bits of the first output word, unshifted */ + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* the 10 low bits of this input went into the word just stored; carry the rest into the next word */ oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV =
iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); 
+ + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + } + + /** Decodes the 22-bit layout written by {@code fastpackNoMask22}: reads 22 vectors from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 168}) and stores 32 vectors of unpacked values to {@code out}, advancing {@code outpos} by {@code VLEN_256} after every store except the final one. A value lying entirely inside one packed word is extracted with a logical right shift followed by {@code MASK_22}; a value straddling two words is rebuilt by OR-ing the high bits of the first word with the low bits of the next word shifted left into place. {@code MASK_22} is defined outside this view, presumably as a broadcast of {@code (1 << 22) - 1} (confirm). @param in packed words @param inpos index of the first word read from {@code in} @param out destination of the unpacked values @param outpos index of the first value written to {@code out} */ static void fastunpack22(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256,
in, inpos); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + iV.and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = 
iV.and(15).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_22).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 10).and(MASK_22).intoArray(out, outpos); + } + + /** Packs 32 vectors loaded from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}) into 23 vectors stored to {@code out}, giving every value a 23-bit slot in its lane. Each input is AND-ed with {@code MASK_23} before it is shifted into place, so stray high bits cannot leak into a neighbouring slot; {@code MASK_23} is defined outside this view, presumably as a broadcast of {@code (1 << 23) - 1} (confirm). When a value does not fit in the bits left in the current word, its low part is OR-ed in with a left shift, the word is stored, and the remaining high part seeds the next word via a logical right shift. {@code outpos} is advanced by {@code VLEN_256} after every store except the final one. @param in source values @param inpos index of the first value read from {@code in} @param out destination of the packed words @param outpos index of the first word written to {@code out} */ static void fastpack23(final int[] in, int inpos, final int[] out, int outpos) { +
var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 17); 
+ + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_23).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 27).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_23).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 32 vectors loaded from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}) into 23 vectors stored to {@code out}, giving every value a 23-bit slot in its lane. Uses the same shift sequence as {@code fastpack23} but applies no mask, so bits above bit 22 of an input would overlap the neighbouring slot; presumably callers guarantee that every value already fits in 23 bits (TODO confirm against call sites). {@code outpos} is advanced by {@code VLEN_256} after every store except the final one. @param in source values @param inpos index of the first value read from {@code in} @param out destination of the packed words @param outpos index of the first word written to {@code out} */ static void fastpackNoMask23(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; /* first value lands in the low bits of the first output word, unshifted */ + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* 9 low bits of this input went into the word just stored; carry the rest into the next word */ oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256,
in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + 
+ iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + } + + /** Decodes the 23-bit layout written by {@code fastpackNoMask23}: reads 23 vectors from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 176}) and stores 32 vectors of unpacked values to {@code out}, advancing {@code outpos} by {@code VLEN_256} after every store except the final one. A value lying entirely inside one packed word is extracted with a logical right shift followed by {@code MASK_23}; a value straddling two words is rebuilt by OR-ing the high bits of the first word with the low bits of the next word shifted left into place. {@code MASK_23} is defined outside this view, presumably as a broadcast of {@code (1 << 23) - 1} (confirm). @param in packed words @param inpos index of the first word read from {@code in} @param out destination of the unpacked values @param outpos index of the first value written to {@code out} */ static void fastunpack23(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + /* the first value sits in the low bits of the first word */ iV.and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + /* 9 bits came from the previous word; the other 14 are the low bits of this one */ oV = iV.and(16383).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV =
iV.lanewise(VectorOperators.LSHR, 24).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_23); 
+ + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 19).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_23).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 9).and(MASK_23).intoArray(out, outpos); + } + + /** Packs 32 vectors loaded from {@code in} (at {@code inpos}, {@code inpos + 8}, ..., {@code inpos + 248}) into 24 vectors stored to {@code out}, giving every value a 24-bit slot in its lane. Four consecutive inputs fill exactly three output words (left shifts 0 and 24, then 16, then 8, with right shifts 8 and 16 carrying the leftover high bits), and that four-in/three-out group is repeated eight times. Each input is AND-ed with {@code MASK_24} before it is shifted; {@code MASK_24} is defined outside this view, presumably as a broadcast of {@code (1 << 24) - 1} (confirm). {@code outpos} is advanced by {@code VLEN_256} after every store except the final one. @param in source values @param inpos index of the first value read from {@code in} @param out destination of the packed words @param outpos index of the first word written to {@code out} */ static void fastpack24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + /* three words are now completely filled, so the next group of four inputs starts a fresh word with no carry */ iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out,
outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += 
VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_24).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 256 ints read from {@code in} at {@code inpos} into 24-bit fields written to {@code out} at {@code outpos}, without masking the inputs first. Thirty-two vectors are loaded at offsets inpos, inpos+8, ..., inpos+248 and twenty-four vectors are stored; every four consecutive loads collapse into three stores using the shift pairs (none, LSHL 24), (LSHR 8, LSHL 16) and (LSHR 16, LSHL 8). Because nothing is ANDed with a mask, any bits above bit 23 of an input are ORed into the neighbouring fields; NOTE(review): presumably callers guarantee each value fits in 24 bits - confirm. outpos advances by VLEN_256 after every store except the last. */ static void fastpackNoMask24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 24-bit fields from {@code in} at {@code inpos} back into one int per lane in {@code out} at {@code outpos}. Twenty-four vectors are loaded at offsets inpos, inpos+8, ..., inpos+184 and thirty-two vectors are stored; each run of three loads yields four stores: the first word ANDed with MASK_24, its top 8 bits joined with the low 16 bits of the second word (AND 65535, LSHL 8), the top 16 bits of the second word joined with the low 8 bits of the third (AND 255, LSHL 16), and the third word shifted right by 8. NOTE(review): MASK_24 is assumed to be (1 << 24) - 1 in every lane - its declaration is outside this view. outpos advances by VLEN_256 after every store except the last. */ static void fastunpack24(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = 
iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + iV.and(MASK_24).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 8).and(MASK_24).intoArray(out, outpos); + } + + /** Packs 256 ints read from {@code in} at {@code inpos} into 25-bit fields written to {@code out} at {@code outpos}, ANDing every loaded vector with MASK_25 before it is shifted. Thirty-two vectors are loaded at offsets inpos, inpos+8, ..., inpos+248 and twenty-five are stored. The accumulator oV is filled by ORing each masked input shifted left by the bit position reached so far; when an input straddles the 32-bit boundary the accumulator is stored and restarted from that same input shifted right (LSHL 25 is followed by LSHR 7, LSHL 18 by LSHR 14, and so on - each pair sums to 32). NOTE(review): MASK_25 is assumed to be (1 << 25) - 1 in every lane - its declaration is outside this view. outpos advances by VLEN_256 after every store except the last. */ static void fastpack25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_25); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = 
iV.and(MASK_25).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 27).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_25).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_25).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + } + + /** Packs 256 ints read from {@code in} at {@code inpos} into 25-bit fields written to {@code out} at {@code outpos}, using the inputs exactly as loaded (no AND against a mask). Thirty-two vectors are loaded at offsets inpos, inpos+8, ..., inpos+248 and twenty-five are stored; an input that straddles the 32-bit boundary is ORed in with LSHL, the accumulator is stored, and the next accumulator starts from the same input with the complementary LSHR (the two shift counts sum to 32). Bits above bit 24 of an input are therefore ORed into adjacent fields; NOTE(review): presumably callers guarantee each value fits in 25 bits - confirm. outpos advances by VLEN_256 after every store except the last. */ static void fastpackNoMask25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = 
iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + } + + /** Unpacks 25-bit fields from {@code in} at {@code inpos} into one int per lane in {@code out} at {@code outpos}. Twenty-five vectors are loaded at offsets inpos, inpos+8, ..., inpos+192 and thirty-two are stored. A field that straddles two packed words is rebuilt from the high bits of the current word (LSHR) ORed with the low k bits of the next word (AND with the literal (1 << k) - 1, e.g. 262143 for k = 18, then LSHL by 25 - k); a field lying wholly inside one word is produced by a single LSHR followed by AND MASK_25. NOTE(review): MASK_25 is assumed to be (1 << 25) - 1 in every lane - its declaration is outside this view. outpos advances by VLEN_256 after every store except the last. */ static void fastunpack25(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = 
iV.and(4194303).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 23).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = 
iV.and(131071).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_25).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 7).and(MASK_25).intoArray(out, outpos); + } + + static void fastpack26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 
20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 24); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = 
iV.and(MASK_26).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_26).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + 
oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack26(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 
22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + iV.and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_26); + + 
iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_26).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_26).intoArray(out, 
outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 6).and(MASK_26).intoArray(out, outpos); + } + + static void fastpack27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos 
+ 32); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 31).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = 
iV.and(MASK_27).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHR, 22); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_27).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = 
iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = 
iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack27(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_27).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_256; + 
+ oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
iV.lanewise(VectorOperators.LSHR, 3).and(MASK_27).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 5).and(MASK_27).intoArray(out, outpos); + } + + static void fastpack28(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = 
iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 8).or(oV); + 
+ oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_28).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask28(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 
72); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + 
+ oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack28(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + iV.and(MASK_28).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 
8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 4).and(MASK_28).intoArray(out, outpos); + } + + static void fastpack29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 9); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 28); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_29).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_29).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); 
+ + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = 
iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack29(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_29).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_29); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_29); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(8191).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_29).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_29); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = 
iV.and(7).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 3).and(MASK_29).intoArray(out, outpos); + } + + static void fastpack30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_30); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = 
iV.and(MASK_30).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_30).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = 
iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos 
+= VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 
10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = 
iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack30(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + outpos += VLEN_256; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + iV.and(MASK_30).intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = 
iV.and(268435455).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, 
outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 2).and(MASK_30).intoArray(out, outpos); + } + + static void fastpack31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV.and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + 
outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 19); 
+ + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = 
iV.and(MASK_31).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.and(MASK_31).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastpackNoMask31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + var oV = iV; + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.lanewise(VectorOperators.LSHL, 31).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 1); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 5); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14); + + iV = IntVector.fromArray(SPECIES_256, in, 
inpos + 120); + oV = iV.lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = iV.lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.lanewise(VectorOperators.LSHL, 8).or(oV); + + 
oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 30); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 248); + oV = iV.lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + } + + static void fastunpack31(final int[] in, int inpos, final int[] out, int outpos) { + var iV = IntVector.fromArray(SPECIES_256, in, inpos); + iV.and(MASK_31).intoArray(out, outpos); + outpos += VLEN_256; + + var oV = iV.lanewise(VectorOperators.LSHR, 31).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 8); + oV = iV.and(1073741823).lanewise(VectorOperators.LSHL, 1).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = 
iV.lanewise(VectorOperators.LSHR, 30).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 16); + oV = iV.and(536870911).lanewise(VectorOperators.LSHL, 2).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 29).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 24); + oV = iV.and(268435455).lanewise(VectorOperators.LSHL, 3).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 28).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 32); + oV = iV.and(134217727).lanewise(VectorOperators.LSHL, 4).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 27).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 40); + oV = iV.and(67108863).lanewise(VectorOperators.LSHL, 5).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 26).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 48); + oV = iV.and(33554431).lanewise(VectorOperators.LSHL, 6).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 25).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 56); + oV = iV.and(16777215).lanewise(VectorOperators.LSHL, 7).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 24).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 64); + oV = iV.and(8388607).lanewise(VectorOperators.LSHL, 8).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 23).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 72); + oV = iV.and(4194303).lanewise(VectorOperators.LSHL, 9).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 22).and(MASK_31); + + iV = 
IntVector.fromArray(SPECIES_256, in, inpos + 80); + oV = iV.and(2097151).lanewise(VectorOperators.LSHL, 10).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 21).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 88); + oV = iV.and(1048575).lanewise(VectorOperators.LSHL, 11).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 20).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 96); + oV = iV.and(524287).lanewise(VectorOperators.LSHL, 12).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 19).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 104); + oV = iV.and(262143).lanewise(VectorOperators.LSHL, 13).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 18).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 112); + oV = iV.and(131071).lanewise(VectorOperators.LSHL, 14).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 17).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 120); + oV = iV.and(65535).lanewise(VectorOperators.LSHL, 15).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 16).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 128); + oV = iV.and(32767).lanewise(VectorOperators.LSHL, 16).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 15).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 136); + oV = iV.and(16383).lanewise(VectorOperators.LSHL, 17).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 14).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 144); + oV = 
iV.and(8191).lanewise(VectorOperators.LSHL, 18).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 13).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 152); + oV = iV.and(4095).lanewise(VectorOperators.LSHL, 19).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 12).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 160); + oV = iV.and(2047).lanewise(VectorOperators.LSHL, 20).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 11).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 168); + oV = iV.and(1023).lanewise(VectorOperators.LSHL, 21).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 10).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 176); + oV = iV.and(511).lanewise(VectorOperators.LSHL, 22).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 9).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 184); + oV = iV.and(255).lanewise(VectorOperators.LSHL, 23).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 8).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 192); + oV = iV.and(127).lanewise(VectorOperators.LSHL, 24).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 7).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 200); + oV = iV.and(63).lanewise(VectorOperators.LSHL, 25).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 6).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 208); + oV = iV.and(31).lanewise(VectorOperators.LSHL, 26).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + 
oV = iV.lanewise(VectorOperators.LSHR, 5).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 216); + oV = iV.and(15).lanewise(VectorOperators.LSHL, 27).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 4).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 224); + oV = iV.and(7).lanewise(VectorOperators.LSHL, 28).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 3).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 232); + oV = iV.and(3).lanewise(VectorOperators.LSHL, 29).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + oV = iV.lanewise(VectorOperators.LSHR, 2).and(MASK_31); + + iV = IntVector.fromArray(SPECIES_256, in, inpos + 240); + oV = iV.and(1).lanewise(VectorOperators.LSHL, 30).or(oV); + + oV.intoArray(out, outpos); + outpos += VLEN_256; + + iV.lanewise(VectorOperators.LSHR, 1).and(MASK_31).intoArray(out, outpos); + } + +} diff --git a/src/main/java/me/lemire/integercompression/vector/VectorBitPackerKernels.java b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerKernels.java new file mode 100644 index 0000000..9179827 --- /dev/null +++ b/src/main/java/me/lemire/integercompression/vector/VectorBitPackerKernels.java @@ -0,0 +1,78 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import jdk.incubator.vector.IntVector; + +/** + * Width-specific vectorized bit-packing kernels for a 256-integer block. + * Implemented by VectorBitPacker (512-bit lanes), VectorBitPacker256 (256-bit + * lanes) and VectorBitPacker128 (128-bit lanes). The packed layout differs per + * width, so a stream is decoded by the same kernel that packed it. 
+ */ +public interface VectorBitPackerKernels { + + void fastpack(int[] in, int inpos, int[] out, int outpos, int b); + + void fastpackNoMask(int[] in, int inpos, int[] out, int outpos, int b); + + void fastunpack(int[] in, int inpos, int[] out, int outpos, int b); + + /** + * Hardware vector lane width a stream was packed for. The packed byte layout + * differs per width and is not interchangeable across widths, so the stream + * carries its width (as {@link #code}) and is decoded by {@link #kernel}. + */ + enum LaneWidth { + BITS_128(0, 128, new VectorBitPacker128()), + BITS_256(1, 256, new VectorBitPacker256()), + BITS_512(2, 512, new VectorBitPacker()); + + /** Compact wire tag stored in the stream (fits in 2 bits). */ + public final int code; + /** Native vector lane width in bits. */ + public final int bits; + /** Kernel that packs and unpacks at this width. */ + public final VectorBitPackerKernels kernel; + + LaneWidth(int code, int bits, VectorBitPackerKernels kernel) { + this.code = code; + this.bits = bits; + this.kernel = kernel; + } + + /** Width whose kernel runs natively on this machine (encode default). */ + public static final LaneWidth PREFERRED = + forHost(IntVector.SPECIES_PREFERRED.vectorBitSize()); + + /** Largest kernel width that runs natively on a machine of {@code hostBits}. */ + public static LaneWidth forHost(int hostBits) { + LaneWidth best = null; + for (LaneWidth width : values()) { + if (width.bits <= hostBits && (best == null || width.bits > best.bits)) { + best = width; + } + } + if (best == null) { + throw new IllegalStateException( + "no vector bit-packing kernel fits this machine's preferred vector width of " + + hostBits + " bits"); + } + return best; + } + + /** Maps a stream wire tag back to its width. 
*/ + public static LaneWidth fromCode(int code) { + for (LaneWidth width : values()) { + if (width.code == code) { + return width; + } + } + throw new IllegalArgumentException("unknown vector lane-width tag " + code); + } + } +} diff --git a/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java index 7374fa5..cc755c8 100644 --- a/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java +++ b/src/main/java/me/lemire/integercompression/vector/VectorFastPFOR.java @@ -12,6 +12,7 @@ import me.lemire.integercompression.IntegerCODEC; import me.lemire.integercompression.SkippableIntegerCODEC; import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.vector.VectorBitPackerKernels.LaneWidth; /** * This is a patching scheme designed for speed. @@ -41,16 +42,33 @@ * For multi-threaded applications, each thread should use its own FastPFOR * object. * + * Blocks are packed in a vectorized layout that differs by hardware vector + * lane width, so each stream is tagged with the width it was packed for and is + * decoded by the matching kernel. Decoding requires a machine whose preferred + * vector width is at least the stream's; a narrower machine fails fast rather + * than emulating. The default constructor packs at this machine's preferred + * width; the {@code (int, LaneWidth)} constructor pins a width so a + * heterogeneous cluster can decode on its narrowest node. 
+ * * @author Daniel Lemire */ public class VectorFastPFOR implements IntegerCODEC, SkippableIntegerCODEC { private final static int OVERHEAD_OF_EACH_EXCEPT = 8; + private static final int OVERHEAD_OF_EACH_PAGE_IN_INTS = 36; + private static final int OVERHEAD_OF_EACH_BLOCK_IN_INTS = 1; public final static int DEFAULT_PAGE_SIZE = 64 << 10; public final static int BLOCK_SIZE = 256; private final static int INTS_PER_BLOCK = BLOCK_SIZE >>> 5; + // The page header word holds the metadata offset in its low 30 bits and the + // packing lane-width tag in its top 2 bits. + private final static int WIDTH_SHIFT = 30; + private final static int WHEREMETA_MASK = (1 << WIDTH_SHIFT) - 1; + private final int pageSize; + private final LaneWidth encodeWidth; + private final VectorBitPackerKernels encoder; private final int[][] dataTobePacked = new int[33][]; private int[] exceptData = null; @@ -64,9 +82,18 @@ public class VectorFastPFOR implements IntegerCODEC, SkippableIntegerCODEC { * @param pagesize * the desired page size (recommended value is * FastPFOR.DEFAULT_PAGE_SIZE) + * @param encodeWidth + * the vector lane width to pack with. Use + * {@link LaneWidth#PREFERRED} for this machine's fastest layout, or pin a + * cluster to its narrowest node's width so every node can decode the stream. */ - private VectorFastPFOR(int pagesize) { + public VectorFastPFOR(int pagesize, LaneWidth encodeWidth) { + if (pagesize >= (1 << WIDTH_SHIFT)) + throw new IllegalArgumentException("page size must be smaller than " + + (1 << WIDTH_SHIFT)); pageSize = pagesize; + this.encodeWidth = encodeWidth; + this.encoder = encodeWidth.kernel; // Initiate arrrays. bem = new byte[3 * pageSize / BLOCK_SIZE + pagesize]; for (int k = 1; k < dataTobePacked.length; ++k) @@ -75,9 +102,10 @@ private VectorFastPFOR(int pagesize) { } /** - * Construct the fastPFOR CODEC with default parameters. + * Construct the fastPFOR CODEC with default parameters, packing with this + * machine's preferred vector lane width. 
*/ - public VectorFastPFOR() { this(DEFAULT_PAGE_SIZE); } + public VectorFastPFOR() { this(DEFAULT_PAGE_SIZE, LaneWidth.PREFERRED); } /** * Compress data in blocks of BLOCK_SIZE integers (if fewer than BLOCK_SIZE @@ -165,11 +193,11 @@ private void encodePage(int[] in, IntWrapper inpos, int thissize, int[] out, } else { bindex += 2; } - VectorBitPacker.fastpack(in, tmpinpos, out, tmpoutpos, tmpbestb); + encoder.fastpack(in, tmpinpos, out, tmpoutpos, tmpbestb); tmpoutpos += INTS_PER_BLOCK * tmpbestb; } inpos.set(tmpinpos); - out[headerpos] = tmpoutpos - headerpos; + out[headerpos] = (tmpoutpos - headerpos) | (encodeWidth.code << WIDTH_SHIFT); int bytesize = bindex; out[tmpoutpos++] = bytesize; @@ -196,13 +224,13 @@ private void encodePage(int[] in, IntWrapper inpos, int thissize, int[] out, int j = 0; int n = (dataPointers[k] / BLOCK_SIZE) * BLOCK_SIZE; for (; j < n; j += BLOCK_SIZE) { - VectorBitPacker.fastpackNoMask(dataTobePacked[k], j, out, tmpoutpos, + encoder.fastpackNoMask(dataTobePacked[k], j, out, tmpoutpos, k); tmpoutpos += INTS_PER_BLOCK * k; } int r = dataPointers[k] % BLOCK_SIZE; if (r != 0) { - tmpoutpos = VectorBitPacker.slowpack(dataTobePacked[k], j, r, out, + tmpoutpos = slowpack(dataTobePacked[k], j, r, out, tmpoutpos, k); tmpoutpos++; } @@ -231,7 +259,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, @Override public int maxHeadlessCompressedLength(IntWrapper compressedPositions, int inlength) { - throw new UnsupportedOperationException("Calculating the max compressed length is not supported yet."); + inlength = inlength - inlength % BLOCK_SIZE; + int pageCount = (inlength + pageSize - 1) / pageSize; + int blockCount = inlength / BLOCK_SIZE; + int blockSizeInInts = OVERHEAD_OF_EACH_BLOCK_IN_INTS + BLOCK_SIZE; + return OVERHEAD_OF_EACH_PAGE_IN_INTS * pageCount + blockSizeInInts * blockCount + 24; } private void loadMetaData(int[] in, int inexcept, int bytesize) { @@ -246,10 +278,29 @@ private void loadMetaData(int[] in, 
int inexcept, int bytesize) { } } + /** + * Rejects a stream packed for wider lanes than this machine runs natively. + * Decoding it would silently fall back to scalar emulation; failing loud lets + * the caller re-encode the cluster at a lower width instead. + */ + static void checkDecodable(LaneWidth streamWidth, LaneWidth hostWidth) { + if (streamWidth.bits > hostWidth.bits) { + throw new IllegalStateException( + "VectorFastPFOR stream was packed for " + streamWidth.bits + + "-bit vector lanes, but this machine runs at most " + hostWidth.bits + + "-bit lanes natively. Re-encode with lanes <= " + hostWidth.bits + + " bits."); + } + } + private void decodePage(int[] in, IntWrapper inpos, int[] out, IntWrapper outpos, int thissize) { final int initpos = inpos.get(); - final int wheremeta = in[inpos.get()]; + final int header = in[inpos.get()]; + final int wheremeta = header & WHEREMETA_MASK; + final LaneWidth streamWidth = LaneWidth.fromCode(header >>> WIDTH_SHIFT); + checkDecodable(streamWidth, LaneWidth.PREFERRED); + final VectorBitPackerKernels decoder = streamWidth.kernel; inpos.increment(); int inexcept = initpos + wheremeta; @@ -268,11 +319,11 @@ private void decodePage(int[] in, IntWrapper inpos, int[] out, int j = 0; int len = (size / BLOCK_SIZE) * BLOCK_SIZE; for (; j < len; j += BLOCK_SIZE) { - VectorBitPacker.fastunpack(in, inexcept, dataTobePacked[k], j, k); + decoder.fastunpack(in, inexcept, dataTobePacked[k], j, k); inexcept += INTS_PER_BLOCK * k; } int r = size % BLOCK_SIZE; - inexcept = VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k], + inexcept = slowunpack(in, inexcept, dataTobePacked[k], j, r, k); } else { int j = 0; @@ -282,12 +333,12 @@ private void decodePage(int[] in, IntWrapper inpos, int[] out, System.arraycopy(in, inexcept, buf, 0, in.length - inexcept); int l = (size / BLOCK_SIZE) * BLOCK_SIZE; for (; j < l; j += BLOCK_SIZE) { - VectorBitPacker.fastunpack(buf, inexcept - initinexcept, + decoder.fastunpack(buf, inexcept - 
initinexcept, dataTobePacked[k], j, k); inexcept += INTS_PER_BLOCK * k; } int r = size % BLOCK_SIZE; - inexcept = VectorBitPacker.slowunpack(in, inexcept, dataTobePacked[k], + inexcept = slowunpack(in, inexcept, dataTobePacked[k], j, r, k); } } @@ -300,7 +351,7 @@ private void decodePage(int[] in, IntWrapper inpos, int[] out, ++run, tmpoutpos += BLOCK_SIZE) { final int b = bem[idx]; // byteContainer.get(); final int cexcept = bem[idx + 1] & 0xFF; // byteContainer.get() & 0xFF; - VectorBitPacker.fastunpack(in, tmpinpos, out, tmpoutpos, b); + decoder.fastunpack(in, tmpinpos, out, tmpoutpos, b); tmpinpos += INTS_PER_BLOCK * b; if (cexcept > 0) { final int maxbits = bem[idx + 2]; // byteContainer.get(); @@ -363,4 +414,73 @@ public String toString() { protected ByteBuffer makeBuffer(int sizeInBytes) { return ByteBuffer.allocateDirect(sizeInBytes); } + + /** + * Packs the sub-block exception remainder, which is not a multiple of the + * vector block size, into the sequential scalar layout read back by + * {@link #slowunpack}. Zeroes its target words first, then OR-accumulates the + * packed bits, so a reused output buffer carries no stale bits. + */ + private static int slowpack(final int[] in, int inpos, int inlen, + final int[] out, int outpos, int b) { + if (inlen == 0) + return outpos; + if (b == 32) { + System.arraycopy(in, inpos, out, outpos, inlen); + return outpos + inlen; + } + int mask = (1 << b) - 1; + Arrays.fill(out, outpos, outpos + (inlen * b + 31) / 32, 0); + int c = 0; + int l = 0; + int r = 0; + int val = 0; + for (int i = 0; i < inlen; i++) { + val = in[inpos + i] & mask; + out[outpos] |= val << (c + r); + c += b; + l = (32 - r) % b; + if (c + r >= 32) { + if (i < inlen - 1 || l != 0) + outpos++; + r = l == 0 ? 0 : b - l; + if (l != 0) + out[outpos] = val >> (b - r); + c = 0; + } + } + return outpos; + } + + /** Reverses {@link #slowpack}. 
*/ + private static int slowunpack(final int[] in, int inpos, final int[] out, + int outpos, int outlen, int b) { + if (outlen == 0) { + return inpos; + } + if (b == 32) { + System.arraycopy(in, inpos, out, outpos, outlen); + return inpos + outlen; + } + int mask = (1 << b) - 1; + int limit = outpos + outlen; + int r = 0; + int val = 0; + int i = 0; + for (; outpos < limit; i++) { + if (r > 0) + out[outpos++] = + (val >>> (32 - (b - r))) | ((in[inpos + i] << (b - r)) & mask); + val = in[inpos + i]; + int j = 0; + int l = 32 - r; + int ll = l % b == 0 ? l : l - b; + while (j < ll && outpos < limit) { + out[outpos++] = (val >> (j + r)) & mask; + j += b; + } + r = l % b == 0 ? 0 : b - (l % b); + } + return inpos + i; + } } diff --git a/src/main/java/me/lemire/integercompression/vector/VectorIntBitPackBenchmark.java b/src/main/java/me/lemire/integercompression/vector/VectorIntBitPackBenchmark.java new file mode 100644 index 0000000..ee69c74 --- /dev/null +++ b/src/main/java/me/lemire/integercompression/vector/VectorIntBitPackBenchmark.java @@ -0,0 +1,103 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import java.text.DecimalFormat; +import java.util.Random; + +import me.lemire.integercompression.BitPacking; + +/** + * Benchmarks the vectorized int bit-packing kernels against the scalar + * unrolled {@link BitPacking}, packing 256-integer blocks of each width. + * Speeds are millions of integers per second. (For expert use; requires + * --add-modules jdk.incubator.vector.) 
+ */ +public class VectorIntBitPackBenchmark { + + private static final int BLOCK = 256; + private static final int SUBBLOCKS = BLOCK / 32; + + private static void scalarPack(int[] in, int[] out, int bit) { + for (int blk = 0; blk < SUBBLOCKS; blk++) { + BitPacking.fastpackwithoutmask(in, blk * 32, out, blk * bit, bit); + } + } + + private static void scalarUnpack(int[] in, int[] out, int bit) { + for (int blk = 0; blk < SUBBLOCKS; blk++) { + BitPacking.fastunpack(in, blk * bit, out, blk * 32, bit); + } + } + + private static void test(boolean verbose) { + DecimalFormat df = new DecimalFormat("0"); + final int times = 100000; + Random r = new Random(0); + int[] data = new int[BLOCK]; + int[] compressed = new int[8 * 32]; + int[] uncompressed = new int[BLOCK]; + VectorBitPacker256 vec256 = new VectorBitPacker256(); + VectorBitPacker vec = new VectorBitPacker(); + + for (int bit = 1; bit <= 32; ++bit) { + int mask = bit == 32 ? -1 : (1 << bit) - 1; + long scalarComp = 0; + long scalarDecomp = 0; + long vec256Comp = 0; + long vec256Decomp = 0; + long vecComp = 0; + long vecDecomp = 0; + for (int t = 0; t < times; ++t) { + for (int k = 0; k < BLOCK; ++k) { + data[k] = r.nextInt() & mask; + } + long time1 = System.nanoTime(); + scalarPack(data, compressed, bit); + long time2 = System.nanoTime(); + scalarUnpack(compressed, uncompressed, bit); + long time3 = System.nanoTime(); + vec256.fastpackNoMask(data, 0, compressed, 0, bit); + long time4 = System.nanoTime(); + vec256.fastunpack(compressed, 0, uncompressed, 0, bit); + long time5 = System.nanoTime(); + vec.fastpackNoMask(data, 0, compressed, 0, bit); + long time6 = System.nanoTime(); + vec.fastunpack(compressed, 0, uncompressed, 0, bit); + long time7 = System.nanoTime(); + scalarComp += time2 - time1; + scalarDecomp += time3 - time2; + vec256Comp += time4 - time3; + vec256Decomp += time5 - time4; + vecComp += time6 - time5; + vecDecomp += time7 - time6; + } + if (verbose) { + double sc = BLOCK * times * 1000.0; + 
System.out.println("bit = " + bit + + " | scalar comp = " + df.format(sc / scalarComp) + + " vec256 comp = " + df.format(sc / vec256Comp) + + " vec comp = " + df.format(sc / vecComp) + + " | scalar decomp = " + df.format(sc / scalarDecomp) + + " vec256 decomp = " + df.format(sc / vec256Decomp) + + " vec decomp = " + df.format(sc / vecDecomp)); + } + } + } + + /** + * Main method. + * + * @param args command-line arguments + */ + public static void main(String[] args) { + System.out.println("Testing int packing (scalar vs VectorBitPacker256 vs " + + "VectorBitPacker), 256-int blocks, speeds in millions of ints/s"); + test(false); + test(true); + } +} diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index f134601..4d652dd 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -2,11 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 module me.lemire.integercompression { - // This is currently only for advanced users: - // requires jdk.incubator.vector; + // Optional at runtime: only consumers of the vector package (VectorFastPFOR) + // need jdk.incubator.vector resolved (e.g. --add-modules jdk.incubator.vector). + // Scalar consumers resolve without it. 
+ requires static jdk.incubator.vector; exports me.lemire.integercompression; exports me.lemire.longcompression; exports me.lemire.longcompression.differential; exports me.lemire.integercompression.differential; - // exports me.lemire.integercompression.vector; + exports me.lemire.integercompression.vector; } diff --git a/src/test/java/me/lemire/integercompression/BasicTest.java b/src/test/java/me/lemire/integercompression/BasicTest.java index b29ae0d..6743017 100644 --- a/src/test/java/me/lemire/integercompression/BasicTest.java +++ b/src/test/java/me/lemire/integercompression/BasicTest.java @@ -17,6 +17,7 @@ import me.lemire.integercompression.differential.IntegratedVariableByte; import me.lemire.integercompression.differential.XorBinaryPacking; import me.lemire.integercompression.synth.ClusteredDataGenerator; +import me.lemire.integercompression.vector.VectorFastPFOR; import org.junit.Test; @@ -43,6 +44,7 @@ public class BasicTest { new Composition(new OptPFDS16(), new VariableByte()), new Composition(new FastPFOR128(), new VariableByte()), new Composition(new FastPFOR(), new VariableByte()), + new Composition(new VectorFastPFOR(), new VariableByte()), new Simple9(), new Simple16(), new GroupSimple9(), diff --git a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java index 881dada..ca919d4 100644 --- a/src/test/java/me/lemire/integercompression/SkippableBasicTest.java +++ b/src/test/java/me/lemire/integercompression/SkippableBasicTest.java @@ -13,6 +13,7 @@ import me.lemire.integercompression.differential.IntegratedVariableByte; import me.lemire.integercompression.differential.SkippableIntegratedComposition; import me.lemire.integercompression.differential.SkippableIntegratedIntegerCODEC; +import me.lemire.integercompression.vector.VectorFastPFOR; import org.junit.Test; import static org.junit.Assert.assertArrayEquals; @@ -37,6 +38,7 @@ public class SkippableBasicTest { new 
SkippableComposition(new OptPFDS16(), new VariableByte()), new SkippableComposition(new FastPFOR128(), new VariableByte()), new SkippableComposition(new FastPFOR(), new VariableByte()), + new SkippableComposition(new VectorFastPFOR(), new VariableByte()), new Simple9(), new Simple16() }; diff --git a/src/test/java/me/lemire/integercompression/vector/VectorBitPackerTest.java b/src/test/java/me/lemire/integercompression/vector/VectorBitPackerTest.java new file mode 100644 index 0000000..6640890 --- /dev/null +++ b/src/test/java/me/lemire/integercompression/vector/VectorBitPackerTest.java @@ -0,0 +1,73 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import static org.junit.Assert.assertArrayEquals; + +import java.util.Random; + +import org.junit.Test; + +/** + * Tests for the width-specific bit-packing kernels. + * + * VectorBitPacker (256/512-bit lanes), VectorBitPacker256 (256-bit lanes) and + * VectorBitPacker128 (128-bit lanes) use different lane strides, so their packed + * layouts differ and are not wire-compatible. Each must satisfy the same + * roundtrip contract: packing a 256-integer block whose values fit in b bits and + * unpacking it recovers the input for every width b. + */ +public class VectorBitPackerTest { + + private static final int BLOCK_SIZE = 256; + + private static int[] randomBlock(Random random, int b) { + int mask = b == 32 ? 
-1 : (1 << b) - 1; + int[] in = new int[BLOCK_SIZE]; + for (int i = 0; i < BLOCK_SIZE; i++) { + in[i] = random.nextInt() & mask; + } + return in; + } + + private static void roundTrip(VectorBitPackerKernels packer) { + Random random = new Random(42); + for (int b = 1; b <= 32; b++) { + int[] in = randomBlock(random, b); + + int[] packed = new int[8 * b]; + packer.fastpack(in, 0, packed, 0, b); + int[] recovered = new int[BLOCK_SIZE]; + packer.fastunpack(packed, 0, recovered, 0, b); + assertArrayEquals("fastpack b=" + b, in, recovered); + + int[] packedNoMask = new int[8 * b]; + packer.fastpackNoMask(in, 0, packedNoMask, 0, b); + int[] recoveredNoMask = new int[BLOCK_SIZE]; + packer.fastunpack(packedNoMask, 0, recoveredNoMask, 0, b); + assertArrayEquals("fastpackNoMask b=" + b, in, recoveredNoMask); + + assertArrayEquals("fastpack vs fastpackNoMask b=" + b, packed, + packedNoMask); + } + } + + @Test + public void vectorBitPacker128RoundTrip() { + roundTrip(new VectorBitPacker128()); + } + + @Test + public void vectorBitPacker256RoundTrip() { + roundTrip(new VectorBitPacker256()); + } + + @Test + public void vectorBitPackerRoundTrip() { + roundTrip(new VectorBitPacker()); + } +} diff --git a/src/test/java/me/lemire/integercompression/vector/VectorFastPFORTest.java b/src/test/java/me/lemire/integercompression/vector/VectorFastPFORTest.java new file mode 100644 index 0000000..e222c09 --- /dev/null +++ b/src/test/java/me/lemire/integercompression/vector/VectorFastPFORTest.java @@ -0,0 +1,88 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +package me.lemire.integercompression.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertThrows; + +import java.util.Arrays; + +import org.junit.Test; + +import me.lemire.integercompression.IntWrapper; +import me.lemire.integercompression.vector.VectorBitPackerKernels.LaneWidth; + +/** + * Tests for the vectorized FastPFOR codec. + */ +public class VectorFastPFORTest { + + /** + * A few exceptions in a single block leave a sub-block remainder that is + * packed with slowpack, which OR-accumulates into the output. Compressing + * into a non-zero buffer must still produce a clean roundtrip. + */ + @Test + public void dirtyOutputBufferRoundTrip() { + int[] data = new int[VectorFastPFOR.BLOCK_SIZE]; + for (int i = 0; i < data.length; i++) { + data[i] = i % 8; // base values fit in 3 bits + } + data[5] = 1 << 20; // exceptions sharing one width, count not a multiple + data[200] = 1 << 20; // of BLOCK_SIZE, so the remainder goes through slowpack + + VectorFastPFOR codec = new VectorFastPFOR(); + int[] compressed = new int[2 * data.length]; + Arrays.fill(compressed, -1); // stale bits the slowpack remainder must overwrite + IntWrapper inpos = new IntWrapper(0); + IntWrapper outpos = new IntWrapper(0); + codec.headlessCompress(data, inpos, data.length, compressed, outpos); + + int[] recovered = new int[data.length]; + codec.headlessUncompress(compressed, new IntWrapper(0), outpos.get(), + recovered, new IntWrapper(0), data.length); + + assertArrayEquals(data, recovered); + } + + /** A stream packed for wider lanes than the host runs natively is refused. 
*/ + @Test + public void checkDecodableRejectsWiderStream() { + assertThrows(IllegalStateException.class, + () -> VectorFastPFOR.checkDecodable(LaneWidth.BITS_256, LaneWidth.BITS_128)); + assertThrows(IllegalStateException.class, + () -> VectorFastPFOR.checkDecodable(LaneWidth.BITS_512, LaneWidth.BITS_256)); + // equal or narrower stream decodes natively + VectorFastPFOR.checkDecodable(LaneWidth.BITS_128, LaneWidth.BITS_128); + VectorFastPFOR.checkDecodable(LaneWidth.BITS_128, LaneWidth.BITS_512); + VectorFastPFOR.checkDecodable(LaneWidth.BITS_256, LaneWidth.BITS_512); + } + + /** 128-bit lanes are the universal floor, so such a stream decodes on any host. */ + @Test + public void lowestCommonWidthRoundTripsOnAnyHost() { + int[] data = new int[3 * VectorFastPFOR.BLOCK_SIZE]; + for (int i = 0; i < data.length; i++) { + data[i] = i % 8; + } + data[5] = 1 << 20; + data[600] = 1 << 25; + + VectorFastPFOR codec = + new VectorFastPFOR(VectorFastPFOR.DEFAULT_PAGE_SIZE, LaneWidth.BITS_128); + int[] compressed = new int[2 * data.length]; + IntWrapper inpos = new IntWrapper(0); + IntWrapper outpos = new IntWrapper(0); + codec.headlessCompress(data, inpos, data.length, compressed, outpos); + + int[] recovered = new int[data.length]; + codec.headlessUncompress(compressed, new IntWrapper(0), outpos.get(), + recovered, new IntWrapper(0), data.length); + + assertArrayEquals(data, recovered); + } +}