From 4584dfed17140a37bed291f3744e537f40926198 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Wed, 24 Jun 2026 09:11:59 +0200 Subject: [PATCH 1/2] GH-3637: Fix corrupt output when reusing a dictionary writer across row groups after fallback --- .../dictionary/DictionaryValuesWriter.java | 1 + .../values/fallback/FallbackValuesWriter.java | 5 +++ .../values/dictionary/TestDictionary.java | 32 +++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java index 92e88bac9f..f4ed350e3a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/dictionary/DictionaryValuesWriter.java @@ -202,6 +202,7 @@ public void resetDictionary() { lastUsedDictionaryByteSize = 0; lastUsedDictionarySize = 0; dictionaryTooBig = false; + dictionaryByteSize = 0; clearDictionaryContent(); } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java index 7f56ef2192..dd94c5bf1d 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java @@ -105,6 +105,11 @@ public void reset() { rawDataByteSize = 0; firstPage = false; currentWriter.reset(); + // After a fallback, currentWriter is the fallback writer, so the initial dictionary writer is never reset at + // row-group boundaries, which can silently corrupt the next row group + if (currentWriter != initialWriter) { + initialWriter.reset(); + } } @Override diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index c7cf351990..b8a90b631a 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -285,6 +285,38 @@ public void testSecondPageFallBack() throws IOException { } } + @Test + public void testDictionaryWriterReusableAfterFallBack() throws IOException { + int COUNT = 1000; + try (final FallbackValuesWriter cw = + newPlainBinaryDictionaryValuesWriter(1000, 10000)) { + + // --- Row group 1 --- + // First page is dictionary encoded and committed, which keeps the dictionary alive for + // the whole row group. + writeRepeated(COUNT, cw, "a"); + getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY); + + // Second page no longer fits the dictionary so it falls back to plain. The current writer + // becomes the fallback writer, while the dictionary writer still buffers this page's ids. + writeDistinct(COUNT, cw, "b"); + getBytesAndCheckEncoding(cw, PLAIN); + + // End of row group 1: emit the dictionary page and reset the dictionary state for reuse. + Assert.assertNotNull(cw.toDictPageAndClose()); + cw.resetDictionary(); + + // --- Row group 2 --- + // The dictionary writer must be clean again + writeRepeated(COUNT, cw, "c"); + BytesInput rg2Bytes = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY); + + // The page must decode back to exactly the values written in row group 2. + DictionaryValuesReader cr = initDicReader(cw, BINARY); + checkRepeated(COUNT, rg2Bytes, cr, "c"); + } + } + @Test public void testLongDictionary() throws IOException { int COUNT = 1000; From a2886ebc64a10b16392390777b4afe6477bee36f Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Wed, 1 Jul 2026 10:49:06 +0200 Subject: [PATCH 2/2] add test and reset for second use case --- .../values/fallback/FallbackValuesWriter.java | 8 ++-- .../values/dictionary/TestDictionary.java | 46 +++++++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java index dd94c5bf1d..41fe484f37 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java @@ -129,10 +129,12 @@ public DictionaryPage toDictPageAndClose() { @Override public void resetDictionary() { - if (initialUsedAndHadDictionary) { + currentWriter.resetDictionary(); + // After a fallback, currentWriter is the fallback writer, so the initial dictionary writer's + // dictionary is never reset at row-group boundaries, leaving stale dictionary entries/IDs that can silently + // corrupt the next row group + if (currentWriter != initialWriter) { initialWriter.resetDictionary(); - } else { - currentWriter.resetDictionary(); } currentWriter = initialWriter; fellBackAlready = false; diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java index b8a90b631a..52956a5bfc 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/dictionary/TestDictionary.java @@ -24,6 +24,7 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -287,33 +288,62 @@ public void testSecondPageFallBack() throws IOException { @Test public void testDictionaryWriterReusableAfterFallBack() throws IOException { - int COUNT = 1000; + int count = 1000; try (final FallbackValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000)) { // --- Row group 1 --- // First page is dictionary encoded and committed, which keeps the dictionary alive for // the whole row group. - writeRepeated(COUNT, cw, "a"); + writeRepeated(count, cw, "a"); getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY); // Second page no longer fits the dictionary so it falls back to plain. The current writer // becomes the fallback writer, while the dictionary writer still buffers this page's ids. - writeDistinct(COUNT, cw, "b"); + writeDistinct(count, cw, "b"); getBytesAndCheckEncoding(cw, PLAIN); // End of row group 1: emit the dictionary page and reset the dictionary state for reuse. - Assert.assertNotNull(cw.toDictPageAndClose()); + assertThat(cw.toDictPageAndClose()).isNotNull(); cw.resetDictionary(); // --- Row group 2 --- // The dictionary writer must be clean again - writeRepeated(COUNT, cw, "c"); + writeRepeated(count, cw, "c"); + BytesInput rg2Bytes = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY); + + // The page must decode back to exactly the values written in row group 2. + DictionaryValuesReader cr = initDicReader(cw, BINARY); + checkRepeated(count, rg2Bytes, cr, "c"); + } + } + + @Test + public void testDictionaryWriterReusableAfterFirstPageFallBack() throws IOException { + int count = 1000; + try (final FallbackValuesWriter cw = + newPlainBinaryDictionaryValuesWriter(10000, 10000)) { + + // --- Row group 1 --- + // The very first page falls back to plain because dictionary encoding is not efficient. Because the + // fallback happens on the first page, the dictionary was never committed as the page encoding, so + // initialUsedAndHadDictionary stays false and the current writer becomes the fallback writer. The + // dictionary writer, however, still holds this page's entries and byte size. + writeDistinct(count, cw, "a"); + getBytesAndCheckEncoding(cw, PLAIN); + + // End of row group 1: reset the dictionary state for reuse + cw.resetDictionary(); + + // --- Row group 2 --- + // The data is now dictionary friendly, so it must be dictionary encoded again. Without a clean initial + // dictionary writer, the stale entries/byte size from row group 1 would push this page back to plain. + writeRepeated(count, cw, "b"); BytesInput rg2Bytes = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY); // The page must decode back to exactly the values written in row group 2. DictionaryValuesReader cr = initDicReader(cw, BINARY); - checkRepeated(COUNT, rg2Bytes, cr, "c"); + checkRepeated(count, rg2Bytes, cr, "b"); } } @@ -859,7 +889,7 @@ private void checkDistinct(int COUNT, BytesInput bytes, ValuesReader cr, String private void checkRepeated(int COUNT, BytesInput bytes, ValuesReader cr, String prefix) throws IOException { cr.initFromPage(COUNT, bytes.toInputStream()); for (int i = 0; i < COUNT; i++) { - Assert.assertEquals(prefix + i % 10, cr.readBytes().toStringUsingUTF8()); + assertThat(cr.readBytes().toStringUsingUTF8()).isEqualTo(prefix + i % 10); } } @@ -886,7 +916,7 @@ private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) { private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding) throws IOException { BytesInput bytes = BytesInput.copy(cw.getBytes()); - assertEquals(encoding, cw.getEncoding()); + assertThat(cw.getEncoding()).isEqualTo(encoding); cw.reset(); return bytes; }