From 82b78b0be9ebb9eb591097002fa4c118cb6c2251 Mon Sep 17 00:00:00 2001 From: "zainnadeem(RedOpsCell)" Date: Sat, 20 Jun 2026 18:24:04 +0500 Subject: [PATCH] gh-151763: Fix OOM-0034 tokenizer offset error handling --- Lib/test/test_tokenize.py | 63 ++++++++++++++++++- ...06-20-18-21-28.gh-issue-151763.OOM0034.rst | 4 ++ Parser/pegen.c | 3 + Python/Python-tokenize.c | 40 +++++++++--- 4 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index ab53a20cff55392..5aa8456f7237249 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -11,7 +11,7 @@ from textwrap import dedent from unittest import TestCase, mock from test import support -from test.support import os_helper +from test.support import import_helper, os_helper from test.support.script_helper import run_test_script, make_script, run_python_until_end from test.support.numbers import ( VALID_UNDERSCORE_LITERALS, @@ -2266,6 +2266,67 @@ def readline(encoding): )) self.assertEqual(tokens, expected) + @unittest.skipIf(support.Py_TRACE_REFS, + '_testcapi.set_nomemory() is unreliable with Py_TRACE_REFS') + def test_col_offset_conversion_oom(self): + import_helper.import_module('_testcapi') + code = dedent(r""" + import _testcapi + import _tokenize + + def check_indented_name(start): + source = "if True:\n \u00e9 = 1\n" + it = _tokenize.TokenizerIter( + iter(source.splitlines(True)).__next__, + extra_tokens=False, + ) + for _ in range(5): + next(it) + + _testcapi.set_nomemory(start, start + 1) + try: + next(it) + except MemoryError: + return True + finally: + _testcapi.remove_mem_hooks() + return False + + def check_multiline_string(start): + source = "x = '''abc\ndef'''\n" + it = _tokenize.TokenizerIter( + iter(source.splitlines(True)).__next__, + extra_tokens=False, + ) + next(it) + next(it) + + _testcapi.set_nomemory(start, start + 1) + try: + next(it) + except MemoryError: + return True + finally: + _testcapi.remove_mem_hooks() + return False + + def check_range(name, func): + seen_memory_error = False + for index in range(20): + if func(index): + seen_memory_error = True + if not seen_memory_error: + raise AssertionError(f"{name}: MemoryError not raised") + + check_range("line", check_indented_name) + check_range("raw", check_multiline_string) + print("MemoryError") + """) + with support.SuppressCrashReport(): + res, _ = run_python_until_end("-c", code) + self.assertEqual(res.rc, 0, res.err.decode("ascii", "replace")) + self.assertIn(b"MemoryError", res.out) + def test_int(self): self.check_tokenize('0xff <= 255', """\ diff --git a/Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst b/Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst new file mode 100644 index 000000000000000..258ae7f4fb616c6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-20-18-21-28.gh-issue-151763.OOM0034.rst @@ -0,0 +1,4 @@ +Fix a possible crash in ``_tokenize.TokenizerIter`` when memory allocation +fails while converting byte offsets to character offsets for non-ASCII source +lines. The tokenizer now correctly propagates ``MemoryError`` instead of +dereferencing a NULL pointer or returning a result with an exception set. diff --git a/Parser/pegen.c b/Parser/pegen.c index bb222b50fc095f2..8ef30cf3da93479 100644 --- a/Parser/pegen.c +++ b/Parser/pegen.c @@ -27,6 +27,9 @@ Py_ssize_t _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset) { const unsigned char *data = (const unsigned char*)PyUnicode_AsUTF8(line); + if (data == NULL) { + return -1; + } Py_ssize_t len = 0; while (col_offset < end_col_offset) { diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index e6d39e4c7dc8235..76ae9c617ee1a6f 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -202,21 +202,27 @@ _get_current_line(tokenizeriterobject *it, const char *line_start, Py_ssize_t si return line; } -static void +static int _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_start, PyObject *line, int line_changed, Py_ssize_t lineno, Py_ssize_t end_lineno, Py_ssize_t *col_offset, Py_ssize_t *end_col_offset) { _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(it); Py_ssize_t byte_offset = -1; + Py_ssize_t byte_col_offset_diff = it->byte_col_offset_diff; if (token.start != NULL && token.start >= line_start) { byte_offset = token.start - line_start; if (line_changed) { - *col_offset = _PyPegen_byte_offset_to_character_offset_line(line, 0, byte_offset); - it->byte_col_offset_diff = byte_offset - *col_offset; + Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_line( + line, 0, byte_offset); + if (offset < 0) { + return -1; + } + *col_offset = offset; + byte_col_offset_diff = byte_offset - *col_offset; } else { - *col_offset = byte_offset - it->byte_col_offset_diff; + *col_offset = byte_offset - byte_col_offset_diff; } } @@ -226,17 +232,28 @@ _get_col_offsets(tokenizeriterobject *it, struct token token, const char *line_s // If the whole token is at the same line, we can just use the token.start // buffer for figuring out the new column offset, since using line is not // performant for very long lines. - Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line(line, byte_offset, end_byte_offset); + Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line( + line, byte_offset, end_byte_offset); + if (token_col_offset < 0) { + return -1; + } *end_col_offset = *col_offset + token_col_offset; - it->byte_col_offset_diff += token.end - token.start - token_col_offset; + byte_col_offset_diff += token.end - token.start - token_col_offset; } else { - *end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, end_byte_offset); - it->byte_col_offset_diff += end_byte_offset - *end_col_offset; + Py_ssize_t offset = _PyPegen_byte_offset_to_character_offset_raw( + it->tok->line_start, end_byte_offset); + if (offset < 0) { + return -1; + } + *end_col_offset = offset; + byte_col_offset_diff += end_byte_offset - *end_col_offset; } } + it->byte_col_offset_diff = byte_col_offset_diff; it->last_lineno = lineno; it->last_end_lineno = end_lineno; + return 0; } static PyObject * @@ -301,8 +318,11 @@ tokenizeriter_next(PyObject *op) Py_ssize_t end_lineno = it->tok->lineno; Py_ssize_t col_offset = -1; Py_ssize_t end_col_offset = -1; - _get_col_offsets(it, token, line_start, line, line_changed, - lineno, end_lineno, &col_offset, &end_col_offset); + if (_get_col_offsets(it, token, line_start, line, line_changed, + lineno, end_lineno, &col_offset, &end_col_offset) < 0) { + Py_DECREF(str); + goto exit; + } if (it->tok->tok_extra_tokens) { if (is_trailing_token) {