diff --git a/mypyc/codegen/emit.py b/mypyc/codegen/emit.py index b89c91343e66c..57cce6a3fe8fe 100644 --- a/mypyc/codegen/emit.py +++ b/mypyc/codegen/emit.py @@ -2,14 +2,12 @@ from __future__ import annotations -import pprint import sys -import textwrap from collections.abc import Callable from typing import Final from mypyc.codegen.cstring import c_string_initializer -from mypyc.codegen.literals import Literals +from mypyc.codegen.literals import Literals, literal_sort_key from mypyc.common import ( ATTR_PREFIX, BITMAP_BITS, @@ -237,24 +235,16 @@ def attr(self, name: str) -> str: return ATTR_PREFIX + name def object_annotation(self, obj: object, line: str) -> str: - """Build a C comment with an object's string representation. + """Build a C comment with a literal value's string representation. - If the comment exceeds the line length limit, it's wrapped into a - multiline string (with the extra lines indented to be aligned with - the first line's comment). + This is a debugging aid that makes generated C easier to read. - If it contains illegal characters, an empty string is returned.""" - line_width = self._indent + len(line) - formatted = pprint.pformat(obj, compact=True, indent=1, width=max(90 - line_width, 20)) - if any(x in formatted for x in ("/*", "*/", "\0")): + If it contains illegal characters or is too long, return an empty string. 
def stable_literal_repr(obj: object) -> str:
    """Render a literal value as a one-line string with stable output.

    Tuples and frozensets are rendered recursively; every other value is
    rendered with plain repr(). Frozenset members are emitted in the order
    given by literal_sort_key (imported from mypyc.codegen.literals) instead
    of iteration order, since the latter is hash-seed dependent.
    """
    if isinstance(obj, tuple):
        parts = [stable_literal_repr(element) for element in obj]
        # A one-element tuple keeps its trailing comma, matching repr().
        closing = ",)" if len(parts) == 1 else ")"
        return f"({', '.join(parts)}{closing}"

    if not isinstance(obj, frozenset):
        return repr(obj)

    if len(obj) == 0:
        return "frozenset()"
    ordered = sorted(obj, key=literal_sort_key)
    rendered = ", ".join(map(stable_literal_repr, ordered))
    return f"frozenset({{{rendered}}})"
def literal_sort_key(value: object) -> tuple[object, ...]:
    """Compute a key for ordering literal values deterministically.

    The key is a pair whose first element names the kind of value:

    * frozenset: ("frozenset", keys of the members, sorted)
    * tuple: ("tuple", keys of the items, in their original order)
    * anything else: (type name, repr(value))
    """
    if not isinstance(value, (frozenset, tuple)):
        return (type(value).__name__, repr(value))

    member_keys = [literal_sort_key(member) for member in value]
    if isinstance(value, frozenset):
        # Frozenset iteration order is unpredictable, so normalize it.
        member_keys.sort()
        return ("frozenset", tuple(member_keys))
    return ("tuple", tuple(member_keys))
emitter.fragments[0] == "CPyStatics[0]; /* 'hello, world' */\n" - assert emitter.fragments[1] == """\ -CPyStatics[1]; /* [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21, 22, 23, 24, 25, 26, 27, 28, 29] */\n""" + assert emitter.fragments[1] == "CPyStatics[1]; /* 42 */\n" def test_emit_undefined_value_for_simple_type(self) -> None: emitter = self.emitter diff --git a/mypyc/test/test_literals.py b/mypyc/test/test_literals.py index a8c17d10d30d0..f1dc7f2434414 100644 --- a/mypyc/test/test_literals.py +++ b/mypyc/test/test_literals.py @@ -10,6 +10,7 @@ _encode_int_values, _encode_str_values, format_str_literal, + literal_sort_key, ) @@ -88,3 +89,50 @@ def test_tuple_literal(self) -> None: "7", # Second tuple (length=4) "0", # Third tuple (length=0) ] + + def test_frozenset_literal_index_is_deterministic(self) -> None: + # Index assignment for members must not depend on frozenset iteration + # order (which is hash-seed dependent), so that generated code is + # reproducible. + lit1 = Literals() + lit1.record_literal(frozenset({"self", "cls"})) + lit2 = Literals() + lit2.record_literal(frozenset({"cls", "self"})) + for s in ("self", "cls"): + assert lit1.literal_index(s) == lit2.literal_index(s) + # Members are recorded in sorted order. + assert lit1.literal_index("cls") == 3 + assert lit1.literal_index("self") == 4 + + def test_frozenset_encoding_is_deterministic(self) -> None: + lit1 = Literals() + lit1.record_literal(frozenset({"self", "cls"})) + lit2 = Literals() + lit2.record_literal(frozenset({"cls", "self"})) + assert lit1.encoded_frozenset_values() == lit2.encoded_frozenset_values() + + def test_literal_sort_key_is_total_over_types(self) -> None: + # Heterogeneous, individually unorderable items must still be sorted. 
def test_literal_sort_key_with_frozenset(self) -> None:
    """Equal frozensets must produce equal sort keys, however they were built.

    Covers a plain frozenset, a frozenset inside a tuple, and a frozenset
    nested inside another frozenset.
    """
    equivalent_pairs = [
        (frozenset({"a", "b"}), frozenset({"b", "a"})),
        ((frozenset({"a", "b"}),), (frozenset({"b", "a"}),)),
        (
            frozenset({"a", frozenset({"b", "c"})}),
            frozenset({frozenset({"c", "b"}), "a"}),
        ),
    ]
    for left, right in equivalent_pairs:
        assert literal_sort_key(left) == literal_sort_key(right)