From b9cb38ffbdc07df4f65e9a339810ec7f089b715d Mon Sep 17 00:00:00 2001 From: Alvin Wan Date: Sat, 4 Apr 2026 05:17:57 -0700 Subject: [PATCH 1/3] reintroduce local string hoisting --- README.md | 19 ++++-- examples/pyminify.py | 28 ++++----- pymini/pymini.py | 142 ++++++++++++++++++++++++++++++++++++++++++- tests/test_api.py | 73 ++++++++++++++++++++++ 4 files changed, 241 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 68449e7..6c086d4 100644 --- a/README.md +++ b/README.md @@ -63,17 +63,24 @@ python3 -m pip install -e ".[dev]" python3 -m pytest ``` +## Compression Examples + +Checked-in minified outputs for the repo fixtures live in [examples](./examples) and +are regenerated by `scripts/regenerate_examples.py`. + +| Input | Original | `pymini` | `pyminifier` | `python-minifier` (`pyminify`) | +| --- | ---: | ---: | ---: | ---: | +| `tests/examples/pyminifier.py` | `1,355` bytes | `511` bytes, `62.3%` | `676` bytes, `50.1%` | `1,020` bytes, `24.7%` | +| `tests/examples/pyminify.py` | `1,990` bytes | `1,129` bytes, `43.3%` | `1,605` bytes, `19.3%` | `983` bytes, `50.6%` | +| `TexSoup/` raw Python source (`*.py`) | `98,181` bytes | `31,216` bytes, `68.2%` | `—` | `—` | +| `TexSoup/` compressed source (`.tar.gz`) | `70,532` bytes | `45,065` bytes, `36.1%` | `—` | `—` | + ## TexSoup Validation `pymini` has been validated against the upstream `TexSoup` test suite in package mode. Current validation: raw source code `68.2%` smaller, compressed source code (`.tar.gz`) `36.1%` smaller. - - -| Measurement | Original | Minified | Reduction | Reduction Rate | -| --- | ---: | ---: | ---: | ---: | -| Raw Python source (`*.py`) | `98,181` bytes | `31,212` bytes | `66,969` bytes | `68.2%` | -| `.tar.gz` of `TexSoup/` | `70,532` bytes | `45,054` bytes | `25,478` bytes | `36.1%` | + To reproduce that flow locally: diff --git a/examples/pyminify.py b/examples/pyminify.py index 0151546..c347057 100644 --- a/examples/pyminify.py +++ b/examples/pyminify.py @@ -1,22 +1,22 @@ def a(event,context): - l.info(event) + f='RequestType';g='PhysicalResourceId';h='None';i='Status';j='SUCCESS';k='Tags';m='OldResourceProperties';l.info(event) try: b=hashlib.new('md5',(event['RequestId']+event['StackId']).encode()).hexdigest();c=event['ResourceProperties'] - if event['RequestType']=='Create': - event['PhysicalResourceId']='None';event['PhysicalResourceId']=create_cert(c,b);add_tags(event['PhysicalResourceId'],c);validate(event['PhysicalResourceId'],c) - if wait_for_issuance(event['PhysicalResourceId'],context):event['Status']='SUCCESS';return send(event) + if event[f]=='Create': + event[g]=h;event[g]=create_cert(c,b);add_tags(event[g],c);validate(event[g],c) + if wait_for_issuance(event[g],context):event[i]=j;return send(event) else:return reinvoke(event,context) - elif event['RequestType']=='Delete': - if event['PhysicalResourceId']!='None':acm.delete_certificate(CertificateArn=event['PhysicalResourceId']) - event['Status']='SUCCESS';return send(event) - elif event['RequestType']=='Update': + elif event[f]=='Delete': + if event[g]!=h:acm.delete_certificate(CertificateArn=event[g]) + event[i]=j;return send(event) + elif event[f]=='Update': if replace_cert(event): - event['PhysicalResourceId']=create_cert(c,b);add_tags(event['PhysicalResourceId'],c);validate(event['PhysicalResourceId'],c) - if not wait_for_issuance(event['PhysicalResourceId'],context):return reinvoke(event,context) + event[g]=create_cert(c,b);add_tags(event[g],c);validate(event[g],c) + if not wait_for_issuance(event[g],context):return reinvoke(event,context) else: - if 'Tags' in event['OldResourceProperties']:acm.remove_tags_from_certificate(CertificateArn=event['PhysicalResourceId'],Tags=event['OldResourceProperties']['Tags']) - add_tags(event['PhysicalResourceId'],c) - event['Status']='SUCCESS';return send(event) + if k in event[m]:acm.remove_tags_from_certificate(CertificateArn=event[g],Tags=event[m][k]) + add_tags(event[g],c) + event[i]=j;return send(event) else:raise RuntimeError('Unknown RequestType') - except Exception as d:l.exception('');event['Status']='FAILED';event['Reason']=str(d);return send(event) + except Exception as d:l.exception('');event[i]='FAILED';event['Reason']=str(d);return send(event) handler=a \ No newline at end of file diff --git a/pymini/pymini.py b/pymini/pymini.py index 011849a..afd9098 100644 --- a/pymini/pymini.py +++ b/pymini/pymini.py @@ -165,7 +165,8 @@ class VariableShortener(NodeTransformer): # Deferred optimizations intentionally left off after validating against # TexSoup and similar package-shaped inputs: # - aliasing repeated name reads into generated locals - # - hoisting repeated string literals into generated locals + # - hoisting repeated string literals into generated locals at module or + # class scope # - renaming attribute call sites such as obj.method(...) # - renaming methods, class-body attributes, and top-level class names in # preserve-public-API mode @@ -667,6 +668,144 @@ def transform(self, *trees): return new_trees +def _is_unsupported_hoisted_string_context(node): + current = node + pattern_nodes = ( + ast.MatchValue, + ast.MatchSingleton, + ast.MatchSequence, + ast.MatchMapping, + ast.MatchClass, + ast.MatchAs, + ast.MatchOr, + ) + while hasattr(current, "parent"): + parent = current.parent + if isinstance(parent, (ast.JoinedStr, *pattern_nodes)): + return True + if isinstance(parent, ast.arg) and parent.annotation is current: + return True + if isinstance(parent, ast.AnnAssign) and parent.annotation is current: + return True + if isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef)) and parent.returns is current: + return True + current = parent + return False + + +class RepeatedStringHoister(Transformer): + # Reintroduced in the narrowest safe form first: only hoist repeated string + # literals inside function bodies. Module and class scopes are still left + # alone because new bindings there change the public surface or class + # namespace more directly. + def __init__(self, generator): + super().__init__() + self.generator = generator + + def transform(self, *trees): + for tree in trees: + ParentSetter().visit(tree) + collector = RepeatedStringCollector() + collector.visit(tree) + RepeatedStringRewriter(self.generator, collector.repeated_strings_by_scope).visit(tree) + ParentSetter().visit(tree) + ast.fix_missing_locations(tree) + return trees + + +class RepeatedStringCollector(ast.NodeVisitor): + def __init__(self): + self.scope_stack = [] + self.repeated_strings_by_scope = {} + + def visit_FunctionDef(self, node): + counts = {} + self.scope_stack.append(counts) + for statement in node.body: + self.visit(statement) + self.scope_stack.pop() + repeated = [ + value + for value, count in counts.items() + if count > 1 and len(repr(value)) > 4 + ] + if repeated: + self.repeated_strings_by_scope[id(node)] = repeated + + visit_AsyncFunctionDef = visit_FunctionDef + + def visit_ClassDef(self, node): + for statement in node.body: + if isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + self.visit(statement) + + def visit_Constant(self, node): + if not self.scope_stack or not isinstance(node.value, str): + return + if _is_unsupported_hoisted_string_context(node): + return + counts = self.scope_stack[-1] + counts[node.value] = counts.get(node.value, 0) + 1 + + +class RepeatedStringRewriter(ast.NodeTransformer): + def __init__(self, generator, repeated_strings_by_scope): + super().__init__() + self.generator = generator + self.repeated_strings_by_scope = repeated_strings_by_scope + self.scope_stack = [] + + def _prepend_assignments(self, body, mapping): + assignments = [] + for value, name in mapping.items(): + assignment = ast.Assign( + targets=[ast.Name(id=name, ctx=ast.Store())], + value=ast.Constant(value=value), + ) + assignment._pymini_generated = True + assignments.append(assignment) + return assignments + body + + def visit_FunctionDef(self, node): + mapping = {} + repeated = self.repeated_strings_by_scope.get(id(node), ()) + if repeated: + mapping = {value: next(self.generator) for value in repeated} + self.scope_stack.append(mapping) + node.body = [self.visit(statement) for statement in node.body] + self.scope_stack.pop() + if mapping: + node.body = self._prepend_assignments(node.body, mapping) + return node + + visit_AsyncFunctionDef = visit_FunctionDef + + def visit_ClassDef(self, node): + updated_body = [] + for statement in node.body: + if isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + updated_body.append(self.visit(statement)) + else: + updated_body.append(statement) + node.body = updated_body + return node + + def visit_Assign(self, node): + if getattr(node, "_pymini_generated", False): + return node + return self.generic_visit(node) + + def visit_Constant(self, node): + if not self.scope_stack or not isinstance(node.value, str): + return node + if _is_unsupported_hoisted_string_context(node): + return node + mapping = self.scope_stack[-1] + if node.value not in mapping: + return node + return ast.copy_location(ast.Name(id=mapping[node.value], ctx=ast.Load()), node) + + class ImportedVariableShortener(VariableShortener): """Use different module shorteners to adjust variables in this module @@ -1127,6 +1266,7 @@ def minify(sources, modules='main', keep_module_names=False, modules=ind.modules, keep_module_names=keep_module_names, ), # obfuscate across files + RepeatedStringHoister(ind.generator), # optionally fuse files fuser := ( diff --git a/tests/test_api.py b/tests/test_api.py index 4e57d12..3e1ad26 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -160,6 +160,79 @@ def test_variable_name_generator_skips_python_keywords(): assert all(not keyword.iskeyword(name) for name in names) +def test_minify_hoists_repeated_strings_inside_functions(tmp_path): + cleaned, modules = minify( + py( + """ + def f(): + return { + "left": "PhysicalResourceId", + "right": "PhysicalResourceId", + } + + print(f()["left"], f()["right"]) + """ + ), + "main", + keep_global_variables=True, + keep_module_names=True, + ) + + tree = ast.parse(cleaned[0]) + function = next(node for node in tree.body if isinstance(node, ast.FunctionDef)) + helper = function.body[0] + + assert isinstance(helper, ast.Assign) + assert isinstance(helper.value, ast.Constant) + assert helper.value.value == "PhysicalResourceId" + assert cleaned[0].count("PhysicalResourceId") == 1 + + module_path = tmp_path / "module.py" + module_path.write_text(cleaned[0], encoding="utf-8") + result = subprocess.run( + [sys.executable, str(module_path)], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + assert result.stdout == "PhysicalResourceId PhysicalResourceId\n" + assert modules == ["main"] + + +def test_minify_does_not_hoist_repeated_strings_into_class_bodies(tmp_path): + cleaned, modules = minify( + py( + """ + class Token: + left = "PhysicalResourceId" + right = "PhysicalResourceId" + + print(Token.left, Token.right) + """ + ), + "main", + keep_global_variables=True, + keep_module_names=True, + ) + + assert cleaned[0].count("PhysicalResourceId") == 2 + + module_path = tmp_path / "module.py" + module_path.write_text(cleaned[0], encoding="utf-8") + result = subprocess.run( + [sys.executable, str(module_path)], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + assert result.stdout == "PhysicalResourceId PhysicalResourceId\n" + assert modules == ["main"] + + def test_minify_preserves_global_names_without_breaking_shadowed_locals(tmp_path): cleaned, modules = minify( py( From 316aca4743f7b26c7708c45c8a357831c38c0476 Mon Sep 17 00:00:00 2001 From: Alvin Wan Date: Sat, 4 Apr 2026 05:22:09 -0700 Subject: [PATCH 2/3] guard pattern AST nodes on Python 3.9 --- pymini/pymini.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/pymini/pymini.py b/pymini/pymini.py index afd9098..e98b0b7 100644 --- a/pymini/pymini.py +++ b/pymini/pymini.py @@ -670,18 +670,23 @@ def transform(self, *trees): def _is_unsupported_hoisted_string_context(node): current = node - pattern_nodes = ( - ast.MatchValue, - ast.MatchSingleton, - ast.MatchSequence, - ast.MatchMapping, - ast.MatchClass, - ast.MatchAs, - ast.MatchOr, + pattern_nodes = tuple( + node_type for node_type in ( + getattr(ast, "MatchValue", None), + getattr(ast, "MatchSingleton", None), + getattr(ast, "MatchSequence", None), + getattr(ast, "MatchMapping", None), + getattr(ast, "MatchClass", None), + getattr(ast, "MatchAs", None), + getattr(ast, "MatchOr", None), + ) + if node_type is not None ) while hasattr(current, "parent"): parent = current.parent - if isinstance(parent, (ast.JoinedStr, *pattern_nodes)): + if isinstance(parent, ast.JoinedStr): + return True + if pattern_nodes and isinstance(parent, pattern_nodes): return True if isinstance(parent, ast.arg) and parent.annotation is current: return True From 76b25e8090292cc4fd46b6f563dc41203ff356e3 Mon Sep 17 00:00:00 2001 From: Alvin Wan Date: Sat, 4 Apr 2026 05:25:19 -0700 Subject: [PATCH 3/3] avoid hoist helper collisions --- pymini/pymini.py | 31 ++++++++++++++++++++++++- tests/test_api.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/pymini/pymini.py b/pymini/pymini.py index e98b0b7..14da509 100644 --- a/pymini/pymini.py +++ b/pymini/pymini.py @@ -698,6 +698,24 @@ def _is_unsupported_hoisted_string_context(node): return False +def _reserved_names_in_node(node): + names = set() + for current in ast.walk(node): + if isinstance(current, ast.Name): + names.add(current.id) + elif isinstance(current, ast.arg): + names.add(current.arg) + elif isinstance(current, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + names.add(current.name) + elif isinstance(current, ast.alias): + names.add(current.asname or current.name.split(".", 1)[0]) + elif isinstance(current, (ast.Global, ast.Nonlocal)): + names.update(current.names) + elif isinstance(current, ast.ExceptHandler) and current.name: + names.add(current.name) + return names + + class RepeatedStringHoister(Transformer): # Reintroduced in the narrowest safe form first: only hoist repeated string # literals inside function bodies. Module and class scopes are still left @@ -760,6 +778,13 @@ def __init__(self, generator, repeated_strings_by_scope): self.repeated_strings_by_scope = repeated_strings_by_scope self.scope_stack = [] + def _next_safe_name(self, reserved_names): + while True: + candidate = next(self.generator) + if candidate not in reserved_names: + reserved_names.add(candidate) + return candidate + def _prepend_assignments(self, body, mapping): assignments = [] for value, name in mapping.items(): @@ -775,7 +800,11 @@ def visit_FunctionDef(self, node): mapping = {} repeated = self.repeated_strings_by_scope.get(id(node), ()) if repeated: - mapping = {value: next(self.generator) for value in repeated} + reserved_names = _reserved_names_in_node(node) + mapping = { + value: self._next_safe_name(reserved_names) + for value in repeated + } self.scope_stack.append(mapping) node.body = [self.visit(statement) for statement in node.body] self.scope_stack.pop() diff --git a/tests/test_api.py b/tests/test_api.py index 3e1ad26..e18dc8c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -233,6 +233,65 @@ class Token: assert modules == ["main"] +def test_minify_hoisted_strings_do_not_collide_with_lambda_parameters(tmp_path): + cleaned, modules = minify( + py( + """ + def outer(): + return (lambda b: ("hello", "hello"))("x") + + print(outer()) + """ + ), + "main", + keep_global_variables=True, + keep_module_names=True, + ) + + module_path = tmp_path / "module.py" + module_path.write_text(cleaned[0], encoding="utf-8") + result = subprocess.run( + [sys.executable, str(module_path)], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + assert result.stdout == "('hello', 'hello')\n" + assert modules == ["main"] + + +def test_minify_hoisted_strings_do_not_conflict_with_global_declarations(tmp_path): + cleaned, modules = minify( + py( + """ + def outer(): + global b + return ("hello", "hello") + + print(outer()) + """ + ), + "main", + keep_global_variables=True, + keep_module_names=True, + ) + + module_path = tmp_path / "module.py" + module_path.write_text(cleaned[0], encoding="utf-8") + result = subprocess.run( + [sys.executable, str(module_path)], + capture_output=True, + text=True, + check=False, + ) + + assert result.returncode == 0, result.stderr + assert result.stdout == "('hello', 'hello')\n" + assert modules == ["main"] + + def test_minify_preserves_global_names_without_breaking_shadowed_locals(tmp_path): cleaned, modules = minify( py(