From b9cb38ffbdc07df4f65e9a339810ec7f089b715d Mon Sep 17 00:00:00 2001
From: Alvin Wan
Date: Sat, 4 Apr 2026 05:17:57 -0700
Subject: [PATCH 1/3] reintroduce local string hoisting
---
README.md | 19 ++++--
examples/pyminify.py | 28 ++++-----
pymini/pymini.py | 142 ++++++++++++++++++++++++++++++++++++++++++-
tests/test_api.py | 73 ++++++++++++++++++++++
4 files changed, 241 insertions(+), 21 deletions(-)
diff --git a/README.md b/README.md
index 68449e7..6c086d4 100644
--- a/README.md
+++ b/README.md
@@ -63,17 +63,24 @@ python3 -m pip install -e ".[dev]"
python3 -m pytest
```
+## Compression Examples
+
+Checked-in minified outputs for the repo fixtures live in [examples](./examples) and
+are regenerated by `scripts/regenerate_examples.py`. Percentages are size reductions relative to the original.
+
+| Input | Original | `pymini` | `pyminifier` | `python-minifier` (`pyminify`) |
+| --- | ---: | ---: | ---: | ---: |
+| `tests/examples/pyminifier.py` | `1,355` bytes | `511` bytes, `62.3%` | `676` bytes, `50.1%` | `1,020` bytes, `24.7%` |
+| `tests/examples/pyminify.py` | `1,990` bytes | `1,129` bytes, `43.3%` | `1,605` bytes, `19.3%` | `983` bytes, `50.6%` |
+| `TexSoup/` raw Python source (`*.py`) | `98,181` bytes | `31,216` bytes, `68.2%` | `—` | `—` |
+| `TexSoup/` compressed source (`.tar.gz`) | `70,532` bytes | `45,065` bytes, `36.1%` | `—` | `—` |
+
## TexSoup Validation
`pymini` has been validated against the upstream `TexSoup` test suite in package mode.
Current validation: raw source code `68.2%` smaller, compressed source code
(`.tar.gz`) `36.1%` smaller.
-
-
-| Measurement | Original | Minified | Reduction | Reduction Rate |
-| --- | ---: | ---: | ---: | ---: |
-| Raw Python source (`*.py`) | `98,181` bytes | `31,212` bytes | `66,969` bytes | `68.2%` |
-| `.tar.gz` of `TexSoup/` | `70,532` bytes | `45,054` bytes | `25,478` bytes | `36.1%` |
+
To reproduce that flow locally:
diff --git a/examples/pyminify.py b/examples/pyminify.py
index 0151546..c347057 100644
--- a/examples/pyminify.py
+++ b/examples/pyminify.py
@@ -1,22 +1,22 @@
def a(event,context):
- l.info(event)
+ f='RequestType';g='PhysicalResourceId';h='None';i='Status';j='SUCCESS';k='Tags';m='OldResourceProperties';l.info(event)
try:
b=hashlib.new('md5',(event['RequestId']+event['StackId']).encode()).hexdigest();c=event['ResourceProperties']
- if event['RequestType']=='Create':
- event['PhysicalResourceId']='None';event['PhysicalResourceId']=create_cert(c,b);add_tags(event['PhysicalResourceId'],c);validate(event['PhysicalResourceId'],c)
- if wait_for_issuance(event['PhysicalResourceId'],context):event['Status']='SUCCESS';return send(event)
+ if event[f]=='Create':
+ event[g]=h;event[g]=create_cert(c,b);add_tags(event[g],c);validate(event[g],c)
+ if wait_for_issuance(event[g],context):event[i]=j;return send(event)
else:return reinvoke(event,context)
- elif event['RequestType']=='Delete':
- if event['PhysicalResourceId']!='None':acm.delete_certificate(CertificateArn=event['PhysicalResourceId'])
- event['Status']='SUCCESS';return send(event)
- elif event['RequestType']=='Update':
+ elif event[f]=='Delete':
+ if event[g]!=h:acm.delete_certificate(CertificateArn=event[g])
+ event[i]=j;return send(event)
+ elif event[f]=='Update':
if replace_cert(event):
- event['PhysicalResourceId']=create_cert(c,b);add_tags(event['PhysicalResourceId'],c);validate(event['PhysicalResourceId'],c)
- if not wait_for_issuance(event['PhysicalResourceId'],context):return reinvoke(event,context)
+ event[g]=create_cert(c,b);add_tags(event[g],c);validate(event[g],c)
+ if not wait_for_issuance(event[g],context):return reinvoke(event,context)
else:
- if 'Tags' in event['OldResourceProperties']:acm.remove_tags_from_certificate(CertificateArn=event['PhysicalResourceId'],Tags=event['OldResourceProperties']['Tags'])
- add_tags(event['PhysicalResourceId'],c)
- event['Status']='SUCCESS';return send(event)
+ if k in event[m]:acm.remove_tags_from_certificate(CertificateArn=event[g],Tags=event[m][k])
+ add_tags(event[g],c)
+ event[i]=j;return send(event)
else:raise RuntimeError('Unknown RequestType')
- except Exception as d:l.exception('');event['Status']='FAILED';event['Reason']=str(d);return send(event)
+ except Exception as d:l.exception('');event[i]='FAILED';event['Reason']=str(d);return send(event)
handler=a
\ No newline at end of file
diff --git a/pymini/pymini.py b/pymini/pymini.py
index 011849a..afd9098 100644
--- a/pymini/pymini.py
+++ b/pymini/pymini.py
@@ -165,7 +165,8 @@ class VariableShortener(NodeTransformer):
# Deferred optimizations intentionally left off after validating against
# TexSoup and similar package-shaped inputs:
# - aliasing repeated name reads into generated locals
- # - hoisting repeated string literals into generated locals
+ # - hoisting repeated string literals into generated locals at module or
+ # class scope
# - renaming attribute call sites such as obj.method(...)
# - renaming methods, class-body attributes, and top-level class names in
# preserve-public-API mode
@@ -667,6 +668,144 @@ def transform(self, *trees):
return new_trees
+def _is_unsupported_hoisted_string_context(node):
+ current = node
+ pattern_nodes = (
+ ast.MatchValue,
+ ast.MatchSingleton,
+ ast.MatchSequence,
+ ast.MatchMapping,
+ ast.MatchClass,
+ ast.MatchAs,
+ ast.MatchOr,
+ )
+ while hasattr(current, "parent"):
+ parent = current.parent
+ if isinstance(parent, (ast.JoinedStr, *pattern_nodes)):
+ return True
+ if isinstance(parent, ast.arg) and parent.annotation is current:
+ return True
+ if isinstance(parent, ast.AnnAssign) and parent.annotation is current:
+ return True
+ if isinstance(parent, (ast.FunctionDef, ast.AsyncFunctionDef)) and parent.returns is current:
+ return True
+ current = parent
+ return False
+
+
+class RepeatedStringHoister(Transformer):
+ # Reintroduced in the narrowest safe form first: only hoist repeated string
+ # literals inside function bodies. Module and class scopes are still left
+ # alone because new bindings there change the public surface or class
+ # namespace more directly.
+ def __init__(self, generator):
+ super().__init__()
+ self.generator = generator
+
+ def transform(self, *trees):
+ for tree in trees:
+ ParentSetter().visit(tree)
+ collector = RepeatedStringCollector()
+ collector.visit(tree)
+ RepeatedStringRewriter(self.generator, collector.repeated_strings_by_scope).visit(tree)
+ ParentSetter().visit(tree)
+ ast.fix_missing_locations(tree)
+ return trees
+
+
+class RepeatedStringCollector(ast.NodeVisitor):
+ def __init__(self):
+ self.scope_stack = []
+ self.repeated_strings_by_scope = {}
+
+ def visit_FunctionDef(self, node):
+ counts = {}
+ self.scope_stack.append(counts)
+ for statement in node.body:
+ self.visit(statement)
+ self.scope_stack.pop()
+ repeated = [
+ value
+ for value, count in counts.items()
+ if count > 1 and len(repr(value)) > 4
+ ]
+ if repeated:
+ self.repeated_strings_by_scope[id(node)] = repeated
+
+ visit_AsyncFunctionDef = visit_FunctionDef
+
+ def visit_ClassDef(self, node):
+ for statement in node.body:
+ if isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+ self.visit(statement)
+
+ def visit_Constant(self, node):
+ if not self.scope_stack or not isinstance(node.value, str):
+ return
+ if _is_unsupported_hoisted_string_context(node):
+ return
+ counts = self.scope_stack[-1]
+ counts[node.value] = counts.get(node.value, 0) + 1
+
+
+class RepeatedStringRewriter(ast.NodeTransformer):
+ def __init__(self, generator, repeated_strings_by_scope):
+ super().__init__()
+ self.generator = generator
+ self.repeated_strings_by_scope = repeated_strings_by_scope
+ self.scope_stack = []
+
+ def _prepend_assignments(self, body, mapping):
+ assignments = []
+ for value, name in mapping.items():
+ assignment = ast.Assign(
+ targets=[ast.Name(id=name, ctx=ast.Store())],
+ value=ast.Constant(value=value),
+ )
+ assignment._pymini_generated = True
+ assignments.append(assignment)
+ return assignments + body
+
+ def visit_FunctionDef(self, node):
+ mapping = {}
+ repeated = self.repeated_strings_by_scope.get(id(node), ())
+ if repeated:
+ mapping = {value: next(self.generator) for value in repeated}
+ self.scope_stack.append(mapping)
+ node.body = [self.visit(statement) for statement in node.body]
+ self.scope_stack.pop()
+ if mapping:
+ node.body = self._prepend_assignments(node.body, mapping)
+ return node
+
+ visit_AsyncFunctionDef = visit_FunctionDef
+
+ def visit_ClassDef(self, node):
+ updated_body = []
+ for statement in node.body:
+ if isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+ updated_body.append(self.visit(statement))
+ else:
+ updated_body.append(statement)
+ node.body = updated_body
+ return node
+
+ def visit_Assign(self, node):
+ if getattr(node, "_pymini_generated", False):
+ return node
+ return self.generic_visit(node)
+
+ def visit_Constant(self, node):
+ if not self.scope_stack or not isinstance(node.value, str):
+ return node
+ if _is_unsupported_hoisted_string_context(node):
+ return node
+ mapping = self.scope_stack[-1]
+ if node.value not in mapping:
+ return node
+ return ast.copy_location(ast.Name(id=mapping[node.value], ctx=ast.Load()), node)
+
+
class ImportedVariableShortener(VariableShortener):
"""Use different module shorteners to adjust variables in this module
@@ -1127,6 +1266,7 @@ def minify(sources, modules='main', keep_module_names=False,
modules=ind.modules,
keep_module_names=keep_module_names,
), # obfuscate across files
+ RepeatedStringHoister(ind.generator),
# optionally fuse files
fuser := (
diff --git a/tests/test_api.py b/tests/test_api.py
index 4e57d12..3e1ad26 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -160,6 +160,79 @@ def test_variable_name_generator_skips_python_keywords():
assert all(not keyword.iskeyword(name) for name in names)
+def test_minify_hoists_repeated_strings_inside_functions(tmp_path):
+ cleaned, modules = minify(
+ py(
+ """
+ def f():
+ return {
+ "left": "PhysicalResourceId",
+ "right": "PhysicalResourceId",
+ }
+
+ print(f()["left"], f()["right"])
+ """
+ ),
+ "main",
+ keep_global_variables=True,
+ keep_module_names=True,
+ )
+
+ tree = ast.parse(cleaned[0])
+ function = next(node for node in tree.body if isinstance(node, ast.FunctionDef))
+ helper = function.body[0]
+
+ assert isinstance(helper, ast.Assign)
+ assert isinstance(helper.value, ast.Constant)
+ assert helper.value.value == "PhysicalResourceId"
+ assert cleaned[0].count("PhysicalResourceId") == 1
+
+ module_path = tmp_path / "module.py"
+ module_path.write_text(cleaned[0], encoding="utf-8")
+ result = subprocess.run(
+ [sys.executable, str(module_path)],
+ capture_output=True,
+ text=True,
+ check=False,
+ )
+
+ assert result.returncode == 0, result.stderr
+ assert result.stdout == "PhysicalResourceId PhysicalResourceId\n"
+ assert modules == ["main"]
+
+
+def test_minify_does_not_hoist_repeated_strings_into_class_bodies(tmp_path):
+ cleaned, modules = minify(
+ py(
+ """
+ class Token:
+ left = "PhysicalResourceId"
+ right = "PhysicalResourceId"
+
+ print(Token.left, Token.right)
+ """
+ ),
+ "main",
+ keep_global_variables=True,
+ keep_module_names=True,
+ )
+
+ assert cleaned[0].count("PhysicalResourceId") == 2
+
+ module_path = tmp_path / "module.py"
+ module_path.write_text(cleaned[0], encoding="utf-8")
+ result = subprocess.run(
+ [sys.executable, str(module_path)],
+ capture_output=True,
+ text=True,
+ check=False,
+ )
+
+ assert result.returncode == 0, result.stderr
+ assert result.stdout == "PhysicalResourceId PhysicalResourceId\n"
+ assert modules == ["main"]
+
+
def test_minify_preserves_global_names_without_breaking_shadowed_locals(tmp_path):
cleaned, modules = minify(
py(
From 316aca4743f7b26c7708c45c8a357831c38c0476 Mon Sep 17 00:00:00 2001
From: Alvin Wan
Date: Sat, 4 Apr 2026 05:22:09 -0700
Subject: [PATCH 2/3] guard pattern AST nodes on Python 3.9
---
pymini/pymini.py | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/pymini/pymini.py b/pymini/pymini.py
index afd9098..e98b0b7 100644
--- a/pymini/pymini.py
+++ b/pymini/pymini.py
@@ -670,18 +670,23 @@ def transform(self, *trees):
def _is_unsupported_hoisted_string_context(node):
current = node
- pattern_nodes = (
- ast.MatchValue,
- ast.MatchSingleton,
- ast.MatchSequence,
- ast.MatchMapping,
- ast.MatchClass,
- ast.MatchAs,
- ast.MatchOr,
+ pattern_nodes = tuple(
+ node_type for node_type in (
+ getattr(ast, "MatchValue", None),
+ getattr(ast, "MatchSingleton", None),
+ getattr(ast, "MatchSequence", None),
+ getattr(ast, "MatchMapping", None),
+ getattr(ast, "MatchClass", None),
+ getattr(ast, "MatchAs", None),
+ getattr(ast, "MatchOr", None),
+ )
+ if node_type is not None
)
while hasattr(current, "parent"):
parent = current.parent
- if isinstance(parent, (ast.JoinedStr, *pattern_nodes)):
+ if isinstance(parent, ast.JoinedStr):
+ return True
+ if pattern_nodes and isinstance(parent, pattern_nodes):
return True
if isinstance(parent, ast.arg) and parent.annotation is current:
return True
From 76b25e8090292cc4fd46b6f563dc41203ff356e3 Mon Sep 17 00:00:00 2001
From: Alvin Wan
Date: Sat, 4 Apr 2026 05:25:19 -0700
Subject: [PATCH 3/3] avoid hoist helper collisions
---
pymini/pymini.py | 31 ++++++++++++++++++++++++-
tests/test_api.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+), 1 deletion(-)
diff --git a/pymini/pymini.py b/pymini/pymini.py
index e98b0b7..14da509 100644
--- a/pymini/pymini.py
+++ b/pymini/pymini.py
@@ -698,6 +698,24 @@ def _is_unsupported_hoisted_string_context(node):
return False
+def _reserved_names_in_node(node):
+ names = set()
+ for current in ast.walk(node):
+ if isinstance(current, ast.Name):
+ names.add(current.id)
+ elif isinstance(current, ast.arg):
+ names.add(current.arg)
+ elif isinstance(current, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+ names.add(current.name)
+ elif isinstance(current, ast.alias):
+ names.add(current.asname or current.name.split(".", 1)[0])
+ elif isinstance(current, (ast.Global, ast.Nonlocal)):
+ names.update(current.names)
+ elif isinstance(current, ast.ExceptHandler) and current.name:
+ names.add(current.name)
+ return names
+
+
class RepeatedStringHoister(Transformer):
# Reintroduced in the narrowest safe form first: only hoist repeated string
# literals inside function bodies. Module and class scopes are still left
@@ -760,6 +778,13 @@ def __init__(self, generator, repeated_strings_by_scope):
self.repeated_strings_by_scope = repeated_strings_by_scope
self.scope_stack = []
+ def _next_safe_name(self, reserved_names):
+ while True:
+ candidate = next(self.generator)
+ if candidate not in reserved_names:
+ reserved_names.add(candidate)
+ return candidate
+
def _prepend_assignments(self, body, mapping):
assignments = []
for value, name in mapping.items():
@@ -775,7 +800,11 @@ def visit_FunctionDef(self, node):
mapping = {}
repeated = self.repeated_strings_by_scope.get(id(node), ())
if repeated:
- mapping = {value: next(self.generator) for value in repeated}
+ reserved_names = _reserved_names_in_node(node)
+ mapping = {
+ value: self._next_safe_name(reserved_names)
+ for value in repeated
+ }
self.scope_stack.append(mapping)
node.body = [self.visit(statement) for statement in node.body]
self.scope_stack.pop()
diff --git a/tests/test_api.py b/tests/test_api.py
index 3e1ad26..e18dc8c 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -233,6 +233,65 @@ class Token:
assert modules == ["main"]
+def test_minify_hoisted_strings_do_not_collide_with_lambda_parameters(tmp_path):
+ cleaned, modules = minify(
+ py(
+ """
+ def outer():
+ return (lambda b: ("hello", "hello"))("x")
+
+ print(outer())
+ """
+ ),
+ "main",
+ keep_global_variables=True,
+ keep_module_names=True,
+ )
+
+ module_path = tmp_path / "module.py"
+ module_path.write_text(cleaned[0], encoding="utf-8")
+ result = subprocess.run(
+ [sys.executable, str(module_path)],
+ capture_output=True,
+ text=True,
+ check=False,
+ )
+
+ assert result.returncode == 0, result.stderr
+ assert result.stdout == "('hello', 'hello')\n"
+ assert modules == ["main"]
+
+
+def test_minify_hoisted_strings_do_not_conflict_with_global_declarations(tmp_path):
+ cleaned, modules = minify(
+ py(
+ """
+ def outer():
+ global b
+ return ("hello", "hello")
+
+ print(outer())
+ """
+ ),
+ "main",
+ keep_global_variables=True,
+ keep_module_names=True,
+ )
+
+ module_path = tmp_path / "module.py"
+ module_path.write_text(cleaned[0], encoding="utf-8")
+ result = subprocess.run(
+ [sys.executable, str(module_path)],
+ capture_output=True,
+ text=True,
+ check=False,
+ )
+
+ assert result.returncode == 0, result.stderr
+ assert result.stdout == "('hello', 'hello')\n"
+ assert modules == ["main"]
+
+
def test_minify_preserves_global_names_without_breaking_shadowed_locals(tmp_path):
cleaned, modules = minify(
py(