author      bobzel <zzzman@gmail.com>    2025-07-08 20:42:30 -0400
committer   bobzel <zzzman@gmail.com>    2025-07-08 20:42:30 -0400
commit      0a6d0bb9b9630985ffd8a4b923e31f001bb03f7c (patch)
tree        d3303698aae0ce68b0ba1a05a8c5bdc6a53ef5e8 /tree_to_json.py
parent      87a9c8082c122ad6bc7e8c4f9d6a50bc09ae38ee (diff)
parent      95c0d9b0ed3cf8bf50f3a3eac2f1dff146ba131c (diff)
Merge branch 'agent-paper-main' into lanyi-expanded-agent-paper-main
Diffstat (limited to 'tree_to_json.py')
-rw-r--r--   tree_to_json.py   206
1 file changed, 206 insertions, 0 deletions
diff --git a/tree_to_json.py b/tree_to_json.py
new file mode 100644
index 000000000..594296894
--- /dev/null
+++ b/tree_to_json.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+make_jsons.py
+=============
+
+1. From a tree-style directory listing (with summaries after an en-dash “–”)
+   produce <summaries>.json : { "full/file/path": "summary", ... }
+
+2. From a “concatenated source” file that looks like
+       ================================
+       path/to/file.tsx
+       --------------------------------
+       ...file content...
+   produce <contents>.json : { "full/file/path": "<entire source>", ... }
+
+3. Checks that the key-sets of both JSON files are identical and prints
+   any filenames that are missing in either mapping.
+
+---------------------------------------------------------------------------
+USAGE
+-----
+
+    python make_jsons.py tree.txt bundle.txt summaries.json contents.json
+
+where
+
+    • tree.txt       – your original `tree` output with summaries
+    • bundle.txt     – the big text file with `=== / ---` separators + file bodies
+    • summaries.json, contents.json – output files
+
+---------------------------------------------------------------------------
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+INDENT_WIDTH = 4                        # one indent level = 4 glyphs ("│   " or "    ")
+EN_DASH_SPLIT = re.compile(r"\s+–\s+")  # space–space delimiter
+
+# --------------------------------------------------------------------------- #
+# Part 1 – Parse the `tree` listing
+# --------------------------------------------------------------------------- #
+def parse_tree_listing(lines):
+    """Yield (depth, name, summary_or_None) for each meaningful line."""
+    for raw in lines:
+        if not raw.strip():
+            continue
+
+        # Strip the "tree art" section up to the first '── '
+        m = re.search(r"[├└]──\s*", raw)
+        if m:
+            indent_prefix = raw[:m.start()]
+            content = raw[m.end():].rstrip()
+        else:                           # root line without glyphs
+            indent_prefix = ""
+            content = raw.strip()
+
+        depth = len(indent_prefix) // INDENT_WIDTH
+
+        # Split <name> – <summary>
+        if "–" in content:
+            name, summary = EN_DASH_SPLIT.split(content, maxsplit=1)
+            summary = summary.strip()
+        else:
+            name, summary = content, None
+
+        yield depth, name.strip(), summary
+
+
+def build_summary_map(tree_path: Path) -> dict:
+    with tree_path.open(encoding="utf-8") as fh:
+        lines = fh.readlines()
+
+    stack, mapping = [], {}
+    for depth, name, summary in parse_tree_listing(lines):
+        stack = stack[:depth]
+        stack.append(name)
+
+        if summary:                     # directories have no summary
+            full_path = "/".join(stack)
+            mapping[full_path] = summary
+
+    return mapping
+
+
+# --------------------------------------------------------------------------- #
+# Part 2 – Parse the “bundle” file that has file bodies
+# --------------------------------------------------------------------------- #
+SEP_EQ = re.compile(r"^=+\s*$")         # line of only '=' chars
+SEP_DASH = re.compile(r"^-{3,}\s*$")    # line of only '-' chars (3+)
+
+def parse_bundle_file(bundle_path: Path) -> dict:
+    """
+    Return { "full/file/path": "<complete source text>", ... }.
+
+    The expected pattern is:
+        ======== (80 × '=') ========
+        path/to/file.ext
+        --- (dashes) ---
+        <zero-or-more lines of code/text>
+        ======== (next file...)
+
+    Everything up to (but **excluding**) the next line of '=' is considered
+    file content.
+    """
+    mapping = {}
+    lines = bundle_path.read_text(encoding="utf-8").splitlines()
+
+    i = 0
+    n = len(lines)
+    while i < n:
+        # 1) Find next "===="
+        while i < n and not SEP_EQ.match(lines[i]):
+            i += 1
+        if i >= n:
+            break
+        i += 1                          # move past the "====" line
+
+        # 2) Skip blank lines, then grab the filepath line
+        while i < n and not lines[i].strip():
+            i += 1
+        if i >= n:
+            break
+        filepath = lines[i].strip()
+        i += 1
+
+        # 3) Skip the '----' separator
+        while i < n and not SEP_DASH.match(lines[i]):
+            i += 1
+        if i < n:
+            i += 1                      # past the '----'
+
+        # 4) Gather content until next '===='
+        content_lines = []
+        while i < n and not SEP_EQ.match(lines[i]):
+            content_lines.append(lines[i])
+            i += 1
+
+        mapping[filepath] = "\n".join(content_lines).rstrip("\n")
+
+    return mapping
+
+
+# --------------------------------------------------------------------------- #
+# Part 3 – Writing JSON + consistency check
+# --------------------------------------------------------------------------- #
+def write_json(obj: dict, out_path: Path):
+    with out_path.open("w", encoding="utf-8") as fh:
+        json.dump(obj, fh, indent=2, ensure_ascii=False)
+    print(f"✔ Wrote {len(obj):,} entries → {out_path}")
+
+
+def compare_keys(map1: dict, map2: dict):
+    keys1, keys2 = set(map1), set(map2)
+
+    if keys1 == keys2:
+        print("🎉 SUCCESS – both JSONs reference the exact same filenames.")
+        return True
+
+    only_in_1 = sorted(keys1 - keys2)
+    only_in_2 = sorted(keys2 - keys1)
+
+    if only_in_1:
+        print("\n⚠️  Present in summaries but missing in contents:")
+        for k in only_in_1:
+            print("   ", k)
+
+    if only_in_2:
+        print("\n⚠️  Present in contents but missing in summaries:")
+        for k in only_in_2:
+            print("   ", k)
+
+    print(
+        f"\n✖ Mismatch – summaries: {len(keys1)} paths, "
+        f"contents: {len(keys2)} paths."
+    )
+    return False
+
+
+# --------------------------------------------------------------------------- #
+def main():
+    if len(sys.argv) != 5:
+        sys.exit(
+            "USAGE:\n"
+            "  python make_jsons.py <tree.txt> <bundle.txt> "
+            "<summaries.json> <contents.json>"
+        )
+
+    tree_txt, bundle_txt, summaries_json, contents_json = map(Path, sys.argv[1:])
+
+    print("• Building summary mapping …")
+    summary_map = build_summary_map(tree_txt)
+    write_json(summary_map, summaries_json)
+
+    print("\n• Building contents mapping …")
+    contents_map = parse_bundle_file(bundle_txt)
+    write_json(contents_map, contents_json)
+
+    print("\n• Comparing filename sets …")
+    compare_keys(summary_map, contents_map)
+
+
+if __name__ == "__main__":
+    main()