aboutsummaryrefslogtreecommitdiff
path: root/tree_to_json.py
diff options
context:
space:
mode:
authorbobzel <zzzman@gmail.com>2025-07-08 20:42:30 -0400
committerbobzel <zzzman@gmail.com>2025-07-08 20:42:30 -0400
commit0a6d0bb9b9630985ffd8a4b923e31f001bb03f7c (patch)
treed3303698aae0ce68b0ba1a05a8c5bdc6a53ef5e8 /tree_to_json.py
parent87a9c8082c122ad6bc7e8c4f9d6a50bc09ae38ee (diff)
parent95c0d9b0ed3cf8bf50f3a3eac2f1dff146ba131c (diff)
Merge branch 'agent-paper-main' into lanyi-expanded-agent-paper-main
Diffstat (limited to 'tree_to_json.py')
-rw-r--r--tree_to_json.py206
1 files changed, 206 insertions, 0 deletions
diff --git a/tree_to_json.py b/tree_to_json.py
new file mode 100644
index 000000000..594296894
--- /dev/null
+++ b/tree_to_json.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""
+tree_to_json.py
+===============
+
+1. From a tree-style directory listing (with summaries after an en-dash “–”)
+ produce <summaries>.json : { "full/file/path": "summary", ... }
+
+2. From a “concatenated source” file that looks like
+ ================================
+ path/to/file.tsx
+ --------------------------------
+ ...file content...
+ produce <contents>.json : { "full/file/path": "<entire source>", ... }
+
+3. Checks that the key-sets of both JSON files are identical and prints
+ any filenames that are missing in either mapping.
+
+---------------------------------------------------------------------------
+USAGE
+-----
+
+    python tree_to_json.py tree.txt bundle.txt summaries.json contents.json
+
+where
+
+ • tree.txt – your original `tree` output with summaries
+ • bundle.txt – the big text file with `=== / ---` separators + file bodies
+ • summaries.json, contents.json – output files
+
+---------------------------------------------------------------------------
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
INDENT_WIDTH = 4                          # one indent level = 4 glyphs ("│   " or "    ")
EN_DASH_SPLIT = re.compile(r"\s+–\s+")    # space–en dash–space delimiter

# --------------------------------------------------------------------------- #
# Part 1 – Parse the `tree` listing
# --------------------------------------------------------------------------- #
def parse_tree_listing(lines):
    """Yield ``(depth, name, summary_or_None)`` for each meaningful line.

    Parameters
    ----------
    lines : iterable of str
        Raw lines of a ``tree``-style listing, where entries look like
        ``│   ├── name – summary text``.

    Yields
    ------
    tuple
        ``depth`` counts 4-glyph indent groups preceding the ``├──``/``└──``
        marker; ``name`` is the entry name; ``summary`` is the text after the
        `` – `` delimiter, or ``None`` when no such delimiter exists
        (directories, or names containing an en dash without spaces).
    """
    for raw in lines:
        if not raw.strip():
            continue  # skip blank lines entirely

        # Strip the "tree art" section up to the first '── '
        marker = re.search(r"[├└]──\s*", raw)
        if marker:
            indent_prefix = raw[:marker.start()]
            content = raw[marker.end():].rstrip()
        else:  # root line without glyphs
            indent_prefix = ""
            content = raw.strip()

        # NOTE(review): a glyph-less root line and its first-level children
        # both get depth 0, so the root directory name never prefixes the
        # paths built by callers — confirm this matches the tree files used.
        depth = len(indent_prefix) // INDENT_WIDTH

        # Split "<name> – <summary>".  Split first and inspect the result:
        # the previous `if "–" in content` guard raised ValueError for names
        # containing an en dash *without* surrounding whitespace, because the
        # regex split then returned a single element that could not be
        # unpacked into (name, summary).
        parts = EN_DASH_SPLIT.split(content, maxsplit=1)
        if len(parts) == 2:
            name, summary = parts[0], parts[1].strip()
        else:
            name, summary = content, None

        yield depth, name.strip(), summary
+
+
def build_summary_map(tree_path: Path) -> dict:
    """Map every summarised path in *tree_path* to its summary text.

    Directory entries (lines without a summary) only contribute a path
    component; they never become keys themselves.
    """
    mapping: dict = {}
    ancestors: list = []

    with tree_path.open(encoding="utf-8") as fh:
        for depth, name, summary in parse_tree_listing(fh.readlines()):
            # Truncate the ancestor chain to this entry's depth, then descend.
            del ancestors[depth:]
            ancestors.append(name)

            if summary:  # directories have no summary
                mapping["/".join(ancestors)] = summary

    return mapping
+
+
# --------------------------------------------------------------------------- #
# Part 2 – Parse the “bundle” file that has file bodies
# --------------------------------------------------------------------------- #
SEP_EQ = re.compile(r"^=+\s*$")      # line made up solely of '=' characters
SEP_DASH = re.compile(r"^-{3,}\s*$")  # line of 3+ '-' characters only

def parse_bundle_file(bundle_path: Path) -> dict:
    """Map each file path in a concatenated "bundle" file to its full text.

    The bundle is a sequence of records shaped like::

        ================================
        path/to/file.ext
        --------------------------------
        <file body, zero or more lines>

    A record's body runs up to (but excluding) the next all-'=' line;
    trailing newlines are stripped from each stored body.
    """
    result = {}
    all_lines = bundle_path.read_text(encoding="utf-8").splitlines()
    total = len(all_lines)
    pos = 0

    while pos < total:
        # 1) Advance to the next '====' header line.
        while pos < total and not SEP_EQ.match(all_lines[pos]):
            pos += 1
        if pos >= total:
            break
        pos += 1  # step over the '====' line itself

        # 2) The first non-blank line after '====' names the file.
        while pos < total and not all_lines[pos].strip():
            pos += 1
        if pos >= total:
            break
        filepath = all_lines[pos].strip()
        pos += 1

        # 3) Consume up to and including the '----' separator (if present).
        while pos < total and not SEP_DASH.match(all_lines[pos]):
            pos += 1
        if pos < total:
            pos += 1  # past the '----'

        # 4) Body = every line until the next '====' header (exclusive).
        body_start = pos
        while pos < total and not SEP_EQ.match(all_lines[pos]):
            pos += 1
        result[filepath] = "\n".join(all_lines[body_start:pos]).rstrip("\n")

    return result
+
+
# --------------------------------------------------------------------------- #
# Part 3 – Writing JSON + consistency check
# --------------------------------------------------------------------------- #
def write_json(obj: dict, out_path: Path):
    """Serialize *obj* to *out_path* as pretty-printed UTF-8 JSON and report."""
    serialized = json.dumps(obj, indent=2, ensure_ascii=False)
    out_path.write_text(serialized, encoding="utf-8")
    print(f"✔ Wrote {len(obj):,} entries → {out_path}")
+
+
def compare_keys(map1: dict, map2: dict):
    """Check that *map1* and *map2* index exactly the same filenames.

    Prints a success banner and returns True when the key sets match;
    otherwise prints each side's missing filenames plus a count summary
    and returns False.
    """
    summaries, contents = set(map1), set(map2)

    if summaries == contents:
        print("🎉 SUCCESS – both JSONs reference the exact same filenames.")
        return True

    missing_in_contents = sorted(summaries - contents)
    missing_in_summaries = sorted(contents - summaries)

    if missing_in_contents:
        print("\n⚠️  Present in summaries but missing in contents:")
        for path in missing_in_contents:
            print("  ", path)

    if missing_in_summaries:
        print("\n⚠️  Present in contents but missing in summaries:")
        for path in missing_in_summaries:
            print("  ", path)

    print(
        f"\n✖ Mismatch – summaries: {len(summaries)} paths, "
        f"contents: {len(contents)} paths."
    )
    return False
+
+
# --------------------------------------------------------------------------- #
def main():
    """CLI driver: build both JSON mappings, write them, then cross-check keys."""
    if len(sys.argv) != 5:
        sys.exit(
            "USAGE:\n"
            "    python make_jsons.py <tree.txt> <bundle.txt> "
            "<summaries.json> <contents.json>"
        )

    tree_txt = Path(sys.argv[1])
    bundle_txt = Path(sys.argv[2])
    summaries_json = Path(sys.argv[3])
    contents_json = Path(sys.argv[4])

    print("• Building summary mapping …")
    summary_map = build_summary_map(tree_txt)
    write_json(summary_map, summaries_json)

    print("\n• Building contents mapping …")
    contents_map = parse_bundle_file(bundle_txt)
    write_json(contents_map, contents_json)

    print("\n• Comparing filename sets …")
    compare_keys(summary_map, contents_map)


if __name__ == "__main__":
    main()