# tree_to_json.py

#!/usr/bin/env python3
"""
make_jsons.py
=============

1. From a tree-style directory listing (with summaries after an en-dash “–”)
   produce  <summaries>.json     :  { "full/file/path": "summary", ... }

2. From a “concatenated source” file that looks like
      ================================
      path/to/file.tsx
      --------------------------------
      ...file content...
   produce  <contents>.json      :  { "full/file/path": "<entire source>", ... }

3. Checks that the key-sets of both JSON files are identical and prints
   any filenames that are missing in either mapping.

---------------------------------------------------------------------------
USAGE
-----

    python make_jsons.py tree.txt bundle.txt summaries.json contents.json

where

  • tree.txt      – your original `tree` output with summaries
  • bundle.txt    – the big text file with `=== / ---` separators + file bodies
  • summaries.json, contents.json – output files

---------------------------------------------------------------------------
"""

import json
import re
import sys
from pathlib import Path

INDENT_WIDTH = 4                 # one indent level = 4 glyphs ("│   " or "    ")
# Name/summary delimiter: an en-dash preceded by whitespace and followed by
# either whitespace or end-of-line (a dangling dash means "empty summary").
EN_DASH_SPLIT = re.compile(r"\s+–(?:\s+|$)")
# Branch connector that terminates the tree-art prefix of an entry line.
_BRANCH_GLYPH = re.compile(r"[├└]──\s*")

# --------------------------------------------------------------------------- #
#  Part 1 –  Parse the `tree` listing
# --------------------------------------------------------------------------- #
def parse_tree_listing(lines):
    """
    Yield ``(depth, name, summary_or_None)`` for each non-blank line.

    Parameters
    ----------
    lines : iterable of str
        Lines of a `tree`-style listing; trailing newlines are tolerated.

    Yields
    ------
    tuple
        depth   – length of the text before the first "├──"/"└──" connector,
                  integer-divided by INDENT_WIDTH.  Lines without a connector
                  (e.g. the root line) get depth 0.
        name    – the entry name, stripped of surrounding whitespace.
        summary – text after the first whitespace-delimited en-dash, or None
                  when there is no such delimiter or the summary is empty.

    An en-dash that is *not* preceded by whitespace (e.g. "utils–old.ts") is
    part of the name and never treated as a delimiter.
    """
    for raw in lines:
        if not raw.strip():
            continue

        # Strip the "tree art" section up to the first '── '
        m = _BRANCH_GLYPH.search(raw)
        if m:
            indent_prefix = raw[:m.start()]
            content = raw[m.end():].rstrip()
        else:                       # root line without glyphs
            indent_prefix = ""
            content = raw.strip()

        depth = len(indent_prefix) // INDENT_WIDTH

        # Split <name> – <summary>.  Decide on the *result* of the split rather
        # than on the mere presence of an en-dash, so a dash embedded in a name
        # cannot break the two-way unpacking.
        parts = EN_DASH_SPLIT.split(content, maxsplit=1)
        if len(parts) == 2:
            name, summary = parts[0], parts[1].strip() or None
        else:
            name, summary = content, None

        yield depth, name.strip(), summary


def build_summary_map(tree_path: Path) -> dict:
    """
    Read the tree listing at *tree_path* and return
    ``{"dir/sub/file": "summary", ...}``.

    The path of each entry is rebuilt from a running list of ancestor names:
    an entry at depth *d* replaces everything from position *d* onward.
    Only entries that carry a non-empty summary are recorded, so plain
    directory lines contribute to paths but not to the result.
    """
    summaries = {}
    ancestors = []

    with tree_path.open(encoding="utf-8") as handle:
        for level, entry, blurb in parse_tree_listing(handle):
            # Drop siblings/descendants of the previous entry, then descend.
            del ancestors[level:]
            ancestors.append(entry)

            if not blurb:                           # directories have no summary
                continue
            summaries["/".join(ancestors)] = blurb

    return summaries


# --------------------------------------------------------------------------- #
#  Part 2 –  Parse the “bundle” file that has file bodies
# --------------------------------------------------------------------------- #
SEP_EQ = re.compile(r"^=+\s*$")        # line of only '=' chars
SEP_DASH = re.compile(r"^-{3,}\s*$")   # line of only '-' chars (3+)


def _parse_header(lines, start):
    """
    Classify the line at ``lines[start]``.

    Returns
    -------
    None
        The line is ordinary content (not an '=' rule, or an '=' rule that is
        not followed by ``<path>`` + dash rule).
    (filepath, body_start)
        A real file header starts here; the body begins at ``body_start``.
    (None, len(lines))
        A closing '=' rule followed only by blank lines up to end of file.
    """
    n = len(lines)
    if not SEP_EQ.match(lines[start]):
        return None

    # Locate the path line, tolerating blank lines after the '=' rule.
    path_idx = start + 1
    while path_idx < n and not lines[path_idx].strip():
        path_idx += 1
    if path_idx >= n:
        return None, n                  # trailing '=' rule: end marker

    path_line = lines[path_idx]
    if SEP_EQ.match(path_line) or SEP_DASH.match(path_line):
        return None                     # a rule can never be a file path

    # The path must be followed (blank lines aside) by the dash rule.
    dash_idx = path_idx + 1
    while dash_idx < n and not lines[dash_idx].strip():
        dash_idx += 1
    if dash_idx >= n or not SEP_DASH.match(lines[dash_idx]):
        return None

    return path_line.strip(), dash_idx + 1


def parse_bundle_file(bundle_path: Path) -> dict:
    """
    Return { "full/file/path": "<complete source text>", ... }.

    The expected pattern is:
        ======== (line of '=') ========
        path/to/file.ext
        --- (3 or more dashes) ---
        <zero-or-more lines of code/text>
        ========  (next file...)

    A line of '=' only counts as a boundary when it opens a complete header
    (path line followed by a dash rule, blank lines permitted in between) or
    when it is a closing rule at the very end of the bundle.  Any other line
    of '=' – a Markdown setext heading underline, a merge-conflict marker –
    is kept as part of the current file's content instead of cutting the file
    short and inventing a bogus path.

    Known ambiguity: content that itself contains ``===`` / text / ``---`` in
    sequence is indistinguishable from a header and is still split.

    Trailing newlines of each body are removed; lines that appear before the
    first header are ignored.  A later header with the same path overwrites
    the earlier entry.
    """
    mapping = {}
    lines = bundle_path.read_text(encoding="utf-8").splitlines()

    def flush(path, body):
        # Record the file collected so far (no-op before the first header).
        if path is not None:
            mapping[path] = "\n".join(body).rstrip("\n")

    current_path = None
    content_lines = []
    i = 0
    n = len(lines)
    while i < n:
        header = _parse_header(lines, i)
        if header is None:
            if current_path is not None:
                content_lines.append(lines[i])
            i += 1
            continue

        # Boundary reached: store the previous file, then start the next one
        # (or stop collecting if this was the closing rule).
        flush(current_path, content_lines)
        current_path, i = header
        content_lines = []

    flush(current_path, content_lines)
    return mapping


# --------------------------------------------------------------------------- #
#  Part 3 –  Writing JSON + consistency check
# --------------------------------------------------------------------------- #
def write_json(obj: dict, out_path: Path):
    """
    Dump *obj* to *out_path* as UTF-8 JSON (2-space indent, non-ASCII
    characters written verbatim) and print a one-line confirmation with the
    number of top-level entries.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with out_path.open("w", encoding="utf-8") as sink:
        json.dump(obj, sink, **dump_options)

    confirmation = f"✔  Wrote {len(obj):,} entries → {out_path}"
    print(confirmation)


def compare_keys(map1: dict, map2: dict):
    """
    Compare the key sets of the summaries mapping (*map1*) and the contents
    mapping (*map2*).

    Prints a success line and returns True when the key sets are equal.
    Otherwise prints, in sorted order, the keys unique to each side followed
    by a totals line, and returns False.
    """
    summary_keys = set(map1)
    content_keys = set(map2)

    if summary_keys != content_keys:
        # Sort both differences up front, then report whichever is non-empty.
        report = (
            (
                "\n⚠️  Present in summaries but missing in contents:",
                sorted(summary_keys - content_keys),
            ),
            (
                "\n⚠️  Present in contents but missing in summaries:",
                sorted(content_keys - summary_keys),
            ),
        )
        for heading, stray_keys in report:
            if not stray_keys:
                continue
            print(heading)
            for key in stray_keys:
                print("  ", key)

        print(
            f"\n✖  Mismatch – summaries: {len(summary_keys)} paths, "
            f"contents: {len(content_keys)} paths."
        )
        return False

    print("🎉  SUCCESS – both JSONs reference the exact same filenames.")
    return True


# --------------------------------------------------------------------------- #
def main():
    """
    Command-line entry point.

    Expects exactly four arguments: the tree listing, the bundle file, and
    the two JSON output paths (summaries, contents).  With any other argument
    count the usage text is passed to ``sys.exit``.  Otherwise both mappings
    are built and written, and their key sets are compared; the comparison
    result is only reported, not turned into an exit status.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 4:
        sys.exit(
            "USAGE:\n"
            "  python make_jsons.py <tree.txt> <bundle.txt> "
            "<summaries.json> <contents.json>"
        )

    tree_txt, bundle_txt, summaries_json, contents_json = (
        Path(arg) for arg in cli_args
    )

    # Step 1: summaries from the tree listing.
    print("• Building summary mapping …")
    summary_map = build_summary_map(tree_txt)
    write_json(summary_map, summaries_json)

    # Step 2: file bodies from the bundle.
    print("\n• Building contents mapping …")
    contents_map = parse_bundle_file(bundle_txt)
    write_json(contents_map, contents_json)

    # Step 3: report any paths present on one side only.
    print("\n• Comparing filename sets …")
    compare_keys(summary_map, contents_map)


if __name__ == "__main__":
    main()