aboutsummaryrefslogtreecommitdiff
path: root/summarize_dash_ts.py
diff options
context:
space:
mode:
authorA.J. Shulman <Shulman.aj@gmail.com>2025-06-11 13:22:58 -0400
committerA.J. Shulman <Shulman.aj@gmail.com>2025-06-11 13:22:58 -0400
commitccfdf905400cd4b81d8cde0f16bb0e15cd65621b (patch)
treeea32d8a59df4f3875d71d4e10f91b867132f4229 /summarize_dash_ts.py
parent656dbe6dc64013215eb312173df398fe4606d788 (diff)
improved agent tool generation
Diffstat (limited to 'summarize_dash_ts.py')
-rw-r--r--summarize_dash_ts.py248
1 files changed, 248 insertions, 0 deletions
diff --git a/summarize_dash_ts.py b/summarize_dash_ts.py
new file mode 100644
index 000000000..69f80fde5
--- /dev/null
+++ b/summarize_dash_ts.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+summarize_dash_ts.py – v4 (periodic-save edition)
+
+• Dumps every .ts/.tsx file (skipping node_modules, etc.)
+• Calls GPT-4o with Structured Outputs (JSON-schema “const” on filename)
+• Prints each raw JSON reply (unless --quiet)
+• Flushes the growing summary file to disk every N files (default 10)
+
+pip install openai tqdm rich
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import pathlib
+import sys
+from textwrap import dedent
+from typing import Dict, Iterable, List
+
+import openai
+from rich.console import Console
+from rich.tree import Tree
+from tqdm import tqdm
+
+PERIODIC_SAVE_EVERY = 10 # ← change here if you want finer or coarser saves
+
+
+# ───────────────────────── CLI ──────────────────────────
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface for the summarizer."""
    parser = argparse.ArgumentParser(prog="summarize_dash_ts.py")
    # Note: argparse passes a *string* default through `type`, so "." becomes Path(".").
    parser.add_argument("-r", "--root", type=pathlib.Path, default=".", help="Repo root")
    parser.add_argument("--model", default="gpt-4o-2024-08-06")
    parser.add_argument("--api-key", help="OpenAI API key (else env var)")
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument(
        "--skip-dirs",
        nargs="*",
        default=["node_modules", ".git", "dist", "build", ".next"],
    )
    parser.add_argument(
        "--preview", type=int, default=5, help="How many summaries to echo at the end"
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress the per-file raw JSON spam once you trust the run",
    )
    return parser.parse_args()
+
+
+# ────────────────── helpers ──────────────────
def iter_ts(root: pathlib.Path, skip: List[str]) -> Iterable[pathlib.Path]:
    """Yield every ``.ts``/``.tsx`` file under *root*, pruning directories named in *skip*."""
    for dirpath, subdirs, filenames in os.walk(root):
        # Mutate the list in place so os.walk never descends into skipped dirs.
        subdirs[:] = [name for name in subdirs if name not in skip]
        base = pathlib.Path(dirpath)
        for filename in filenames:
            if filename.endswith((".ts", ".tsx")):
                yield base / filename
+
+
def safe_open(p: pathlib.Path):
    """Open *p* as UTF-8 text, substituting U+FFFD for undecodable bytes.

    BUG FIX: the previous ``try/except UnicodeDecodeError`` around ``open``
    was dead code — ``open`` only creates the file object and decodes
    nothing, so the error surfaced later at ``.read()`` in the caller,
    where nothing caught it and the whole run crashed on any non-UTF-8
    file.  Decoding with ``errors="replace"`` from the start is
    byte-identical for valid UTF-8 and merely lossy (not fatal) otherwise.
    """
    return p.open(encoding="utf-8", errors="replace")
+
+
def make_tree(paths: list[pathlib.Path], root: pathlib.Path) -> Tree:
    """Build a rich ``Tree`` mirroring the directory layout of *paths* under *root*."""
    tree = Tree(str(root))
    # Map every path we have already attached to its Tree node.
    known: dict[pathlib.Path, Tree] = {root: tree}
    for path in sorted(paths):
        ancestor = root
        for part in path.relative_to(root).parts:
            ancestor = ancestor / part
            if ancestor not in known:
                # The parent node always exists: parts are visited
                # outermost-first, so it was registered on a prior step.
                known[ancestor] = known[ancestor.parent].add(part)
    return tree
+
+
def write_tree_with_summaries(*, tree: Tree, summaries: dict[pathlib.Path, str],
                              root: pathlib.Path, out_path: pathlib.Path) -> None:
    """Render *tree* as ASCII art to *out_path*, appending each file's summary.

    Nodes whose absolute path (``root / relative-path``) appears in
    *summaries* get " – <summary>" after their label; others are written
    bare.  The file is written to a ``.tmp`` sibling first and atomically
    swapped in, so a crash mid-write never leaves a truncated output file.
    """
    tmp = out_path.with_suffix(".tmp")
    with tmp.open("w", encoding="utf-8") as f:

        def walk(node: Tree, rel_path: pathlib.Path = pathlib.Path("."), indent: str = ""):
            # Depth-first walk; `indent` accumulates the box-drawing prefix
            # for this depth, and `rel_path` the path relative to `root`.
            last = node.children[-1] if node.children else None
            for child in node.children:
                marker = "└── " if child is last else "├── "
                new_indent = indent + ("    " if child is last else "│   ")
                child_rel = rel_path / child.label  # ← **the missing bit**

                # absolute path used as dict-key during summarization loop
                abs_path = root / child_rel
                if abs_path in summaries:
                    f.write(f"{indent}{marker}{child.label} – {summaries[abs_path]}\n")
                else:
                    f.write(f"{indent}{marker}{child.label}\n")

                walk(child, child_rel, new_indent)

        walk(tree)
    tmp.replace(out_path)
+
+
+# ────────────────── prompt bits ──────────────────
# ────────────────── prompt bits ──────────────────
# System prompt sent with every request; demands JSON with exactly the
# "filename"/"summary" keys so the Structured Outputs schema below can pin it.
SYSTEM = """
You are an expert TypeScript code summarizer for the Dash hypermedia code-base.

You will be given ONE complete file and its **exact** relative path.

Return ONLY JSON matching this shape:

{
  "filename": "<EXACT path you were given>",
  "summary": "<3–5 sentences, <80 words>"
}

No markdown, no extra keys.
""".strip()

# One-paragraph project context prepended to every user message so the model
# knows what "Dash" is without seeing the rest of the repository.
OVERVIEW = dedent(
    """
    Dash is a browser-based hypermedia system from Brown University that lets users
    mix PDFs, web pages, audio, video, ink and rich-text on a free-form canvas,
    create Vannevar-Bush-style “trails”, and tag/spatially arrange docs for
    nonlinear workflows. 99 % of the code-base is TypeScript/React.
    """
).strip()

# JSON-schema skeleton for Structured Outputs.  ask_llm() copies this and
# overrides "filename" with a per-file `const` so the model cannot drift.
SCHEMA_BASE = {
    "type": "object",
    "properties": {
        "filename": {"type": "string"},
        "summary": {"type": "string"},
    },
    "required": ["filename", "summary"],
    "additionalProperties": False,
}
+
+
def ask_llm(
    client: openai.OpenAI,
    model: str,
    rel_path: str,
    code: str,
    max_tokens: int,
    verbose: bool = True,
) -> str:
    """Summarize one file via GPT Structured Outputs and return the summary text.

    The JSON schema pins ``filename`` to *rel_path* with a ``const`` so the
    model cannot invent a different path; a mismatch is still checked and
    warned about defensively.
    """
    # Clone the base schema, replacing the free-form "filename" property
    # with a const pinned to this exact relative path.
    pinned_props = dict(SCHEMA_BASE["properties"])
    pinned_props["filename"] = {"type": "string", "const": rel_path}
    schema = {
        "name": "dash_file_summary",
        "strict": True,
        "schema": dict(SCHEMA_BASE, properties=pinned_props),
    }

    user_content = (
        f"{OVERVIEW}\n\n(PATH = {rel_path})\n\n"
        f"===== BEGIN FILE =====\n{code}\n===== END FILE ====="
    )

    comp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": user_content},
        ],
        response_format={"type": "json_schema", "json_schema": schema},
        max_tokens=max_tokens,
    )

    raw = comp.choices[0].message.content
    if verbose:
        print(f"\n📝 Raw JSON for {rel_path}:\n{raw}\n")

    data = json.loads(raw)
    if data["filename"] != rel_path:
        # Should be impossible under strict schema; warn and correct anyway.
        Console().print(
            f"[red]⚠︎ Filename mismatch – model said {data['filename']!r}[/red]"
        )
        data["filename"] = rel_path
    return data["summary"].strip()
+
+
+# ────────────────── main ──────────────────
def main() -> None:
    """Scan the repo for TS/TSX files, dump their contents, then summarize each with GPT.

    Writes two files under the repo root:
    * ``ts_files_with_content.txt``  – rendered tree + full source dump
    * ``ts_files_with_summaries.txt`` – tree annotated with per-file summaries,
      flushed every PERIODIC_SAVE_EVERY files so a crash loses little work.
    """
    args = parse_args()
    api_key = args.api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        sys.exit("Need OPENAI_API_KEY")

    root = args.root.resolve()
    con = Console()
    con.print(f":mag: [bold]Scanning[/bold] {root}")

    files = list(iter_ts(root, args.skip_dirs))
    if not files:
        con.print("[yellow]No TS/TSX files found[/yellow]")
        return

    # 1. full dump of file contents
    tree = make_tree(files, root)
    # BUG FIX: Console.print() returns None, so the old
    # `write_text(Console(record=True, ...).print(tree, end="") or "")`
    # always wrote an empty string and the rendered tree was lost.
    # Record the render, then export it explicitly.
    recorder = Console(record=True, width=120)
    recorder.print(tree)
    dump_path = root / "ts_files_with_content.txt"
    dump_path.write_text(recorder.export_text(), encoding="utf-8")
    with dump_path.open("a", encoding="utf-8") as fp:
        for p in tqdm(files, desc="Dumping source"):
            fp.write(f"{p.relative_to(root)}\n{'-'*80}\n")
            fp.write(safe_open(p).read())
            fp.write(f"\n{'='*80}\n\n")

    # 2. summaries (periodic save)
    # BUG FIX: the v1 SDK's OpenAI() client does not read the module-level
    # `openai.api_key` attribute — pass the resolved key explicitly so the
    # --api-key flag actually takes effect.
    client = openai.OpenAI(api_key=api_key)
    summaries: Dict[pathlib.Path, str] = {}
    out_file = root / "ts_files_with_summaries.txt"

    for idx, p in enumerate(tqdm(files, desc="GPT-4o summarizing"), 1):
        summaries[p] = ask_llm(
            client,
            args.model,
            str(p.relative_to(root)),
            safe_open(p).read(),
            args.max_tokens,
            verbose=not args.quiet,
        )

        # Flush partial results so an interruption loses at most
        # PERIODIC_SAVE_EVERY summaries.
        if idx % PERIODIC_SAVE_EVERY == 0:
            write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file)
            con.print(f"[green]✔ Flushed after {idx} files[/green]")

    # final flush
    write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file)

    # preview
    con.print("\n[cyan]Sample summaries:[/cyan]")
    for i, (p, s) in enumerate(list(summaries.items())[: args.preview], 1):
        con.print(f"{i}. {p.relative_to(root)} → {s}")

    con.print(f":sparkles: Done – wrote [bold]{out_file}[/bold]")


if __name__ == "__main__":
    main()