import os
import subprocess
import pathlib
import argparse
import tempfile
import time
from typing import List, Dict, Tuple, Optional
from openai import OpenAI
from openai import BadRequestError
client = OpenAI()
parser = argparse.ArgumentParser(
    description="Upload repo files to an OpenAI Vector Store and request a code review via File Search."
)
parser.add_argument(
    "--diff",
    action="store_true",
    help="Upload only UNSTAGED changes (working tree). Default is all tracked files."
)
parser.add_argument(
    "--prompt", type=str, default="", help="Complete override of the review prompt."
)
parser.add_argument(
    "--add",
    type=str,
    default="",
    help="Additional instructions appended to the default review prompt."
)
parser.add_argument(
    "--structure",
    choices=["repo", "selection", "none"],
    default="selection",
    help="Attach a REPO_STRUCTURE.md manifest: 'repo' = whole repo tree, "
    "'selection' = only uploaded files' tree (default), 'none' = don't attach."
)
parser.add_argument(
    "--model",
    type=str,
    default="gpt-5",
    help="Model to use (default: gpt-5). Example alternatives: gpt-4.1, gpt-4o."
)
parser.add_argument(
    "--api-key", type=str, default=None, help="Override OPENAI_API_KEY for this run."
)
parser.add_argument(
    "--root",
    action="append",
    default=None,
    metavar="DIRS",
    help="Override allowed top-level root directories. "
    "Repeat this flag or provide a comma-separated list. "
    "Defaults to: core, models, utils"
)
parser.add_argument(
    "--single",
    action="append",
    default=None,
    metavar="FILES",
    help="Override allowed single files (repo-relative paths). "
    "Repeat this flag or provide a comma-separated list. "
    "Default: 9001.py"
)
args = parser.parse_args()
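# Illustrative invocations (the script name "review.py" is an assumption, not part of the gist):
#   python review.py                     # review all tracked files with gpt-5
#   python review.py --diff              # review only unstaged working-tree changes
#   python review.py --root core --root "models,utils" --single 9001.py
#   python review.py --add "Pay extra attention to the loss terms."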
if args.api_key:
    client = OpenAI(api_key=args.api_key)
repo_root = pathlib.Path.cwd()
def run(cmd: List[str]) -> str:
    return subprocess.check_output(cmd, text=True, cwd=repo_root).strip()

def list_tracked() -> List[str]:
    out = run(["git", "ls-files"])
    return [p for p in out.splitlines() if p.strip()]
def list_unstaged_only() -> List[str]:
    modified = run(["git", "diff", "--name-only"]).splitlines()
    return sorted(set([p for p in modified if p.strip()]))
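# Note: `git diff --name-only` compares the working tree to the index, so changes that
# are already staged (but uncommitted) are excluded here, matching the --diff help text.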
def git_status_map() -> Dict[str, str]:
    """
    Map path -> two-letter porcelain code (e.g., ' M', '??', 'A ', 'M ').
    """
    out = run(["git", "status", "--porcelain"])
    status = {}
    for line in out.splitlines():
        if not line.strip():
            continue
        code = line[:2]
        path = line[3:]
        status[path] = code
    return status
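# For reference, `git status --porcelain` emits one line per path (paths illustrative):
#   " M core/loss.py"  -> code ' M' (modified, unstaged)
#   "?? models/new.py" -> code '??' (untracked)
# The two-letter code occupies columns 1-2 and the path starts at column 4.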
candidates = list_unstaged_only() if args.diff else list_tracked()
ALLOWED_EXTS = {
    "c", "cpp", "css", "csv", "doc", "docx", "gif", "go", "html", "java", "jpeg", "jpg", "js",
    "json", "md", "pdf", "php", "pkl", "png", "pptx", "py", "rb", "tar", "tex", "ts", "txt", "webp",
    "xlsx", "xml", "zip"
}
_DEFAULT_ALLOWED_ROOTS = {"core", "models", "utils"}
_DEFAULT_ALLOWED_SINGLE_FILES = {"9001.py"}
def _collect_overrides(values: Optional[List[str]]) -> Optional[set]:
    if not values:
        return None
    items: set = set()
    for v in values:
        for piece in v.split(","):
            piece = piece.strip()
            if piece:
                items.add(piece)
    return items if items else None
_root_override = _collect_overrides(args.root)
_single_override = _collect_overrides(args.single)
ALLOWED_ROOTS = _root_override if _root_override is not None else _DEFAULT_ALLOWED_ROOTS
ALLOWED_SINGLE_FILES = _single_override if _single_override is not None else _DEFAULT_ALLOWED_SINGLE_FILES
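# Override values accumulate across repeated flags and split on commas, so these are
# equivalent and both yield ALLOWED_ROOTS == {"core", "models", "utils"}:
#   --root core --root models --root utils
#   --root "core,models,utils"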
def allowed_scope(p: str) -> bool:
    parts = pathlib.PurePosixPath(p).parts
    if not parts:
        return False
    top = parts[0]
    if p in ALLOWED_SINGLE_FILES:
        return True
    return top in ALLOWED_ROOTS
def keep(p: str) -> bool:
    path = (repo_root / p)
    if not path.exists() or not path.is_file():
        return False
    ext = path.suffix.lower().lstrip(".")
    if ext not in ALLOWED_EXTS:
        return False
    try:
        size = path.stat().st_size
    except OSError:
        return False
    if size <= 0:  # skip zero-byte files (prevents OpenAI 400 "File is empty.")
        return False
    return size < 2_000_000  # 2MB cap for review inputs
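# A path is uploaded only when allowed_scope() and keep() both pass: e.g. a tracked
# "core/loss.py" under 2MB is kept, while "docs/readme.md" (outside the allowed roots)
# or any zero-byte file is dropped. Paths here are illustrative, not from the gist.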
files = [f for f in candidates if allowed_scope(f) and keep(f)]
empty_skipped = []
for p in candidates:
    path = repo_root / p
    if allowed_scope(p) and path.exists() and path.is_file():
        try:
            if path.stat().st_size == 0:
                empty_skipped.append(p)
        except OSError:
            pass
if empty_skipped:
    print("[info] Skipped zero-byte files:", ", ".join(sorted(empty_skipped)))
if not files:
    raise SystemExit("No matching files to upload (after filtering).")
def build_tree(paths: List[str]) -> str:
    tree = {}
    for p in sorted(paths):
        parts = p.split("/")
        node = tree
        for i, part in enumerate(parts):
            if i == len(parts) - 1:
                node.setdefault("__files__", []).append(part)
            else:
                node = node.setdefault(part, {})
    lines = []
    def walk(node, prefix=""):
        dirs = sorted([k for k in node.keys() if k != "__files__"])
        files_ = sorted(node.get("__files__", []))
        for i, d in enumerate(dirs):
            is_last = (i == len(dirs) - 1) and not files_
            lines.append(f"{prefix}{'└── ' if is_last else '├── '}{d}/")
            walk(node[d], prefix + ("    " if is_last else "│   "))
        for j, f in enumerate(files_):
            is_last = (j == len(files_) - 1)
            lines.append(f"{prefix}{'└── ' if is_last else '├── '}{f}")
    walk(tree)
    return "\n".join(lines)
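# Example rendering for hypothetical paths ["core/head.py", "core/loss.py", "9001.py"]:
#   ├── core/
#   │   ├── head.py
#   │   └── loss.py
#   └── 9001.py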
def format_size(n: int) -> str:
    for unit in ["B", "KB", "MB", "GB"]:
        if n < 1024 or unit == "GB":
            return f"{n:.0f}{unit}"
        n /= 1024.0
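# Rounding sketch: format_size(512) -> "512B", format_size(1536) -> "2KB"
# (1.5 formats as 2 via :.0f), format_size(2_000_000) -> "2MB".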
def write_manifest(scope: str):
    if scope == "none":
        return None
    status = git_status_map()
    scope_paths = list_tracked() if scope == "repo" else files
    stamped = time.strftime("%Y-%m-%d %H:%M:%S")
    sizes = {}
    for p in scope_paths:
        path = repo_root / p
        if path.exists() and path.is_file():
            try:
                sizes[p] = format_size(path.stat().st_size)
            except OSError:
                sizes[p] = "?"
    tree_txt = build_tree(scope_paths)
    details = []
    for p in sorted(scope_paths):
        flag = status.get(p, "  ")
        sz = sizes.get(p, "?")
        details.append(f"- `{p}` · **{sz}** · status: `{flag}`")
    details_block = "\n".join(details)
    md = f"""# Repository Structure
**Root:** `{repo_root.name}`
**Generated:** {stamped}
**Upload mode:** {"unstaged only" if args.diff else "all tracked"}
**Structure scope:** {scope}
```
{tree_txt}
```
{details_block}
"""
    tmp = tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".md", prefix="REPO_STRUCTURE_"
    )
    tmp.write(md)
    tmp.flush()
    tmp.close()
    return tmp.name
manifest_path = write_manifest(args.structure)
vs_name = f"{repo_root.name}-review-{int(time.time())}"
vector_store = client.vector_stores.create(
    name=vs_name,
    metadata={"project": "repo-review", "repo": repo_root.name},
    expires_after={"anchor": "last_active_at", "days": 1},
)
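# expires_after anchored to last_active_at means the store is cleaned up a day after
# it was last used, so repeated review runs shouldn't accumulate stored files.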
file_paths = [str(repo_root / p) for p in files]
if manifest_path:
    file_paths.append(manifest_path)
file_streams = [open(p, "rb") for p in file_paths]
try:
    batch = client.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id,
        files=file_streams,
    )
finally:
    for fh in file_streams:
        try:
            fh.close()
        except Exception:
            pass
    if manifest_path:
        try:
            os.unlink(manifest_path)
        except Exception:
            pass
print(f"Vector store: {vector_store.id} (name='{vs_name}')")
print(
    f"Batch status: {getattr(batch, 'status', 'unknown')}, counts: {getattr(batch, 'file_counts', None)}"
)
failed: List[Tuple[str, Optional[str]]] = []
files_list = client.vector_stores.files.list(vector_store_id=vector_store.id)
for f in files_list.data:
    st = getattr(f, "status", None)
    if st == "failed":
        failed.append((f.id, getattr(f, "last_error", None)))
if failed:
    print("\n[warn] Some files failed to ingest:")
    for fid, err in failed:
        print(f"  - {fid}: {err}")
    print("[warn] Continuing anyway; the model will retrieve from successfully ingested files.")
if args.prompt:
    base_prompt = args.prompt
else:
    base_prompt = (
        "Don't reference any prior conversations, files, or contexts. Only look at what is provided in this chat. "
        "Act as a world-class ML researcher. This is a YOLO variant. Starting with high-priority needs, give me actionable items, if any, for this code base. At this time identify only critical bugs in mathematical functions. Don't focus on new features, improvements, or flexibility. "
        "Create a detailed plan for remediation. When providing code, try to limit snippets to 30 lines per actionable fix. Abridge and generalize the examples where needed to fit this limit. Be explicit about what needs to be changed. "
        "This repo does not use warmup. The optimizer is RLION, which does not use warmup; don't suggest warmup. You get stuck on the grayscale issue and loop suggestions, so ignore the grayscale bug. "
        "A manifest file describing the directory tree is included; use it to reference exact paths when helpful. "
        "Really focus on interactions across files, such as the detect head, DFL, aligner, compute loss, dataloaders, IoU, and various box ops. Make sure to follow logic structures through to completion. "
        "For any change you propose, check what that change impacts and what relies on it, recommend changes to subsequent systems as well, and again from there."
    )
if args.add:
    base_prompt += "\n\nAdditional instructions:\n" + args.add
content_items = [{"type": "input_text", "text": base_prompt}]
def create_response(model_id: str):
    kwargs = {
        "model": model_id,
        "input": [{
            "role": "user",
            "content": content_items,
        }],
        "tools": [{
            "type": "file_search",
            "vector_store_ids": [vector_store.id],
        }],
    }
    return client.responses.create(**kwargs)
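# With the file_search tool attached, the model retrieves relevant chunks from the
# vector store at inference time; whole files are not stuffed into the context window.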
try:
    resp = create_response(args.model)
except BadRequestError as e:
    msg = str(e)
    if "model_not_found" in msg or "does not exist" in msg:
        print(
            f"[warn] Model '{args.model}' not available on this account. Falling back to 'gpt-4.1'."
        )
        resp = create_response("gpt-4.1")
    else:
        raise
out = getattr(resp, "output_text", None)
if out is None:
    try:
        out = resp.output[0].content[0].text
    except Exception:
        out = str(resp)
print(out)