Skip to content

Instantly share code, notes, and snippets.

@Klaudioz
Created June 21, 2025 22:12
Show Gist options
  • Select an option

  • Save Klaudioz/aa0599ebebbe3a1c04171fb6a43c750d to your computer and use it in GitHub Desktop.

Select an option

Save Klaudioz/aa0599ebebbe3a1c04171fb6a43c750d to your computer and use it in GitHub Desktop.
Standalone Python script that replicates the core functionality of the gitingest tool.
#!/usr/bin/env python3
"""
gitingest.py - Turn any Git repository into prompt-friendly text and copy it to the clipboard.

Usage:
    python gitingest.py <URL/LOCAL_FOLDER> [options]

Options:
    --exclude PATTERN   Pattern to exclude (can be used multiple times)
    --max-size SIZE     Include files up to SIZE kb (default: 50kb)

Examples:
    python gitingest.py https://github.com/user/repo
    python gitingest.py ./local-folder --exclude "*.log" --exclude "tests/*"
    python gitingest.py https://github.com/user/repo --max-size 100
"""
import os
import sys
import platform
import subprocess
import tempfile
import shutil
import argparse
from pathlib import Path
from typing import Set, Tuple, Optional
import re
from fnmatch import fnmatch
import uuid
# Default ignore patterns (from gitingest)
# Glob patterns for paths left out of the digest. should_exclude() compares
# each pattern with fnmatch against a path taken relative to the directory
# being processed. main() works on a copy of this set and adds any --exclude
# patterns to that copy, so this module-level set is not modified by a run.
# Entries are grouped by ecosystem for readability only; because this is a
# set, ordering and repeated entries have no effect.
DEFAULT_IGNORE_PATTERNS = {
# Python
"*.pyc", "*.pyo", "*.pyd", "__pycache__", ".pytest_cache", ".coverage",
".tox", ".nox", ".mypy_cache", ".ruff_cache", ".hypothesis", "poetry.lock",
"Pipfile.lock",
# JavaScript/Node
"node_modules", "bower_components", "package-lock.json", "yarn.lock",
".npm", ".yarn", ".pnpm-store", "bun.lock", "bun.lockb",
# Java
"*.class", "*.jar", "*.war", "*.ear", "*.nar", ".gradle/", "build/",
".settings/", ".classpath", "gradle-app.setting", "*.gradle",
# IDEs and editors / Java
".project",
# C/C++
"*.o", "*.obj", "*.dll", "*.dylib", "*.exe", "*.lib", "*.out", "*.a", "*.pdb",
# Swift/Xcode
".build/", "*.xcodeproj/", "*.xcworkspace/", "*.pbxuser", "*.mode1v3",
"*.mode2v3", "*.perspectivev3", "*.xcuserstate", "xcuserdata/", ".swiftpm/",
# Ruby
"*.gem", ".bundle/", "vendor/bundle", "Gemfile.lock", ".ruby-version",
".ruby-gemset", ".rvmrc",
# Rust
"Cargo.lock", "**/*.rs.bk",
# Java / Rust
"target/",
# Go
"pkg/",
# .NET/C#
"obj/", "*.suo", "*.user", "*.userosscache", "*.sln.docstates", "packages/",
"*.nupkg",
# Go / .NET / C#
"bin/",
# Version control
".git", ".svn", ".hg", ".gitignore", ".gitattributes", ".gitmodules",
# Images and media
"*.svg", "*.png", "*.jpg", "*.jpeg", "*.gif", "*.ico", "*.pdf", "*.mov",
"*.mp4", "*.mp3", "*.wav",
# Virtual environments
"venv", ".venv", "env", ".env", "virtualenv",
# IDEs and editors
".idea", ".vscode", ".vs", "*.swo", "*.swn", ".settings", "*.sublime-*",
# Temporary and cache files
"*.log", "*.bak", "*.swp", "*.tmp", "*.temp", ".cache", ".sass-cache",
".eslintcache", ".DS_Store", "Thumbs.db", "desktop.ini",
# Build directories and artifacts
"build", "dist", "target", "out", "*.egg-info", "*.egg", "*.whl", "*.so",
# Documentation
"site-packages", ".docusaurus", ".next", ".nuxt",
# Other common patterns
"*.min.js", "*.min.css", "*.map", ".terraform", "*.tfstate*", "vendor/",
# Gitingest
"digest.txt",
}
# Horizontal rule (48 "=" characters) that process_directory() emits on the
# line before and the line after every "FILE: <path>" header in the dump.
SEPARATOR = "=" * 48
def _pipe_to_command(command, payload: bytes) -> bool:
    """Run ``command``, feed ``payload`` to its stdin, and report exit status 0."""
    process = subprocess.Popen(command, stdin=subprocess.PIPE)
    process.communicate(payload)
    return process.returncode == 0


def copy_to_clipboard(text: str) -> bool:
    """Copy ``text`` to the system clipboard using the platform's CLI tool.

    Uses ``pbcopy`` on macOS, ``clip`` on Windows, and ``xclip`` (falling
    back to ``xsel``) everywhere else.

    Args:
        text: The text to place on the clipboard.

    Returns:
        True if the clipboard tool ran and exited with status 0. False if no
        tool was available or it failed; a diagnostic is printed in that case.
    """
    system = platform.system()
    try:
        if system == "Darwin":  # macOS
            return _pipe_to_command(['pbcopy'], text.encode('utf-8'))
        if system == "Windows":
            # clip.exe interprets piped bytes as UTF-16LE (or the console
            # code page), so UTF-8 input corrupts every non-ASCII character.
            # It is a real executable on PATH, so no shell is needed.
            return _pipe_to_command(['clip'], text.encode('utf-16-le'))

        # Linux/Unix: prefer xclip, fall back to xsel when it is missing.
        payload = text.encode('utf-8')
        for command in (
            ['xclip', '-selection', 'clipboard'],
            ['xsel', '--clipboard', '--input'],
        ):
            try:
                return _pipe_to_command(command, payload)
            except FileNotFoundError:
                continue
        print("Error: Neither xclip nor xsel found. Please install one of them:")
        print(" sudo apt-get install xclip # or")
        print(" sudo apt-get install xsel")
        return False
    except Exception as e:
        # Best-effort boundary: the caller falls back to writing a file, so
        # any failure here is reported rather than raised.
        print(f"Error copying to clipboard: {e}")
        return False
def estimate_tokens(text: str) -> str:
    """Return a short, human-readable token estimate for ``text``.

    The estimate is the character count integer-divided by four, a rough
    rule of thumb for English text rather than a real tokenizer count.
    Values of a million or more are rendered like ``"1.2M"``, values of a
    thousand or more like ``"3.4k"``, and smaller values as a plain integer.
    """
    approx_tokens = len(text) // 4
    # Largest unit first so the first threshold that fits wins.
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if approx_tokens >= threshold:
            return f"{approx_tokens / threshold:.1f}{suffix}"
    return str(approx_tokens)
def is_text_file(path: Path) -> bool:
    """Guess whether ``path`` is a UTF-8 text file by sampling its start.

    Reads at most the first 1024 bytes. An empty file counts as text. A
    sample containing a NUL or 0xFF byte, or bytes that are not valid UTF-8,
    counts as binary.

    Args:
        path: File to inspect.

    Returns:
        True if the sample looks like UTF-8 text, False otherwise or when
        the file cannot be opened or read.
    """
    import codecs  # local import: only this function needs it

    sample_size = 1024
    try:
        with path.open('rb') as f:
            chunk = f.read(sample_size)
    except OSError:
        return False
    if not chunk:
        return True
    # Check for binary markers
    if b'\x00' in chunk or b'\xff' in chunk:
        return False
    # A full-size sample may end in the middle of a multi-byte character.
    # Decoding incrementally with final=False tolerates that trailing
    # partial sequence instead of misreporting valid text as binary. A short
    # sample is the whole file, so there a dangling sequence is a real error.
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(chunk, final=len(chunk) < sample_size)
    except UnicodeDecodeError:
        return False
    return True
def should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool:
    """Decide whether ``path`` is excluded by any of ``ignore_patterns``.

    A pattern excludes the path when either:

    * it fnmatch-es the whole path relative to ``base_path`` (this is how
      patterns such as ``"*.log"`` or ``"tests/*"`` work), or
    * it contains no ``/`` apart from an optional trailing one and
      fnmatch-es a single component of the relative path, so ``"node_modules"``
      also catches ``src/node_modules`` and everything beneath it.

    A trailing ``/`` marks a directory-only pattern: it matches any ancestor
    component, but matches the final component only if ``path`` is a
    directory on disk.

    Args:
        path: Path to test.
        base_path: Root that patterns are relative to.
        ignore_patterns: fnmatch-style patterns; empty strings are ignored.

    Returns:
        True if the path is excluded or does not lie under ``base_path``.
    """
    try:
        rel_path = path.relative_to(base_path)
    except ValueError:
        # Anything outside the root is never part of the digest.
        return True

    rel_str = str(rel_path)
    parts = rel_path.parts  # empty tuple when path == base_path
    ancestors, leaf = parts[:-1], parts[-1:]
    leaf_is_dir = None  # resolved lazily; touches the filesystem

    for pattern in ignore_patterns:
        if not pattern:
            continue
        if fnmatch(rel_str, pattern):
            return True

        name_pattern = pattern.rstrip("/")
        if not name_pattern or "/" in name_pattern:
            # Path-shaped patterns are only meaningful against the full path.
            continue
        if any(fnmatch(part, name_pattern) for part in ancestors):
            return True
        if leaf and fnmatch(leaf[0], name_pattern):
            if not pattern.endswith("/"):
                return True
            if leaf_is_dir is None:
                leaf_is_dir = path.is_dir()
            if leaf_is_dir:
                return True
    return False
def clone_repo(url: str, target_path: Path) -> bool:
    """Shallow-clone ``url`` into ``target_path`` with the ``git`` CLI.

    Only the latest commit is fetched (``--depth=1``). Returns True on
    success. On failure the reason is printed and False is returned, both
    when git exits non-zero and when the git executable cannot be found.
    """
    command = ['git', 'clone', '--depth=1', url, str(target_path)]
    try:
        # Output is captured so git's own chatter stays off the console;
        # stderr is surfaced below only when the clone fails.
        outcome = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        print("Error: git is not installed. Please install git first.")
        return False
    if outcome.returncode != 0:
        print(f"Error cloning repository: {outcome.stderr}")
        return False
    return True
def process_directory(path: Path, max_file_size: int, ignore_patterns: Set[str]) -> Tuple[str, str, int]:
    """Walk ``path`` and build the directory tree, the file dump and a file count.

    Entries rejected by ``should_exclude`` are skipped entirely. Every other
    entry is listed in the tree, directories first and then files, each
    sorted case-insensitively. A file's text is added to the dump only if its
    size is at most ``max_file_size`` bytes and ``is_text_file`` accepts it.

    Args:
        path: Root directory to process.
        max_file_size: Largest file size, in bytes, whose content is dumped.
        ignore_patterns: Patterns forwarded to ``should_exclude``.

    Returns:
        A ``(tree, content, file_count)`` tuple: the rendered tree, the
        concatenated ``FILE:`` sections, and the number of files dumped.
    """
    tree_lines = [f"{path.name}/"]
    content_lines = []
    file_count = 0

    def add_file(file_path: Path) -> None:
        """Append one file's header and text to the dump if it qualifies."""
        nonlocal file_count
        try:
            too_large = file_path.stat().st_size > max_file_size
        except OSError:
            # Vanished or unreadable since it was listed: it stays in the
            # tree but is left out of the dump.
            return
        if too_large or not is_text_file(file_path):
            return
        file_count += 1
        # Resolve the display path before reading, so the error branch can
        # always name the file. is_text_file only samples the start of the
        # file, so a later invalid byte can still make read_text fail.
        relative_path = file_path.relative_to(path)
        try:
            body = file_path.read_text(encoding='utf-8')
        except (OSError, UnicodeError) as e:
            body = f"[Error reading file: {e}]"
        content_lines.extend(["", SEPARATOR, f"FILE: {relative_path}", SEPARATOR, body])

    def build_tree(current_path: Path, prefix: str = "", is_last: bool = True) -> None:
        """Emit the tree line for ``current_path`` and recurse into directories."""
        if should_exclude(current_path, path, ignore_patterns):
            return
        is_directory = current_path.is_dir()
        if current_path != path:  # The root line was emitted up front.
            connector = "└── " if is_last else "├── "
            label = current_path.name + ("/" if is_directory else "")
            tree_lines.append(f"{prefix}{connector}{label}")
        if is_directory:
            try:
                entries = sorted(
                    current_path.iterdir(),
                    key=lambda entry: (entry.is_file(), entry.name.lower()),
                )
            except OSError:
                # Unreadable directory: keep its tree line, skip its contents.
                return
            # Filter before recursing so "is_last" refers to the last entry
            # that is actually displayed.
            entries = [
                entry for entry in entries
                if not should_exclude(entry, path, ignore_patterns)
            ]
            # Connectors are four columns wide, so the continuation prefix
            # must be too for the vertical guides to line up.
            child_prefix = prefix + ("    " if is_last else "│   ")
            last_index = len(entries) - 1
            for index, entry in enumerate(entries):
                build_tree(entry, child_prefix, index == last_index)
        elif current_path.is_file():
            add_file(current_path)

    build_tree(path)
    tree = "\n".join(tree_lines)
    content = "\n".join(content_lines)
    return tree, content, file_count
def parse_url(url: str) -> Tuple[str, str]:
    """Split an http(s) repository URL into ``(user, repo)``.

    Trailing slashes and a trailing ``.git`` are dropped first; the last two
    slash-separated segments of what remains are returned.

    Raises:
        ValueError: If the URL is not http(s) or has fewer than five
            slash-separated segments (scheme, empty, host, user, repo).
    """
    cleaned = url.rstrip('/').removesuffix('.git')
    segments = cleaned.split('/')
    is_http = cleaned.startswith(('http://', 'https://'))
    if not is_http or len(segments) < 5:
        raise ValueError("Invalid repository URL")
    owner, name = segments[-2:]
    return owner, name
def main():
    """Command-line entry point: ingest a repository and copy the digest.

    Parses the arguments, clones the source into a temporary directory when
    it is an http(s) URL (otherwise uses it as a local directory), builds
    the tree and file dump, prints a summary, and copies the result to the
    clipboard. If the clipboard copy fails, the digest is written to
    ``gitingest_output.txt`` in the current directory instead.

    Exits with status 1 when the clone fails or the local path is missing
    or not a directory. A temporary clone is removed on the way out.
    """
    parser = argparse.ArgumentParser(
        description="Turn any Git repository into a prompt-friendly text and copy to clipboard.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Examples:\n"
            "python gitingest.py https://github.com/user/repo\n"
            "python gitingest.py ./local-folder --exclude \"*.log\" --exclude \"tests/*\"\n"
            "python gitingest.py https://github.com/user/repo --max-size 100"
        ),
    )
    parser.add_argument('source', help='Repository URL or local folder path')
    parser.add_argument('--exclude', action='append', default=[],
                        help='Patterns to exclude (can be used multiple times)')
    parser.add_argument('--max-size', type=int, default=50,
                        help='Include files under SIZE kb (default: 50kb)')
    args = parser.parse_args()

    # Convert max-size from kb to bytes
    max_file_size = args.max_size * 1024

    # Work on a copy so the module-level defaults are never mutated.
    ignore_patterns = DEFAULT_IGNORE_PATTERNS.copy()
    ignore_patterns.update(args.exclude)

    is_url = args.source.startswith(('http://', 'https://'))
    temp_dir = None
    try:
        if is_url:
            # Clone repository to temporary directory
            print(f"Cloning repository: {args.source}")
            temp_dir = tempfile.mkdtemp(prefix='gitingest_')
            try:
                user, repo = parse_url(args.source)
            except ValueError:
                # Unrecognised URL shape: still try to clone it, under a
                # generic folder name, and label the run with the raw URL.
                repo_dir = Path(temp_dir) / "repo"
                repo_name = args.source
            else:
                repo_dir = Path(temp_dir) / f"{user}-{repo}"
                repo_name = f"{user}/{repo}"
            if not clone_repo(args.source, repo_dir):
                sys.exit(1)
            source_path = repo_dir
        else:
            # Use local directory
            source_path = Path(args.source).resolve()
            if not source_path.exists():
                print(f"Error: Path '{args.source}' does not exist")
                sys.exit(1)
            if not source_path.is_dir():
                print(f"Error: '{args.source}' is not a directory")
                sys.exit(1)
            repo_name = source_path.name

        print(f"Processing: {repo_name}")
        print(f"Max file size: {args.max_size}kb")
        if args.exclude:
            print(f"Additional exclude patterns: {', '.join(args.exclude)}")

        # Process the directory
        tree, content, file_count = process_directory(source_path, max_file_size, ignore_patterns)

        # Prepare the output
        full_content = f"Directory structure:\n{tree}\n\n\nFiles Content:\n{content}"

        # Create summary
        token_estimate = estimate_tokens(full_content)
        summary = (
            f"Repository: {repo_name}\n"
            f"Files analyzed: {file_count}\n"
            f"Estimated tokens: {token_estimate}"
        )
        print("\n" + summary)

        # Copy to clipboard
        print("\nCopying to clipboard...")
        if copy_to_clipboard(full_content):
            print("✓ Content copied to clipboard successfully!")
        else:
            print("✗ Failed to copy to clipboard")
            # Save to a file so the work is not lost.
            fallback_file = Path("gitingest_output.txt")
            fallback_file.write_text(full_content, encoding='utf-8')
            print(f"Content saved to: {fallback_file}")
    finally:
        # Best-effort cleanup: a failure to delete the temporary clone
        # (e.g. read-only files under .git on Windows) must not replace the
        # real outcome of the run with a cleanup traceback.
        if temp_dir and Path(temp_dir).exists():
            shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment