Created
March 12, 2026 18:39
-
-
Save neilernst/43d9e329e895169faf960e3f00c3b3be to your computer and use it in GitHub Desktop.
Extract documents and frequencies from Taguette tags
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Retrieve tag frequency data from a Taguette SQLite database. | |
| Produces: | |
| 1. Overall tag frequency (ordered by count descending) | |
| 2. Tag frequency per document | |
| 3. CSV output to stdout | |
| """ | |
| import argparse | |
| import csv | |
| import sqlite3 | |
| import sys | |
| def get_tag_frequencies(db_path, project_id=None): | |
| conn = sqlite3.connect(db_path) | |
| cur = conn.cursor() | |
| project_filter = "" | |
| params = () | |
| if project_id is not None: | |
| project_filter = "AND t.project_id = ?" | |
| params = (project_id,) | |
| # Get all documents (filtered by project if specified) | |
| if project_id is not None: | |
| cur.execute("SELECT id, name FROM documents WHERE project_id = ? ORDER BY name", | |
| (project_id,)) | |
| else: | |
| cur.execute("SELECT id, name FROM documents ORDER BY name") | |
| documents = cur.fetchall() | |
| # Overall tag frequency: count of highlights per tag | |
| cur.execute(f""" | |
| SELECT t.path, COUNT(ht.highlight_id) as freq | |
| FROM tags t | |
| JOIN highlight_tags ht ON ht.tag_id = t.id | |
| WHERE 1=1 {project_filter} | |
| GROUP BY t.id | |
| ORDER BY freq DESC | |
| """, params) | |
| overall = cur.fetchall() | |
| # Tag frequency per document | |
| cur.execute(f""" | |
| SELECT t.path, d.name, COUNT(ht.highlight_id) as freq | |
| FROM tags t | |
| JOIN highlight_tags ht ON ht.tag_id = t.id | |
| JOIN highlights h ON h.id = ht.highlight_id | |
| JOIN documents d ON d.id = h.document_id | |
| WHERE 1=1 {project_filter} | |
| GROUP BY t.id, d.id | |
| ORDER BY t.path, d.name | |
| """, params) | |
| per_doc = cur.fetchall() | |
| # Unique document count per tag | |
| cur.execute(f""" | |
| SELECT t.path, COUNT(DISTINCT h.document_id) as num_docs | |
| FROM tags t | |
| JOIN highlight_tags ht ON ht.tag_id = t.id | |
| JOIN highlights h ON h.id = ht.highlight_id | |
| WHERE 1=1 {project_filter} | |
| GROUP BY t.id | |
| ORDER BY t.path | |
| """, params) | |
| doc_counts = {path: n for path, n in cur.fetchall()} | |
| conn.close() | |
| return documents, overall, per_doc, doc_counts | |
| def list_projects(db_path): | |
| conn = sqlite3.connect(db_path) | |
| cur = conn.cursor() | |
| cur.execute("SELECT id, name FROM projects ORDER BY id") | |
| projects = cur.fetchall() | |
| conn.close() | |
| return projects | |
| def main(): | |
| parser = argparse.ArgumentParser(description="Tag frequency from Taguette DB") | |
| parser.add_argument("db", nargs="?", default="taguette.sqlite3", | |
| help="Path to taguette.sqlite3 (default: taguette.sqlite3)") | |
| parser.add_argument("--per-doc", action="store_true", | |
| help="Also output per-document breakdown") | |
| parser.add_argument("--pivot", action="store_true", | |
| help="Output pivot table: tags as rows, documents as columns") | |
| parser.add_argument("--doc-count", action="store_true", | |
| help="Include number of unique documents each tag appears in") | |
| parser.add_argument("--project", type=str, default=None, | |
| help="Filter by project name or ID") | |
| parser.add_argument("--list-projects", action="store_true", | |
| help="List available projects and exit") | |
| args = parser.parse_args() | |
| if args.list_projects: | |
| for pid, name in list_projects(args.db): | |
| print(f"{pid}\t{name}") | |
| return | |
| # Resolve project | |
| project_id = None | |
| if args.project is not None: | |
| if args.project.isdigit(): | |
| project_id = int(args.project) | |
| else: | |
| projects = list_projects(args.db) | |
| matches = [(pid, name) for pid, name in projects | |
| if args.project.lower() in name.lower()] | |
| if len(matches) == 1: | |
| project_id = matches[0][0] | |
| elif len(matches) == 0: | |
| print(f"No project matching '{args.project}'. Use --list-projects.", | |
| file=sys.stderr) | |
| sys.exit(1) | |
| else: | |
| print(f"Ambiguous project '{args.project}': {matches}. Use --list-projects.", | |
| file=sys.stderr) | |
| sys.exit(1) | |
| documents, overall, per_doc, doc_counts = get_tag_frequencies(args.db, project_id) | |
| writer = csv.writer(sys.stdout) | |
| if args.pivot: | |
| doc_names = sorted(set(name for _, name in documents)) | |
| lookup = {} | |
| for tag, doc, freq in per_doc: | |
| lookup[(tag, doc)] = freq | |
| header = ["tag", "total"] | |
| if args.doc_count: | |
| header.append("num_documents") | |
| header += doc_names | |
| writer.writerow(header) | |
| for tag, total in overall: | |
| row = [tag, total] | |
| if args.doc_count: | |
| row.append(doc_counts.get(tag, 0)) | |
| row += [lookup.get((tag, d), 0) for d in doc_names] | |
| writer.writerow(row) | |
| elif args.per_doc: | |
| writer.writerow(["tag", "document", "frequency"]) | |
| for tag, doc, freq in per_doc: | |
| writer.writerow([tag, doc, freq]) | |
| else: | |
| header = ["tag", "frequency"] | |
| if args.doc_count: | |
| header.append("num_documents") | |
| writer.writerow(header) | |
| for tag, freq in overall: | |
| row = [tag, freq] | |
| if args.doc_count: | |
| row.append(doc_counts.get(tag, 0)) | |
| writer.writerow(row) | |
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment