Skip to content

Instantly share code, notes, and snippets.

@neilernst
Created March 12, 2026 18:39
Show Gist options
  • Select an option

  • Save neilernst/43d9e329e895169faf960e3f00c3b3be to your computer and use it in GitHub Desktop.

Select an option

Save neilernst/43d9e329e895169faf960e3f00c3b3be to your computer and use it in GitHub Desktop.
Extract documents and frequencies from Taguette tags
#!/usr/bin/env python3
"""Retrieve tag frequency data from a Taguette SQLite database.
Produces:
1. Overall tag frequency (ordered by count descending)
2. Tag frequency per document
3. CSV output to stdout
"""
import argparse
import csv
import sqlite3
import sys
def get_tag_frequencies(db_path, project_id=None):
conn = sqlite3.connect(db_path)
cur = conn.cursor()
project_filter = ""
params = ()
if project_id is not None:
project_filter = "AND t.project_id = ?"
params = (project_id,)
# Get all documents (filtered by project if specified)
if project_id is not None:
cur.execute("SELECT id, name FROM documents WHERE project_id = ? ORDER BY name",
(project_id,))
else:
cur.execute("SELECT id, name FROM documents ORDER BY name")
documents = cur.fetchall()
# Overall tag frequency: count of highlights per tag
cur.execute(f"""
SELECT t.path, COUNT(ht.highlight_id) as freq
FROM tags t
JOIN highlight_tags ht ON ht.tag_id = t.id
WHERE 1=1 {project_filter}
GROUP BY t.id
ORDER BY freq DESC
""", params)
overall = cur.fetchall()
# Tag frequency per document
cur.execute(f"""
SELECT t.path, d.name, COUNT(ht.highlight_id) as freq
FROM tags t
JOIN highlight_tags ht ON ht.tag_id = t.id
JOIN highlights h ON h.id = ht.highlight_id
JOIN documents d ON d.id = h.document_id
WHERE 1=1 {project_filter}
GROUP BY t.id, d.id
ORDER BY t.path, d.name
""", params)
per_doc = cur.fetchall()
# Unique document count per tag
cur.execute(f"""
SELECT t.path, COUNT(DISTINCT h.document_id) as num_docs
FROM tags t
JOIN highlight_tags ht ON ht.tag_id = t.id
JOIN highlights h ON h.id = ht.highlight_id
WHERE 1=1 {project_filter}
GROUP BY t.id
ORDER BY t.path
""", params)
doc_counts = {path: n for path, n in cur.fetchall()}
conn.close()
return documents, overall, per_doc, doc_counts
def list_projects(db_path):
conn = sqlite3.connect(db_path)
cur = conn.cursor()
cur.execute("SELECT id, name FROM projects ORDER BY id")
projects = cur.fetchall()
conn.close()
return projects
def main():
parser = argparse.ArgumentParser(description="Tag frequency from Taguette DB")
parser.add_argument("db", nargs="?", default="taguette.sqlite3",
help="Path to taguette.sqlite3 (default: taguette.sqlite3)")
parser.add_argument("--per-doc", action="store_true",
help="Also output per-document breakdown")
parser.add_argument("--pivot", action="store_true",
help="Output pivot table: tags as rows, documents as columns")
parser.add_argument("--doc-count", action="store_true",
help="Include number of unique documents each tag appears in")
parser.add_argument("--project", type=str, default=None,
help="Filter by project name or ID")
parser.add_argument("--list-projects", action="store_true",
help="List available projects and exit")
args = parser.parse_args()
if args.list_projects:
for pid, name in list_projects(args.db):
print(f"{pid}\t{name}")
return
# Resolve project
project_id = None
if args.project is not None:
if args.project.isdigit():
project_id = int(args.project)
else:
projects = list_projects(args.db)
matches = [(pid, name) for pid, name in projects
if args.project.lower() in name.lower()]
if len(matches) == 1:
project_id = matches[0][0]
elif len(matches) == 0:
print(f"No project matching '{args.project}'. Use --list-projects.",
file=sys.stderr)
sys.exit(1)
else:
print(f"Ambiguous project '{args.project}': {matches}. Use --list-projects.",
file=sys.stderr)
sys.exit(1)
documents, overall, per_doc, doc_counts = get_tag_frequencies(args.db, project_id)
writer = csv.writer(sys.stdout)
if args.pivot:
doc_names = sorted(set(name for _, name in documents))
lookup = {}
for tag, doc, freq in per_doc:
lookup[(tag, doc)] = freq
header = ["tag", "total"]
if args.doc_count:
header.append("num_documents")
header += doc_names
writer.writerow(header)
for tag, total in overall:
row = [tag, total]
if args.doc_count:
row.append(doc_counts.get(tag, 0))
row += [lookup.get((tag, d), 0) for d in doc_names]
writer.writerow(row)
elif args.per_doc:
writer.writerow(["tag", "document", "frequency"])
for tag, doc, freq in per_doc:
writer.writerow([tag, doc, freq])
else:
header = ["tag", "frequency"]
if args.doc_count:
header.append("num_documents")
writer.writerow(header)
for tag, freq in overall:
row = [tag, freq]
if args.doc_count:
row.append(doc_counts.get(tag, 0))
writer.writerow(row)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment