Skip to content

Instantly share code, notes, and snippets.

@Xevion
Last active August 20, 2025 03:21
Show Gist options
  • Select an option

  • Save Xevion/53f65d423b199161694a49f67f085ce1 to your computer and use it in GitHub Desktop.

Select an option

Save Xevion/53f65d423b199161694a49f67f085ce1 to your computer and use it in GitHub Desktop.
Analyzes browser history to count unique domain visits with pattern normalization.
import sqlite3
import os
import platform
import shutil
from urllib.parse import urlparse
import collections
import datetime
import re
import argparse
import sys
def load_domain_patterns(pattern_file_path=None):
    """Load and compile domain-normalization regexes.

    Optional user patterns are read from *pattern_file_path* (default
    ``domain_patterns.txt``; blank lines and ``#`` comments skipped),
    then the built-in defaults are appended.  Patterns that fail to
    compile are skipped with a warning.

    Returns a list of compiled ``re.Pattern`` objects.
    """
    if pattern_file_path is None:
        pattern_file_path = 'domain_patterns.txt'
    # Built-in patterns covering common CDN / hosting-platform subdomains;
    # each captures the shared parent domain in group 1.
    default_patterns = [
        r'^.+\.(cloudfront\.net)$', r'^.+\.(amazonaws\.com)$', r'^.+\.(herokuapp\.com)$',
        r'^.+\.(netlify\.app)$', r'^.+\.(vercel\.app)$', r'^.+\.(github\.io)$',
        r'^.+\.(firebaseapp\.com)$', r'^.+\.(appspot\.com)$', r'^.+\.(azurewebsites\.net)$',
        r'^.+\.(cloudflare\.com)$', r'^.+\.(fastly\.com)$', r'^.+\.(cdn\.com)$',
        r'^.+\.(cdn\.net)$', r'^.+\.(cdn\.org)$', r'^.+\.(s3\.amazonaws\.com)$',
        r'^.+\.(s3-website-[^.]+\.amazonaws\.com)$', r'^.+\.(elasticbeanstalk\.com)$',
        r'^.+\.(railway\.app)$', r'^.+\.(render\.com)$', r'^.+\.(fly\.io)$',
        r'^.+\.(digitaloceanspaces\.com)$', r'^.+\.(bunnycdn\.com)$',
        r'^.+\.(stackpathcdn\.com)$', r'^.+\.(keycdn\.com)$',
    ]

    raw_patterns = []
    if os.path.exists(pattern_file_path):
        try:
            with open(pattern_file_path, 'r', encoding='utf-8') as handle:
                for line in handle:
                    stripped = line.strip()
                    # NOTE: comment detection intentionally checks the raw
                    # line, matching the original behavior.
                    if stripped and not line.startswith('#'):
                        raw_patterns.append(stripped)
        except Exception as e:
            print(f"Warning: Could not load patterns from {pattern_file_path}: {e}")
    raw_patterns.extend(default_patterns)

    compiled = []
    for raw in raw_patterns:
        try:
            compiled.append(re.compile(raw))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{raw}': {e}")
    return compiled
def apply_pattern_normalization(domain, patterns):
    """Collapse *domain* using the first matching pattern.

    Returns group 1 of the first pattern in *patterns* that matches and
    defines at least one capture group; otherwise *domain* unchanged.
    """
    for compiled in patterns:
        found = compiled.match(domain)
        if found is None or not found.groups():
            continue
        return found.group(1)
    return domain
def normalize_domain(domain, patterns=None):
    """Reduce *domain* to at most its last three labels, then optionally
    apply pattern-based normalization.

    Empty/None input is returned as-is.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    if len(labels) > 3:
        # Keep only the registrable tail (e.g. a.b.c.example.com -> c.example.com).
        domain = '.'.join(labels[-3:])
    if patterns:
        domain = apply_pattern_normalization(domain, patterns)
    return domain
def has_valid_tld(domain):
    """Heuristic TLD check.

    True when *domain* contains a dot and its final label is at least
    two characters, alphabetic, and lowercase (filters out IPs, ports,
    localhost, and garbage hosts).
    """
    if not domain or '.' not in domain:
        return False
    tld = domain.rsplit('.', 1)[-1]
    return len(tld) >= 2 and tld.isalpha() and tld.islower()
def get_browser_history_path(browser_name='Vivaldi'):
    """Return the platform-specific path to the browser's History SQLite file.

    Parameters:
        browser_name: browser to locate; only 'Vivaldi' is supported.

    Raises:
        OSError: if the browser/OS combination is unsupported, or a
            required environment variable is missing on Windows.
    """
    system = platform.system()
    if browser_name.lower() == 'vivaldi':
        home = os.path.expanduser('~')
        if system == 'Windows':
            local_appdata = os.environ.get('LOCALAPPDATA')
            # Raise a descriptive OSError instead of a bare KeyError.
            if not local_appdata:
                raise OSError("LOCALAPPDATA environment variable is not set.")
            return os.path.join(local_appdata, 'Vivaldi', 'User Data', 'Default', 'History')
        elif system == 'Darwin':
            return os.path.join(home, 'Library', 'Application Support', 'Vivaldi', 'Default', 'History')
        elif system == 'Linux':
            # Chromium-based browsers use a capitalized 'Default' profile
            # directory on Linux; the previous lowercase 'default' never
            # matched an actual Vivaldi install.
            return os.path.join(home, '.config', 'vivaldi', 'Default', 'History')
    raise OSError(f"Unsupported browser '{browser_name}' or operating system '{system}'.")
def copy_history_database(history_path, temp_path=None):
    """Copy the (possibly browser-locked) history database to a working file.

    When *temp_path* is None a default location in the user's home
    directory is used.  Returns the path of the copy.

    Raises:
        FileNotFoundError: if *history_path* does not exist.
    """
    if not os.path.exists(history_path):
        raise FileNotFoundError(f"History file not found at {history_path}")
    destination = temp_path
    if destination is None:
        destination = os.path.join(os.path.expanduser("~"), "browser_history_copy.db")
    shutil.copyfile(history_path, destination)
    return destination
def get_date_range(cursor):
    """Return (earliest, latest, day_span) for the visits table.

    Visit times are Chromium-format timestamps: microseconds since
    1601-01-01.  Dates are rendered like 'August 3rd, 2025'.  Returns
    placeholder strings and a zero span when the table is empty or the
    query fails.
    """
    chrome_epoch = datetime.datetime(1601, 1, 1)

    def _pretty(moment):
        # English ordinal suffix: 1st/2nd/3rd, 21st/22nd/23rd, 31st, else 'th'.
        d = moment.day
        if 4 <= d <= 20 or 24 <= d <= 30:
            ordinal = "th"
        else:
            ordinal = ("st", "nd", "rd")[d % 10 - 1]
        return moment.strftime(f"%B {d}{ordinal}, %Y")

    try:
        cursor.execute("SELECT MIN(visit_time), MAX(visit_time) FROM visits")
        first_ts, last_ts = cursor.fetchone()
    except sqlite3.OperationalError as e:
        print(f"Error querying visit dates: {e}")
        return "Error retrieving date", "Error retrieving date", 0
    if not (first_ts and last_ts):
        return "No data available", "No data available", 0
    first = chrome_epoch + datetime.timedelta(microseconds=first_ts)
    last = chrome_epoch + datetime.timedelta(microseconds=last_ts)
    return _pretty(first), _pretty(last), (last - first).days
def extract_domains_from_urls(cursor, patterns=None):
    """Collect normalized domains from every row of the urls table.

    Returns (unique_domains, domain_counts, domains_removed) where
    *domains_removed* counts URLs discarded for lacking a valid TLD
    either before or after normalization.

    Raises:
        Exception: when the urls table cannot be queried.
    """
    try:
        cursor.execute("SELECT url FROM urls")
    except sqlite3.OperationalError as e:
        raise Exception(f"Error querying the database: {e}. The 'urls' table might not exist or the database is corrupt.")
    seen = set()
    counts = collections.Counter()
    removed = 0
    for row in cursor.fetchall():
        url = row[0]
        try:
            host = urlparse(url).netloc
            if not host:
                continue
            if not has_valid_tld(host):
                removed += 1
                continue
            canonical = normalize_domain(host, patterns)
            # Pattern normalization can produce a hostless fragment; re-check.
            if not has_valid_tld(canonical):
                removed += 1
                continue
            seen.add(canonical)
            counts[canonical] += 1
        except Exception as e:
            print(f"Could not parse URL: {url} - Error: {e}")
    return seen, counts, removed
def analyze_browser_history(browser_name='Vivaldi', temp_path=None, top_domains=10, pattern_file_path=None):
    """Run the full history-analysis pipeline for *browser_name*.

    Copies the history database (so the live, browser-locked file is
    never opened directly), extracts the visit date range and domain
    statistics, and always removes the temporary copy.

    Parameters:
        browser_name: browser whose history to analyze.
        temp_path: optional explicit location for the database copy.
        top_domains: how many entries to include under 'top_domains'.
        pattern_file_path: optional custom pattern file.

    Returns a dict with keys 'date_range', 'unique_domains',
    'domain_counts', 'total_unique_domains', 'top_domains',
    'domains_removed'.
    """
    history_path = get_browser_history_path(browser_name)
    temp_history_path = copy_history_database(history_path, temp_path)
    patterns = load_domain_patterns(pattern_file_path)
    try:
        conn = sqlite3.connect(temp_history_path)
        try:
            cursor = conn.cursor()
            earliest_date, latest_date, days_between = get_date_range(cursor)
            unique_domains, domain_counts, domains_removed = extract_domains_from_urls(cursor, patterns)
        finally:
            # Previously the connection leaked when a query raised; close it
            # on every path.
            conn.close()
        return {
            'date_range': (earliest_date, latest_date, days_between),
            'unique_domains': unique_domains,
            'domain_counts': domain_counts,
            'total_unique_domains': len(unique_domains),
            'top_domains': domain_counts.most_common(top_domains),
            'domains_removed': domains_removed
        }
    finally:
        if os.path.exists(temp_history_path):
            os.remove(temp_history_path)
def format_number(num):
    """Render *num* with thousands separators, e.g. 1234567 -> '1,234,567'."""
    return format(num, ',')
def redact_domain(domain):
    """Mask every label of *domain* except the TLD.

    Short (<= 3 chars) second-level names collapse to '???' to avoid
    leaking their length.  Dotless or empty input is returned unchanged.

    Examples: 'mail.google.com' -> '****.******.com', 'abc.io' -> '???.io'.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    if len(labels) < 2:
        return domain
    if len(labels[-2]) <= 3:
        return f"???.{labels[-1]}"
    masked = ['*' * len(label) for label in labels[:-1]]
    return '.'.join(masked + [labels[-1]])
def print_analysis_results(results, browser_name='Vivaldi', top_domains=None, bottom_domains=None, redact=False):
    """Pretty-print an analysis results dict to stdout.

    *top_domains* / *bottom_domains* select which lists are shown (None
    hides a list); *redact* masks domain names for privacy.
    """
    earliest, latest, span = results['date_range']
    print(f"\n--- {browser_name} History Analysis ---")
    if span > 0:
        print(f"Date range: {earliest} to {latest} ({format_number(span)} days)")
    else:
        print(f"Date range: {earliest} to {latest}")
    print(f"Total unique domains found: {format_number(results['total_unique_domains'])}")
    print(f"Domains removed (no valid TLD): {format_number(results['domains_removed'])}")

    def _emit(entries):
        # Shared renderer for both the top and bottom lists.
        for name, visits in entries:
            shown = redact_domain(name) if redact else name
            print(f"- {shown}: {format_number(visits)} visits")

    if top_domains is not None:
        print(f"\nTop {min(top_domains, len(results['top_domains']))} most visited domains:")
        _emit(results['top_domains'][:top_domains])
    if bottom_domains is not None and bottom_domains > 0:
        ascending = sorted(results['domain_counts'].items(), key=lambda item: item[1])
        least = ascending[:bottom_domains]
        print(f"\nBottom {len(least)} least visited domains:")
        _emit(least)
def create_parser():
    """Build the argparse CLI for the history analyzer.

    Returns an ArgumentParser exposing browser selection, top/bottom
    domain counts, pattern-file options, temp-path override, quiet mode,
    and privacy redaction.
    """
    cli = argparse.ArgumentParser(
        description='Analyze browser history to find unique domains and their visit counts.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s # Default analysis with top 100 domains
%(prog)s --top 20 # Show only top 20 domains
%(prog)s --bottom 10 # Show only bottom 10 domains
%(prog)s --top 20 --bottom 10 # Show both top 20 and bottom 10
%(prog)s --browser Chrome # Analyze Chrome instead of Vivaldi
%(prog)s --patterns custom.txt # Use custom pattern file
%(prog)s --no-patterns # Disable pattern normalization
%(prog)s --temp-path /tmp/hist.db # Use custom temporary file path
%(prog)s --redact # Redact domain names for privacy
""",
    )
    # Argument order is preserved so the generated --help output is stable.
    cli.add_argument('--browser', '-b', default='Vivaldi', help='Browser to analyze (default: Vivaldi)')
    cli.add_argument('--top', '-t', type=int, default=None, help='Number of top domains to display (default: 100 when no --bottom specified)')
    cli.add_argument('--bottom', '-bt', type=int, default=None, help='Number of bottom domains to display')
    cli.add_argument('--patterns', '-p', dest='pattern_file_path', help='Path to custom domain pattern file (default: domain_patterns.txt)')
    cli.add_argument('--no-patterns', action='store_true', help='Disable pattern-based domain normalization')
    cli.add_argument('--temp-path', help='Custom temporary file path for database copy')
    cli.add_argument('--quiet', '-q', action='store_true', help='Suppress warning messages')
    cli.add_argument('--redact', action='store_true', help='Redact domain names for privacy (shows only TLD)')
    cli.add_argument('--version', action='version', version='%(prog)s 1.0')
    return cli
def main_cli():
    """Command-line entry point: parse args, run the analysis, print results.

    Exits with status 1 on negative count arguments or any analysis error.
    """
    args = create_parser().parse_args()
    pattern_file_path = None if args.no_patterns else args.pattern_file_path

    if args.top is not None and args.top < 0:
        print("Error: --top must be non-negative", file=sys.stderr)
        sys.exit(1)
    if args.bottom is not None and args.bottom < 0:
        print("Error: --bottom must be non-negative", file=sys.stderr)
        sys.exit(1)

    show_top = args.top is not None
    show_bottom = args.bottom is not None
    if not show_top and not show_bottom:
        # No explicit selection: default to showing the top 100 domains.
        show_top = True
        top_count = 100
    else:
        top_count = args.top if show_top else 1

    if args.quiet:
        import warnings
        warnings.filterwarnings("ignore")

    try:
        results = analyze_browser_history(
            browser_name=args.browser,
            temp_path=args.temp_path,
            top_domains=top_count,
            pattern_file_path=pattern_file_path
        )
        print_analysis_results(
            results,
            browser_name=args.browser,
            # BUG FIX: previously this passed args.top, which is None on a
            # default run, so the advertised "top 100" list was never
            # printed.  top_count carries the effective value (100 by
            # default, args.top when given).
            top_domains=top_count if show_top else None,
            bottom_domains=args.bottom if show_bottom else None,
            redact=args.redact
        )
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
def main(browser_name='Vivaldi', top_domains=100, temp_path=None, pattern_file_path=None, redact=False):
    """Programmatic entry point: analyze *browser_name* and print a report.

    Parameters mirror analyze_browser_history; errors are reported to
    stdout instead of raising.
    """
    try:
        results = analyze_browser_history(
            browser_name=browser_name,
            temp_path=temp_path,
            top_domains=top_domains,
            pattern_file_path=pattern_file_path
        )
        # BUG FIX: top_domains was computed but never forwarded to the
        # printer, so the top-domain list was silently omitted.
        print_analysis_results(results, browser_name, top_domains=top_domains, redact=redact)
    except Exception as e:
        print(f"An error occurred: {e}")
# Script entry point: dispatch to the CLI only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main_cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment