Last active
August 20, 2025 03:21
-
-
Save Xevion/53f65d423b199161694a49f67f085ce1 to your computer and use it in GitHub Desktop.
Analyzes browser history to count unique domain visits with pattern normalization.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sqlite3 | |
| import os | |
| import platform | |
| import shutil | |
| from urllib.parse import urlparse | |
| import collections | |
| import datetime | |
| import re | |
| import argparse | |
| import sys | |
def load_domain_patterns(pattern_file_path=None):
    """Build the list of compiled regexes used to collapse hosting/CDN hosts.

    Reads extra patterns from *pattern_file_path* (defaults to
    'domain_patterns.txt') when that file exists, then appends the built-in
    defaults. Blank lines and lines starting with '#' in the file are
    skipped; patterns that fail to compile are dropped with a warning.

    Returns a list of compiled ``re.Pattern`` objects.
    """
    if pattern_file_path is None:
        pattern_file_path = 'domain_patterns.txt'

    # Built-in patterns for common hosting/CDN providers; each captures the
    # provider's base domain in group 1.
    default_patterns = [
        r'^.+\.(cloudfront\.net)$', r'^.+\.(amazonaws\.com)$', r'^.+\.(herokuapp\.com)$',
        r'^.+\.(netlify\.app)$', r'^.+\.(vercel\.app)$', r'^.+\.(github\.io)$',
        r'^.+\.(firebaseapp\.com)$', r'^.+\.(appspot\.com)$', r'^.+\.(azurewebsites\.net)$',
        r'^.+\.(cloudflare\.com)$', r'^.+\.(fastly\.com)$', r'^.+\.(cdn\.com)$',
        r'^.+\.(cdn\.net)$', r'^.+\.(cdn\.org)$', r'^.+\.(s3\.amazonaws\.com)$',
        r'^.+\.(s3-website-[^.]+\.amazonaws\.com)$', r'^.+\.(elasticbeanstalk\.com)$',
        r'^.+\.(railway\.app)$', r'^.+\.(render\.com)$', r'^.+\.(fly\.io)$',
        r'^.+\.(digitaloceanspaces\.com)$', r'^.+\.(bunnycdn\.com)$',
        r'^.+\.(stackpathcdn\.com)$', r'^.+\.(keycdn\.com)$',
    ]

    patterns = []
    if os.path.exists(pattern_file_path):
        try:
            file_patterns = []
            with open(pattern_file_path, 'r', encoding='utf-8') as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    # NOTE: the comment check looks at the *unstripped* line,
                    # so an indented '#' is not treated as a comment.
                    if entry and not raw_line.startswith('#'):
                        file_patterns.append(entry)
            patterns.extend(file_patterns)
        except Exception as e:
            print(f"Warning: Could not load patterns from {pattern_file_path}: {e}")
    patterns.extend(default_patterns)

    compiled = []
    for raw in patterns:
        try:
            compiled.append(re.compile(raw))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{raw}': {e}")
    return compiled
def apply_pattern_normalization(domain, patterns):
    """Collapse *domain* using the first matching pattern.

    Each pattern is tried in order with ``.match``; the first one that
    matches *and* has at least one capture group wins, and its group 1 is
    returned. When nothing qualifies, the domain is returned untouched.
    """
    for compiled in patterns:
        hit = compiled.match(domain)
        if hit and hit.groups():
            return hit.group(1)
    return domain
def normalize_domain(domain, patterns=None):
    """Reduce *domain* to at most its last three labels, then apply patterns.

    e.g. ``a.b.c.d.com`` -> ``c.d.com``. When *patterns* is a non-empty
    list of compiled regexes, pattern normalization is applied afterwards.
    Falsy input (empty string / None) is returned unchanged.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    result = '.'.join(labels[-3:]) if len(labels) > 3 else domain
    if patterns:
        result = apply_pattern_normalization(result, patterns)
    return result
def has_valid_tld(domain):
    """Heuristic check that *domain* ends in a plausible TLD.

    Requires at least two dot-separated labels and a final label that is
    purely alphabetic, all lowercase, and at least two characters long.
    """
    if not domain or '.' not in domain:
        return False
    tld = domain.rsplit('.', 1)[1]
    return len(tld) >= 2 and tld.isalpha() and tld.islower()
def get_browser_history_path(browser_name='Vivaldi'):
    """Locate the on-disk History SQLite file for the requested browser.

    Only Vivaldi is recognized, on Windows, macOS (Darwin), and Linux.
    Raises OSError for any other browser/OS combination.
    """
    system = platform.system()
    if browser_name.lower() == 'vivaldi':
        home = os.path.expanduser('~')
        if system == 'Windows':
            # Windows keeps Chromium profiles under %LOCALAPPDATA%.
            return os.path.join(os.environ['LOCALAPPDATA'], 'Vivaldi', 'User Data', 'Default', 'History')
        if system == 'Darwin':
            return os.path.join(home, 'Library', 'Application Support', 'Vivaldi', 'Default', 'History')
        if system == 'Linux':
            return os.path.join(home, '.config', 'vivaldi', 'default', 'History')
    raise OSError(f"Unsupported browser '{browser_name}' or operating system '{system}'.")
def copy_history_database(history_path, temp_path=None):
    """Copy the browser history DB to a scratch location and return the copy's path.

    The browser may hold the live database locked, so analysis runs on a
    copy. When *temp_path* is None the copy goes to
    ``~/browser_history_copy.db``. Raises FileNotFoundError when the source
    does not exist.
    """
    destination = temp_path
    if destination is None:
        destination = os.path.join(os.path.expanduser("~"), "browser_history_copy.db")
    if not os.path.exists(history_path):
        raise FileNotFoundError(f"History file not found at {history_path}")
    shutil.copyfile(history_path, destination)
    return destination
def get_date_range(cursor):
    """Return (earliest, latest, span_in_days) from the ``visits`` table.

    Visit timestamps are in Chrome/WebKit format: microseconds since
    1601-01-01. Dates are rendered like ``"January 1st, 2024"``. An empty
    table yields placeholder strings and 0; a failed query (e.g. missing
    table) yields error strings and 0.
    """
    try:
        cursor.execute("SELECT MIN(visit_time), MAX(visit_time) FROM visits")
        first_ts, last_ts = cursor.fetchone()
    except sqlite3.OperationalError as e:
        print(f"Error querying visit dates: {e}")
        return "Error retrieving date", "Error retrieving date", 0

    if not (first_ts and last_ts):
        return "No data available", "No data available", 0

    chrome_epoch = datetime.datetime(1601, 1, 1)
    start = chrome_epoch + datetime.timedelta(microseconds=first_ts)
    end = chrome_epoch + datetime.timedelta(microseconds=last_ts)

    def pretty(date):
        # English ordinal suffix: 4th-20th and 24th-30th take "th";
        # the rest take st/nd/rd by last digit.
        day = date.day
        if 4 <= day <= 20 or 24 <= day <= 30:
            suffix = "th"
        else:
            suffix = ("st", "nd", "rd")[day % 10 - 1]
        return date.strftime(f"%B {day}{suffix}, %Y")

    return pretty(start), pretty(end), (end - start).days
def extract_domains_from_urls(cursor, patterns=None):
    """Read every row of the ``urls`` table and tally normalized domains.

    Returns ``(unique_domains, domain_counts, domains_removed)`` where
    *domains_removed* counts hosts rejected for lacking a plausible TLD
    either before or after normalization. URLs that fail to parse are
    reported and skipped.

    Raises Exception when the ``urls`` table cannot be queried.
    """
    try:
        cursor.execute("SELECT url FROM urls")
    except sqlite3.OperationalError as e:
        raise Exception(f"Error querying the database: {e}. The 'urls' table might not exist or the database is corrupt.")

    rows = cursor.fetchall()
    unique_domains = set()
    domain_counts = collections.Counter()
    rejected = 0

    for row in rows:
        url = row[0]
        try:
            host = urlparse(url).netloc
            if not host:
                continue
            if not has_valid_tld(host):
                rejected += 1
                continue
            canonical = normalize_domain(host, patterns)
            # Normalization can produce a bogus tail (e.g. truncated label);
            # re-validate before counting.
            if not has_valid_tld(canonical):
                rejected += 1
                continue
            unique_domains.add(canonical)
            domain_counts[canonical] += 1
        except Exception as e:
            print(f"Could not parse URL: {url} - Error: {e}")

    return unique_domains, domain_counts, rejected
def analyze_browser_history(browser_name='Vivaldi', temp_path=None, top_domains=10, pattern_file_path=None):
    """Run the full analysis pipeline: locate, copy, query, and summarize.

    Args:
        browser_name: Browser whose history to analyze (passed to
            get_browser_history_path).
        temp_path: Optional path for the temporary database copy.
        top_domains: How many entries to precompute for 'top_domains'.
        pattern_file_path: Optional custom pattern file for normalization.

    Returns:
        Dict with keys 'date_range' (earliest, latest, days), 'unique_domains',
        'domain_counts', 'total_unique_domains', 'top_domains', and
        'domains_removed'.

    Raises:
        OSError / FileNotFoundError from path discovery and copying, and
        whatever the query helpers raise.

    The temporary copy is always removed, and the sqlite connection is
    always closed, even when a query raises (previously the connection was
    only closed on the success path, leaking it on error).
    """
    history_path = get_browser_history_path(browser_name)
    temp_history_path = copy_history_database(history_path, temp_path)
    patterns = load_domain_patterns(pattern_file_path)
    try:
        conn = sqlite3.connect(temp_history_path)
        try:
            cursor = conn.cursor()
            earliest_date, latest_date, days_between = get_date_range(cursor)
            unique_domains, domain_counts, domains_removed = extract_domains_from_urls(cursor, patterns)
        finally:
            # Close even if a query raised; the original leaked here.
            conn.close()
        return {
            'date_range': (earliest_date, latest_date, days_between),
            'unique_domains': unique_domains,
            'domain_counts': domain_counts,
            'total_unique_domains': len(unique_domains),
            'top_domains': domain_counts.most_common(top_domains),
            'domains_removed': domains_removed
        }
    finally:
        if os.path.exists(temp_history_path):
            os.remove(temp_history_path)
def format_number(num):
    """Render *num* with thousands separators, e.g. 1234567 -> '1,234,567'."""
    return format(num, ',')
def redact_domain(domain):
    """Mask a domain for display, keeping only the TLD recognizable.

    A short second-level label (3 chars or fewer) collapses the whole thing
    to ``???.tld``; otherwise every non-TLD label is replaced by asterisks
    of matching length. Empty or dot-free inputs come back unchanged.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    if len(labels) <= 1:
        return domain
    if len(labels[-2]) <= 3:
        # Too short to mask meaningfully by length alone.
        return f"???.{labels[-1]}"
    masked = ['*' * len(label) for label in labels[:-1]]
    return '.'.join(masked + [labels[-1]])
def print_analysis_results(results, browser_name='Vivaldi', top_domains=None, bottom_domains=None, redact=False):
    """Pretty-print the summary dict produced by analyze_browser_history.

    *top_domains* / *bottom_domains* choose how many entries of each list
    to show (None hides the section entirely); *redact* masks domain names
    for privacy before printing.
    """
    earliest_date, latest_date, days_between = results['date_range']
    print(f"\n--- {browser_name} History Analysis ---")
    if days_between > 0:
        print(f"Date range: {earliest_date} to {latest_date} ({format_number(days_between)} days)")
    else:
        print(f"Date range: {earliest_date} to {latest_date}")
    print(f"Total unique domains found: {format_number(results['total_unique_domains'])}")
    print(f"Domains removed (no valid TLD): {format_number(results['domains_removed'])}")

    def emit(header, entries):
        # Shared renderer for the top/bottom sections.
        print(header)
        for name, count in entries:
            shown = redact_domain(name) if redact else name
            print(f"- {shown}: {format_number(count)} visits")

    if top_domains is not None:
        shown_count = min(top_domains, len(results['top_domains']))
        emit(f"\nTop {shown_count} most visited domains:", results['top_domains'][:top_domains])

    if bottom_domains is not None and bottom_domains > 0:
        ranked = sorted(results['domain_counts'].items(), key=lambda kv: kv[1])
        tail = ranked[:bottom_domains]
        emit(f"\nBottom {len(tail)} least visited domains:", tail)
def create_parser():
    """Construct the argparse parser for the command-line interface."""
    p = argparse.ArgumentParser(
        description='Analyze browser history to find unique domains and their visit counts.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                           # Default analysis with top 100 domains
  %(prog)s --top 20                  # Show only top 20 domains
  %(prog)s --bottom 10               # Show only bottom 10 domains
  %(prog)s --top 20 --bottom 10      # Show both top 20 and bottom 10
  %(prog)s --browser Chrome          # Analyze Chrome instead of Vivaldi
  %(prog)s --patterns custom.txt     # Use custom pattern file
  %(prog)s --no-patterns             # Disable pattern normalization
  %(prog)s --temp-path /tmp/hist.db  # Use custom temporary file path
  %(prog)s --redact                  # Redact domain names for privacy
        """
    )
    # What to analyze.
    p.add_argument('--browser', '-b', default='Vivaldi', help='Browser to analyze (default: Vivaldi)')
    # How much to display.
    p.add_argument('--top', '-t', type=int, default=None, help='Number of top domains to display (default: 100 when no --bottom specified)')
    p.add_argument('--bottom', '-bt', type=int, default=None, help='Number of bottom domains to display')
    # Normalization behavior.
    p.add_argument('--patterns', '-p', dest='pattern_file_path', help='Path to custom domain pattern file (default: domain_patterns.txt)')
    p.add_argument('--no-patterns', action='store_true', help='Disable pattern-based domain normalization')
    # Misc.
    p.add_argument('--temp-path', help='Custom temporary file path for database copy')
    p.add_argument('--quiet', '-q', action='store_true', help='Suppress warning messages')
    p.add_argument('--redact', action='store_true', help='Redact domain names for privacy (shows only TLD)')
    p.add_argument('--version', action='version', version='%(prog)s 1.0')
    return p
def main_cli():
    """Parse CLI arguments, run the analysis, and print the report.

    Exits with status 1 on invalid arguments or any analysis failure.

    Fix: on a default invocation (neither --top nor --bottom given) the
    original passed ``top_domains=args.top`` — i.e. None — to
    print_analysis_results, which hides the section, so the advertised
    "top 100" default was never printed. We now pass the computed
    ``top_count`` whenever the top section should be shown.
    """
    args = create_parser().parse_args()
    pattern_file_path = None if args.no_patterns else args.pattern_file_path

    # Validate count arguments.
    if args.top is not None and args.top < 0:
        print("Error: --top must be non-negative", file=sys.stderr)
        sys.exit(1)
    if args.bottom is not None and args.bottom < 0:
        print("Error: --bottom must be non-negative", file=sys.stderr)
        sys.exit(1)

    show_top = args.top is not None
    show_bottom = args.bottom is not None
    if not show_top and not show_bottom:
        # No selection at all: default to showing the top 100 domains.
        show_top = True
        top_count = 100
    else:
        top_count = args.top if show_top else 1

    if args.quiet:
        # NOTE: this silences only the warnings module; warnings emitted via
        # print() elsewhere in this script are unaffected.
        import warnings
        warnings.filterwarnings("ignore")

    try:
        results = analyze_browser_history(
            browser_name=args.browser,
            temp_path=args.temp_path,
            top_domains=top_count,
            pattern_file_path=pattern_file_path
        )
        print_analysis_results(
            results,
            browser_name=args.browser,
            # Use top_count (not args.top) so the default run actually
            # prints its top-100 list.
            top_domains=top_count if show_top else None,
            bottom_domains=args.bottom if show_bottom else None,
            redact=args.redact
        )
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
def main(browser_name='Vivaldi', top_domains=100, temp_path=None, pattern_file_path=None, redact=False):
    """Programmatic entry point: run the analysis and print the results.

    Args:
        browser_name: Browser whose history to analyze.
        top_domains: How many top domains to compute and display.
        temp_path: Optional path for the temporary database copy.
        pattern_file_path: Optional custom pattern file.
        redact: Mask domain names in the printed output.

    Errors are caught and reported to stdout rather than raised.

    Fix: the original accepted ``top_domains`` and used it for the
    analysis but never forwarded it to print_analysis_results, so the
    computed top list was silently hidden; it is now displayed.
    """
    try:
        results = analyze_browser_history(
            browser_name=browser_name,
            temp_path=temp_path,
            top_domains=top_domains,
            pattern_file_path=pattern_file_path
        )
        print_analysis_results(results, browser_name, top_domains=top_domains, redact=redact)
    except Exception as e:
        print(f"An error occurred: {e}")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main_cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment