Last active
August 20, 2025 03:21
-
-
Save Xevion/53f65d423b199161694a49f67f085ce1 to your computer and use it in GitHub Desktop.
Analyzes browser history to count unique domain visits with pattern normalization.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sqlite3 | |
| import os | |
| import platform | |
| import shutil | |
| from urllib.parse import urlparse | |
| import collections | |
| import datetime | |
| import re | |
| import argparse | |
| import sys | |
def load_domain_patterns(pattern_file_path=None):
    """Build the list of compiled regexes used to collapse hosting/CDN hosts.

    Reads extra patterns from *pattern_file_path* (defaults to
    'domain_patterns.txt') when that file exists, then appends the built-in
    defaults. Blank lines and lines starting with '#' in the file are
    skipped; patterns that fail to compile are dropped with a warning.

    Returns a list of compiled ``re.Pattern`` objects.
    """
    if pattern_file_path is None:
        pattern_file_path = 'domain_patterns.txt'

    # Built-in patterns for common hosting/CDN providers; each captures the
    # provider's base domain in group 1.
    default_patterns = [
        r'^.+\.(cloudfront\.net)$', r'^.+\.(amazonaws\.com)$', r'^.+\.(herokuapp\.com)$',
        r'^.+\.(netlify\.app)$', r'^.+\.(vercel\.app)$', r'^.+\.(github\.io)$',
        r'^.+\.(firebaseapp\.com)$', r'^.+\.(appspot\.com)$', r'^.+\.(azurewebsites\.net)$',
        r'^.+\.(cloudflare\.com)$', r'^.+\.(fastly\.com)$', r'^.+\.(cdn\.com)$',
        r'^.+\.(cdn\.net)$', r'^.+\.(cdn\.org)$', r'^.+\.(s3\.amazonaws\.com)$',
        r'^.+\.(s3-website-[^.]+\.amazonaws\.com)$', r'^.+\.(elasticbeanstalk\.com)$',
        r'^.+\.(railway\.app)$', r'^.+\.(render\.com)$', r'^.+\.(fly\.io)$',
        r'^.+\.(digitaloceanspaces\.com)$', r'^.+\.(bunnycdn\.com)$',
        r'^.+\.(stackpathcdn\.com)$', r'^.+\.(keycdn\.com)$',
    ]

    patterns = []
    if os.path.exists(pattern_file_path):
        try:
            file_patterns = []
            with open(pattern_file_path, 'r', encoding='utf-8') as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    # NOTE: the comment check looks at the *unstripped* line,
                    # so an indented '#' is not treated as a comment.
                    if entry and not raw_line.startswith('#'):
                        file_patterns.append(entry)
            patterns.extend(file_patterns)
        except Exception as e:
            print(f"Warning: Could not load patterns from {pattern_file_path}: {e}")
    patterns.extend(default_patterns)

    compiled = []
    for raw in patterns:
        try:
            compiled.append(re.compile(raw))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{raw}': {e}")
    return compiled
def apply_pattern_normalization(domain, patterns):
    """Collapse *domain* using the first matching pattern.

    Each pattern is tried in order with ``.match``; the first one that
    matches *and* has at least one capture group wins, and its group 1 is
    returned. When nothing qualifies, the domain is returned untouched.
    """
    for compiled in patterns:
        hit = compiled.match(domain)
        if hit and hit.groups():
            return hit.group(1)
    return domain
def normalize_domain(domain, patterns=None):
    """Reduce *domain* to at most its last three labels, then apply patterns.

    e.g. ``a.b.c.d.com`` -> ``c.d.com``. When *patterns* is a non-empty
    list of compiled regexes, pattern normalization is applied afterwards.
    Falsy input (empty string / None) is returned unchanged.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    result = '.'.join(labels[-3:]) if len(labels) > 3 else domain
    if patterns:
        result = apply_pattern_normalization(result, patterns)
    return result
def has_valid_tld(domain):
    """Heuristic check that *domain* ends in a plausible TLD.

    Requires at least two dot-separated labels and a final label that is
    purely alphabetic, all lowercase, and at least two characters long.
    """
    if not domain or '.' not in domain:
        return False
    tld = domain.rsplit('.', 1)[1]
    return len(tld) >= 2 and tld.isalpha() and tld.islower()
def get_browser_history_path(browser_name='Vivaldi'):
    """Locate the on-disk History SQLite file for the requested browser.

    Only Vivaldi is recognized, on Windows, macOS (Darwin), and Linux.
    Raises OSError for any other browser/OS combination.
    """
    system = platform.system()
    if browser_name.lower() == 'vivaldi':
        home = os.path.expanduser('~')
        if system == 'Windows':
            # Windows keeps Chromium profiles under %LOCALAPPDATA%.
            return os.path.join(os.environ['LOCALAPPDATA'], 'Vivaldi', 'User Data', 'Default', 'History')
        if system == 'Darwin':
            return os.path.join(home, 'Library', 'Application Support', 'Vivaldi', 'Default', 'History')
        if system == 'Linux':
            return os.path.join(home, '.config', 'vivaldi', 'default', 'History')
    raise OSError(f"Unsupported browser '{browser_name}' or operating system '{system}'.")
def copy_history_database(history_path, temp_path=None):
    """Copy the browser history DB to a scratch location and return the copy's path.

    The browser may hold the live database locked, so analysis runs on a
    copy. When *temp_path* is None the copy goes to
    ``~/browser_history_copy.db``. Raises FileNotFoundError when the source
    does not exist.
    """
    destination = temp_path
    if destination is None:
        destination = os.path.join(os.path.expanduser("~"), "browser_history_copy.db")
    if not os.path.exists(history_path):
        raise FileNotFoundError(f"History file not found at {history_path}")
    shutil.copyfile(history_path, destination)
    return destination
def get_date_range(cursor):
    """Return (earliest, latest, span_in_days) from the ``visits`` table.

    Visit timestamps are in Chrome/WebKit format: microseconds since
    1601-01-01. Dates are rendered like ``"January 1st, 2024"``. An empty
    table yields placeholder strings and 0; a failed query (e.g. missing
    table) yields error strings and 0.
    """
    try:
        cursor.execute("SELECT MIN(visit_time), MAX(visit_time) FROM visits")
        first_ts, last_ts = cursor.fetchone()
    except sqlite3.OperationalError as e:
        print(f"Error querying visit dates: {e}")
        return "Error retrieving date", "Error retrieving date", 0

    if not (first_ts and last_ts):
        return "No data available", "No data available", 0

    chrome_epoch = datetime.datetime(1601, 1, 1)
    start = chrome_epoch + datetime.timedelta(microseconds=first_ts)
    end = chrome_epoch + datetime.timedelta(microseconds=last_ts)

    def pretty(date):
        # English ordinal suffix: 4th-20th and 24th-30th take "th";
        # the rest take st/nd/rd by last digit.
        day = date.day
        if 4 <= day <= 20 or 24 <= day <= 30:
            suffix = "th"
        else:
            suffix = ("st", "nd", "rd")[day % 10 - 1]
        return date.strftime(f"%B {day}{suffix}, %Y")

    return pretty(start), pretty(end), (end - start).days
def extract_domains_from_urls(cursor, patterns=None):
    """Read every row of the ``urls`` table and tally normalized domains.

    Returns ``(unique_domains, domain_counts, domains_removed)`` where
    *domains_removed* counts hosts rejected for lacking a plausible TLD
    either before or after normalization. URLs that fail to parse are
    reported and skipped.

    Raises Exception when the ``urls`` table cannot be queried.
    """
    try:
        cursor.execute("SELECT url FROM urls")
    except sqlite3.OperationalError as e:
        raise Exception(f"Error querying the database: {e}. The 'urls' table might not exist or the database is corrupt.")

    rows = cursor.fetchall()
    unique_domains = set()
    domain_counts = collections.Counter()
    rejected = 0

    for row in rows:
        url = row[0]
        try:
            host = urlparse(url).netloc
            if not host:
                continue
            if not has_valid_tld(host):
                rejected += 1
                continue
            canonical = normalize_domain(host, patterns)
            # Normalization can produce a bogus tail (e.g. truncated label);
            # re-validate before counting.
            if not has_valid_tld(canonical):
                rejected += 1
                continue
            unique_domains.add(canonical)
            domain_counts[canonical] += 1
        except Exception as e:
            print(f"Could not parse URL: {url} - Error: {e}")

    return unique_domains, domain_counts, rejected
def analyze_browser_history(browser_name='Vivaldi', temp_path=None, top_domains=10, pattern_file_path=None):
    """Run the full analysis pipeline: locate, copy, query, and summarize.

    Args:
        browser_name: Browser whose history to analyze (passed to
            get_browser_history_path).
        temp_path: Optional path for the temporary database copy.
        top_domains: How many entries to precompute for 'top_domains'.
        pattern_file_path: Optional custom pattern file for normalization.

    Returns:
        Dict with keys 'date_range' (earliest, latest, days), 'unique_domains',
        'domain_counts', 'total_unique_domains', 'top_domains', and
        'domains_removed'.

    Raises:
        OSError / FileNotFoundError from path discovery and copying, and
        whatever the query helpers raise.

    The temporary copy is always removed, and the sqlite connection is
    always closed, even when a query raises (previously the connection was
    only closed on the success path, leaking it on error).
    """
    history_path = get_browser_history_path(browser_name)
    temp_history_path = copy_history_database(history_path, temp_path)
    patterns = load_domain_patterns(pattern_file_path)
    try:
        conn = sqlite3.connect(temp_history_path)
        try:
            cursor = conn.cursor()
            earliest_date, latest_date, days_between = get_date_range(cursor)
            unique_domains, domain_counts, domains_removed = extract_domains_from_urls(cursor, patterns)
        finally:
            # Close even if a query raised; the original leaked here.
            conn.close()
        return {
            'date_range': (earliest_date, latest_date, days_between),
            'unique_domains': unique_domains,
            'domain_counts': domain_counts,
            'total_unique_domains': len(unique_domains),
            'top_domains': domain_counts.most_common(top_domains),
            'domains_removed': domains_removed
        }
    finally:
        if os.path.exists(temp_history_path):
            os.remove(temp_history_path)
def format_number(num):
    """Render *num* with thousands separators, e.g. 1234567 -> '1,234,567'."""
    return format(num, ',')
def redact_domain(domain):
    """Mask a domain for display, keeping only the TLD recognizable.

    A short second-level label (3 chars or fewer) collapses the whole thing
    to ``???.tld``; otherwise every non-TLD label is replaced by asterisks
    of matching length. Empty or dot-free inputs come back unchanged.
    """
    if not domain:
        return domain
    labels = domain.split('.')
    if len(labels) <= 1:
        return domain
    if len(labels[-2]) <= 3:
        # Too short to mask meaningfully by length alone.
        return f"???.{labels[-1]}"
    masked = ['*' * len(label) for label in labels[:-1]]
    return '.'.join(masked + [labels[-1]])
def print_analysis_results(results, browser_name='Vivaldi', top_domains=None, bottom_domains=None, redact=False):
    """Pretty-print the summary dict produced by analyze_browser_history.

    *top_domains* / *bottom_domains* choose how many entries of each list
    to show (None hides the section entirely); *redact* masks domain names
    for privacy before printing.
    """
    earliest_date, latest_date, days_between = results['date_range']
    print(f"\n--- {browser_name} History Analysis ---")
    if days_between > 0:
        print(f"Date range: {earliest_date} to {latest_date} ({format_number(days_between)} days)")
    else:
        print(f"Date range: {earliest_date} to {latest_date}")
    print(f"Total unique domains found: {format_number(results['total_unique_domains'])}")
    print(f"Domains removed (no valid TLD): {format_number(results['domains_removed'])}")

    def emit(header, entries):
        # Shared renderer for the top/bottom sections.
        print(header)
        for name, count in entries:
            shown = redact_domain(name) if redact else name
            print(f"- {shown}: {format_number(count)} visits")

    if top_domains is not None:
        shown_count = min(top_domains, len(results['top_domains']))
        emit(f"\nTop {shown_count} most visited domains:", results['top_domains'][:top_domains])

    if bottom_domains is not None and bottom_domains > 0:
        ranked = sorted(results['domain_counts'].items(), key=lambda kv: kv[1])
        tail = ranked[:bottom_domains]
        emit(f"\nBottom {len(tail)} least visited domains:", tail)
def create_parser():
    """Construct the argparse parser for the command-line interface."""
    p = argparse.ArgumentParser(
        description='Analyze browser history to find unique domains and their visit counts.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                           # Default analysis with top 100 domains
  %(prog)s --top 20                  # Show only top 20 domains
  %(prog)s --bottom 10               # Show only bottom 10 domains
  %(prog)s --top 20 --bottom 10      # Show both top 20 and bottom 10
  %(prog)s --browser Chrome          # Analyze Chrome instead of Vivaldi
  %(prog)s --patterns custom.txt     # Use custom pattern file
  %(prog)s --no-patterns             # Disable pattern normalization
  %(prog)s --temp-path /tmp/hist.db  # Use custom temporary file path
  %(prog)s --redact                  # Redact domain names for privacy
        """
    )
    # What to analyze.
    p.add_argument('--browser', '-b', default='Vivaldi', help='Browser to analyze (default: Vivaldi)')
    # How much to display.
    p.add_argument('--top', '-t', type=int, default=None, help='Number of top domains to display (default: 100 when no --bottom specified)')
    p.add_argument('--bottom', '-bt', type=int, default=None, help='Number of bottom domains to display')
    # Normalization behavior.
    p.add_argument('--patterns', '-p', dest='pattern_file_path', help='Path to custom domain pattern file (default: domain_patterns.txt)')
    p.add_argument('--no-patterns', action='store_true', help='Disable pattern-based domain normalization')
    # Misc.
    p.add_argument('--temp-path', help='Custom temporary file path for database copy')
    p.add_argument('--quiet', '-q', action='store_true', help='Suppress warning messages')
    p.add_argument('--redact', action='store_true', help='Redact domain names for privacy (shows only TLD)')
    p.add_argument('--version', action='version', version='%(prog)s 1.0')
    return p
def main_cli():
    """Parse CLI arguments, run the analysis, and print the report.

    Exits with status 1 on invalid arguments or any analysis failure.

    Fix: on a default invocation (neither --top nor --bottom given) the
    original passed ``top_domains=args.top`` — i.e. None — to
    print_analysis_results, which hides the section, so the advertised
    "top 100" default was never printed. We now pass the computed
    ``top_count`` whenever the top section should be shown.
    """
    args = create_parser().parse_args()
    pattern_file_path = None if args.no_patterns else args.pattern_file_path

    # Validate count arguments.
    if args.top is not None and args.top < 0:
        print("Error: --top must be non-negative", file=sys.stderr)
        sys.exit(1)
    if args.bottom is not None and args.bottom < 0:
        print("Error: --bottom must be non-negative", file=sys.stderr)
        sys.exit(1)

    show_top = args.top is not None
    show_bottom = args.bottom is not None
    if not show_top and not show_bottom:
        # No selection at all: default to showing the top 100 domains.
        show_top = True
        top_count = 100
    else:
        top_count = args.top if show_top else 1

    if args.quiet:
        # NOTE: this silences only the warnings module; warnings emitted via
        # print() elsewhere in this script are unaffected.
        import warnings
        warnings.filterwarnings("ignore")

    try:
        results = analyze_browser_history(
            browser_name=args.browser,
            temp_path=args.temp_path,
            top_domains=top_count,
            pattern_file_path=pattern_file_path
        )
        print_analysis_results(
            results,
            browser_name=args.browser,
            # Use top_count (not args.top) so the default run actually
            # prints its top-100 list.
            top_domains=top_count if show_top else None,
            bottom_domains=args.bottom if show_bottom else None,
            redact=args.redact
        )
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
def main(browser_name='Vivaldi', top_domains=100, temp_path=None, pattern_file_path=None, redact=False):
    """Programmatic entry point: run the analysis and print the results.

    Args:
        browser_name: Browser whose history to analyze.
        top_domains: How many top domains to compute and display.
        temp_path: Optional path for the temporary database copy.
        pattern_file_path: Optional custom pattern file.
        redact: Mask domain names in the printed output.

    Errors are caught and reported to stdout rather than raised.

    Fix: the original accepted ``top_domains`` and used it for the
    analysis but never forwarded it to print_analysis_results, so the
    computed top list was silently hidden; it is now displayed.
    """
    try:
        results = analyze_browser_history(
            browser_name=browser_name,
            temp_path=temp_path,
            top_domains=top_domains,
            pattern_file_path=pattern_file_path
        )
        print_analysis_results(results, browser_name, top_domains=top_domains, redact=redact)
    except Exception as e:
        print(f"An error occurred: {e}")
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main_cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment