chord scraper

A batch scraper for Ultimate Guitar chord pages. It reads tab URLs from chord_urls.csv, fetches each page through curl_cffi browser impersonation with randomized delays and periodic session rotation, extracts the artist, track, and chord content from the page's js-store JSON, appends results to chord_results.csv, and feeds newly discovered chord URLs back into the queue.
import csv, json, os, random, re, time
from typing import Optional
from curl_cffi import requests  # pip install curl_cffi
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
BATCH_SIZE = 500              # Lowered slightly to be safe
MIN_DELAY, MAX_DELAY = 12, 40
LONG_DELAY_CHANCE = 0.08      # 8% chance of a "coffee break"
LONG_DELAY_RANGE = (120, 300) # 2-5 minutes
REQUEST_TIMEOUT = 30
MAX_RETRIES = 3
SESSION_ROTATE_LIMIT = 50     # Rotate "browser" after this many requests

CHORD_URL_PATTERN = r'https://tabs\.ultimate-guitar\.com/tab/([^/]+)/([a-z0-9-]+)-chords-(\d+)'
CHORD_URLS_FILE = 'chord_urls.csv'
RESULTS_FILE = 'chord_results.csv'
COLUMN_NAMES = ['url', 'artist', 'track', 'id', 'content']

# Increase CSV field size for massive tabs
csv.field_size_limit(10**7)

# --- HELPER FUNCTIONS ---

def get_impersonate_ver():
    """Rotates between recent browser signatures for new sessions."""
    return random.choice(["chrome120", "chrome119", "safari17_0"])


def create_session():
    """Creates a fresh session with a consistent TLS fingerprint."""
    ver = get_impersonate_ver()
    session = requests.Session(impersonate=ver)
    # Set standard "human" headers that curl_cffi might not auto-populate fully
    session.headers.update({
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
    })
    return session


def warm_up_session(session):
    """Visits the homepage to get initial cookies/CSRF tokens."""
    try:
        print(" [~] Warming up cookies...", end='', flush=True)
        session.get("https://www.ultimate-guitar.com/", timeout=15)
        time.sleep(random.uniform(3, 7))
        print(" Done.")
        return True
    except Exception as e:
        print(f" Failed ({e})")
        return False


def find_in_nested(obj, key):
    """Recursively search for a key in a nested dict/list."""
    if isinstance(obj, dict):
        return obj.get(key) or next((v for v in (find_in_nested(val, key) for val in obj.values()) if v), None)
    if isinstance(obj, list):
        return next((v for v in (find_in_nested(item, key) for item in obj) if v), None)
    return None
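
# Illustrative only: the first truthy match anywhere in the structure wins, e.g.
#   find_in_nested({'tab': {'song_name': 'Some Song'}}, 'song_name')  ->  'Some Song'
# Missing keys (or keys whose values are all falsy) yield None.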


def parse_chord_url(url: str):
    m = re.match(CHORD_URL_PATTERN, url)
    return (m.group(2).replace('-', ' ').lower(), m.group(3)) if m else (None, None)
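
# Example (the URL itself is hypothetical):
#   parse_chord_url('https://tabs.ultimate-guitar.com/tab/some-artist/some-song-chords-1234567')
#   -> ('some song', '1234567')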


def random_delay():
    """Simulates human reading/browsing time."""
    if random.random() < LONG_DELAY_CHANCE:
        delay = random.uniform(*LONG_DELAY_RANGE)
        print(f" [Coffee Break] Sleeping {int(delay)}s...", end='', flush=True)
    else:
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        print(f" {delay:.1f}s", end='', flush=True)
    # Jitter the sleep (break it into small chunks)
    chunks = random.randint(4, 10)
    chunk_size = delay / chunks
    for _ in range(chunks):
        time.sleep(random.uniform(chunk_size * 0.8, chunk_size * 1.2))
        print('.', end='', flush=True)
    print()


def scrape_chord_page(url: str, session: requests.Session):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)

            # --- Anti-Bot Detection Checks ---
            if r.status_code in (403, 429):
                print(f"\n [!] Detected (Status {r.status_code}). Cooling down for 2 mins...")
                time.sleep(120)
                # If 403, our session might be poisoned. Abort this attempt;
                # the except below backs off before retrying.
                raise Exception(f"Blocked with status {r.status_code}")
            if r.status_code != 200:
                print(f" [x] Status {r.status_code}, retrying...")
                time.sleep(10)
                continue

            # --- Parsing ---
            soup = BeautifulSoup(r.text, 'html.parser')
            store = soup.find(class_='js-store')
            if not store:
                # Sometimes they serve a "soft block" page without the data store
                if "Access denied" in r.text or "One more step" in r.text:
                    raise Exception("Cloudflare Challenge Page")
                return None
            data = json.loads(store['data-content'])

            # Harvest new URLs
            urls = set(m.group(0) for m in re.finditer(CHORD_URL_PATTERN, r.text))
            urls.update(l['href'] for l in soup.find_all('a', href=True)
                        if 'ultimate-guitar.com/tab/' in l['href']
                        and '-chords-' in l['href']
                        and re.match(CHORD_URL_PATTERN, l['href']))

            return {
                'artist': find_in_nested(data, 'artist_name'),
                'track': find_in_nested(data, 'song_name'),
                'content': find_in_nested(data, 'content'),
                'urls': urls
            }
        except Exception as e:
            print(f" [!] Retry {attempt}/{MAX_RETRIES}: {e}")
            time.sleep(random.uniform(5, 15))
    return None


def load_csv_column(file_path: str, col=0, skip_header=True):
    if not os.path.exists(file_path): return set()
    with open(file_path, encoding='utf-8') as f:
        reader = csv.reader(f)
        if skip_header: next(reader, None)
        return {row[col] for row in reader if row and len(row) > col and row[col].strip()}


def save_chord_result(url, artist, track, track_id, content):
    exists = os.path.exists(RESULTS_FILE) and os.path.getsize(RESULTS_FILE) > 0
    with open(RESULTS_FILE, 'a', newline='', encoding='utf-8') as f:
        w = csv.writer(f, quoting=csv.QUOTE_ALL)
        if not exists: w.writerow(COLUMN_NAMES)
        w.writerow([url, artist, track, track_id, content])


def append_urls(file_path, urls):
    if not urls: return
    existing = load_csv_column(file_path, skip_header=False)
    new_urls = [u for u in urls if u not in existing]
    if new_urls:
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write('\n'.join(new_urls) + '\n')


# --- MAIN ---

def main():
    print("Initializing TLS-Hardened Scraper...")
    all_urls = load_csv_column(CHORD_URLS_FILE, skip_header=True)
    scraped_urls = load_csv_column(RESULTS_FILE, skip_header=True)

    # Filter valid URLs
    to_scrape = [u for u in all_urls - scraped_urls if re.match(CHORD_URL_PATTERN, u)]
    if not to_scrape:
        print("✓ No URLs to scrape")
        return

    # Randomize queue to avoid hammering one artist/pattern
    random.shuffle(to_scrape)
    to_scrape = to_scrape[:BATCH_SIZE]

    seen_ids = load_csv_column(RESULTS_FILE, 3, skip_header=True)
    total_saved = len(seen_ids)
    print(f"Queue: {len(to_scrape):,} URLs to process.")

    session = create_session()
    warm_up_session(session)

    saved = failed = skipped = new_urls_count = 0
    new_urls_buffer = []

    try:
        for idx, url in enumerate(to_scrape, 1):
            track_name, track_id = parse_chord_url(url)

            # --- Skip Duplicates ---
            if not track_name:
                failed += 1; continue
            if track_id in seen_ids:
                skipped += 1
                print(f"[{idx}/{len(to_scrape)}] ⊘ Duplicate: {track_name}")
                continue

            # --- Session Rotation ---
            if idx % SESSION_ROTATE_LIMIT == 0:
                print("\n[↻] Rotating Browser Session & IP Cool-down...")
                session.close()
                time.sleep(random.uniform(10, 20))
                session = create_session()
                warm_up_session(session)

            # --- Action ---
            random_delay()

            # Update Referer to look like we came from the last page (or home)
            if idx > 1:
                session.headers.update({"Referer": to_scrape[idx - 2]})

            data = scrape_chord_page(url, session)

            if data and data['content']:
                save_chord_result(url, data['artist'], data['track'], track_id, data['content'])
                seen_ids.add(track_id)
                saved += 1

                # Discovery
                discovered = data['urls'] - scraped_urls - all_urls
                valid_new = [u for u in discovered if parse_chord_url(u)[1] not in seen_ids]
                new_urls_buffer.extend(valid_new)
                all_urls.update(valid_new)
                new_urls_count += len(valid_new)

                print(f"[{idx}/{len(to_scrape)}] ✓ {data['track']} | +{len(valid_new)} URLs")
            else:
                failed += 1
                print(f"[{idx}/{len(to_scrape)}] ✗ Failed: {track_name}")
                # If we fail, clear referer to reset navigation "trail"
                session.headers.update({"Referer": "https://www.ultimate-guitar.com/"})
    except KeyboardInterrupt:
        print("\n[!] Stopped by user. Saving progress...")
    finally:
        session.close()
        append_urls(CHORD_URLS_FILE, new_urls_buffer)
        print(f"\nBATCH SUMMARY: {saved} saved | {failed} failed | {skipped} skipped | {new_urls_count} found")


if __name__ == '__main__':
    main()
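
Usage note: the script expects chord_urls.csv to already exist, and load_csv_column skips its first line as a header. A minimal sketch for seeding the queue, assuming a single URL column and a purely hypothetical starting tab URL:

# seed.py: create the input file the scraper reads on startup.
# The first line is treated as a header (the loader skips it); the URL below is hypothetical.
with open('chord_urls.csv', 'w', encoding='utf-8') as f:
    f.write('url\n')
    f.write('https://tabs.ultimate-guitar.com/tab/some-artist/some-song-chords-1234567\n')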