chord scraper

A batch scraper for Ultimate Guitar chord pages. It reads tab URLs from chord_urls.csv, fetches each page through curl_cffi browser impersonation with randomized delays and periodic session rotation, extracts the artist, track, and chord content from the page's js-store JSON, appends results to chord_results.csv, and feeds newly discovered chord URLs back into the queue.
import csv, json, os, random, re, time
from typing import Optional
from curl_cffi import requests  # pip install curl_cffi
from bs4 import BeautifulSoup

# --- CONFIGURATION ---
BATCH_SIZE = 500              # Lowered slightly to be safe
MIN_DELAY, MAX_DELAY = 12, 40
LONG_DELAY_CHANCE = 0.08      # 8% chance of a "coffee break"
LONG_DELAY_RANGE = (120, 300) # 2-5 minutes
REQUEST_TIMEOUT = 30
MAX_RETRIES = 3
SESSION_ROTATE_LIMIT = 50     # Rotate "browser" after this many requests

CHORD_URL_PATTERN = r'https://tabs\.ultimate-guitar\.com/tab/([^/]+)/([a-z0-9-]+)-chords-(\d+)'
CHORD_URLS_FILE = 'chord_urls.csv'
RESULTS_FILE = 'chord_results.csv'
COLUMN_NAMES = ['url', 'artist', 'track', 'id', 'content']

# Increase CSV field size for massive tabs
csv.field_size_limit(10**7)

# --- HELPER FUNCTIONS ---

def get_impersonate_ver():
    """Rotates between recent browser signatures for new sessions."""
    return random.choice(["chrome120", "chrome119", "safari17_0"])


def create_session():
    """Creates a fresh session with a consistent TLS fingerprint."""
    ver = get_impersonate_ver()
    session = requests.Session(impersonate=ver)
    # Set standard "human" headers that curl_cffi might not auto-populate fully
    session.headers.update({
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
    })
    return session


def warm_up_session(session):
    """Visits the homepage to get initial cookies/CSRF tokens."""
    try:
        print(" [~] Warming up cookies...", end='', flush=True)
        session.get("https://www.ultimate-guitar.com/", timeout=15)
        time.sleep(random.uniform(3, 7))
        print(" Done.")
        return True
    except Exception as e:
        print(f" Failed ({e})")
        return False


def find_in_nested(obj, key):
    """Recursively search for a key in a nested dict/list."""
    if isinstance(obj, dict):
        return obj.get(key) or next((v for v in (find_in_nested(val, key) for val in obj.values()) if v), None)
    if isinstance(obj, list):
        return next((v for v in (find_in_nested(item, key) for item in obj) if v), None)
    return None
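
# Illustrative only: the first truthy match anywhere in the structure wins, e.g.
#   find_in_nested({'tab': {'song_name': 'Some Song'}}, 'song_name')  ->  'Some Song'
# Missing keys (or keys whose values are all falsy) yield None.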


def parse_chord_url(url: str):
    m = re.match(CHORD_URL_PATTERN, url)
    return (m.group(2).replace('-', ' ').lower(), m.group(3)) if m else (None, None)
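
# Example (the URL itself is hypothetical):
#   parse_chord_url('https://tabs.ultimate-guitar.com/tab/some-artist/some-song-chords-1234567')
#   -> ('some song', '1234567')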


def random_delay():
    """Simulates human reading/browsing time."""
    if random.random() < LONG_DELAY_CHANCE:
        delay = random.uniform(*LONG_DELAY_RANGE)
        print(f" [Coffee Break] Sleeping {int(delay)}s...", end='', flush=True)
    else:
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        print(f" {delay:.1f}s", end='', flush=True)
    # Jitter the sleep (break it into small chunks)
    chunks = random.randint(4, 10)
    chunk_size = delay / chunks
    for _ in range(chunks):
        time.sleep(random.uniform(chunk_size * 0.8, chunk_size * 1.2))
        print('.', end='', flush=True)
    print()


def scrape_chord_page(url: str, session: requests.Session):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)

            # --- Anti-Bot Detection Checks ---
            if r.status_code in (403, 429):
                print(f"\n [!] Detected (Status {r.status_code}). Cooling down for 2 mins...")
                time.sleep(120)
                # If 403, our session might be poisoned. Abort this attempt;
                # the except below backs off before retrying.
                raise Exception(f"Blocked with status {r.status_code}")
            if r.status_code != 200:
                print(f" [x] Status {r.status_code}, retrying...")
                time.sleep(10)
                continue

            # --- Parsing ---
            soup = BeautifulSoup(r.text, 'html.parser')
            store = soup.find(class_='js-store')
            if not store:
                # Sometimes they serve a "soft block" page without the data store
                if "Access denied" in r.text or "One more step" in r.text:
                    raise Exception("Cloudflare Challenge Page")
                return None
            data = json.loads(store['data-content'])

            # Harvest new URLs
            urls = set(m.group(0) for m in re.finditer(CHORD_URL_PATTERN, r.text))
            urls.update(l['href'] for l in soup.find_all('a', href=True)
                        if 'ultimate-guitar.com/tab/' in l['href']
                        and '-chords-' in l['href']
                        and re.match(CHORD_URL_PATTERN, l['href']))

            return {
                'artist': find_in_nested(data, 'artist_name'),
                'track': find_in_nested(data, 'song_name'),
                'content': find_in_nested(data, 'content'),
                'urls': urls
            }
        except Exception as e:
            print(f" [!] Retry {attempt}/{MAX_RETRIES}: {e}")
            time.sleep(random.uniform(5, 15))
    return None


def load_csv_column(file_path: str, col=0, skip_header=True):
    if not os.path.exists(file_path): return set()
    with open(file_path, encoding='utf-8') as f:
        reader = csv.reader(f)
        if skip_header: next(reader, None)
        return {row[col] for row in reader if row and len(row) > col and row[col].strip()}


def save_chord_result(url, artist, track, track_id, content):
    exists = os.path.exists(RESULTS_FILE) and os.path.getsize(RESULTS_FILE) > 0
    with open(RESULTS_FILE, 'a', newline='', encoding='utf-8') as f:
        w = csv.writer(f, quoting=csv.QUOTE_ALL)
        if not exists: w.writerow(COLUMN_NAMES)
        w.writerow([url, artist, track, track_id, content])


def append_urls(file_path, urls):
    if not urls: return
    existing = load_csv_column(file_path, skip_header=False)
    new_urls = [u for u in urls if u not in existing]
    if new_urls:
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write('\n'.join(new_urls) + '\n')


# --- MAIN ---

def main():
    print("Initializing TLS-Hardened Scraper...")
    all_urls = load_csv_column(CHORD_URLS_FILE, skip_header=True)
    scraped_urls = load_csv_column(RESULTS_FILE, skip_header=True)

    # Filter valid URLs
    to_scrape = [u for u in all_urls - scraped_urls if re.match(CHORD_URL_PATTERN, u)]
    if not to_scrape:
        print("✓ No URLs to scrape")
        return

    # Randomize queue to avoid hammering one artist/pattern
    random.shuffle(to_scrape)
    to_scrape = to_scrape[:BATCH_SIZE]

    seen_ids = load_csv_column(RESULTS_FILE, 3, skip_header=True)
    total_saved = len(seen_ids)
    print(f"Queue: {len(to_scrape):,} URLs to process.")

    session = create_session()
    warm_up_session(session)

    saved = failed = skipped = new_urls_count = 0
    new_urls_buffer = []

    try:
        for idx, url in enumerate(to_scrape, 1):
            track_name, track_id = parse_chord_url(url)

            # --- Skip Duplicates ---
            if not track_name:
                failed += 1; continue
            if track_id in seen_ids:
                skipped += 1
                print(f"[{idx}/{len(to_scrape)}] ⊘ Duplicate: {track_name}")
                continue

            # --- Session Rotation ---
            if idx % SESSION_ROTATE_LIMIT == 0:
                print("\n[↻] Rotating Browser Session & IP Cool-down...")
                session.close()
                time.sleep(random.uniform(10, 20))
                session = create_session()
                warm_up_session(session)

            # --- Action ---
            random_delay()

            # Update Referer to look like we came from the last page (or home)
            if idx > 1:
                session.headers.update({"Referer": to_scrape[idx - 2]})

            data = scrape_chord_page(url, session)

            if data and data['content']:
                save_chord_result(url, data['artist'], data['track'], track_id, data['content'])
                seen_ids.add(track_id)
                saved += 1

                # Discovery
                discovered = data['urls'] - scraped_urls - all_urls
                valid_new = [u for u in discovered if parse_chord_url(u)[1] not in seen_ids]
                new_urls_buffer.extend(valid_new)
                all_urls.update(valid_new)
                new_urls_count += len(valid_new)

                print(f"[{idx}/{len(to_scrape)}] ✓ {data['track']} | +{len(valid_new)} URLs")
            else:
                failed += 1
                print(f"[{idx}/{len(to_scrape)}] ✗ Failed: {track_name}")
                # If we fail, clear referer to reset navigation "trail"
                session.headers.update({"Referer": "https://www.ultimate-guitar.com/"})
    except KeyboardInterrupt:
        print("\n[!] Stopped by user. Saving progress...")
    finally:
        session.close()
        append_urls(CHORD_URLS_FILE, new_urls_buffer)
        print(f"\nBATCH SUMMARY: {saved} saved | {failed} failed | {skipped} skipped | {new_urls_count} found")


if __name__ == '__main__':
    main()
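
Usage note: the script expects chord_urls.csv to already exist, and load_csv_column skips its first line as a header. A minimal sketch for seeding the queue, assuming a single URL column and a purely hypothetical starting tab URL:

# seed.py: create the input file the scraper reads on startup.
# The first line is treated as a header (the loader skips it); the URL below is hypothetical.
with open('chord_urls.csv', 'w', encoding='utf-8') as f:
    f.write('url\n')
    f.write('https://tabs.ultimate-guitar.com/tab/some-artist/some-song-chords-1234567\n')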