chord scraper
import csv, json, os, random, re, time
from typing import Optional
from curl_cffi import requests # pip install curl_cffi
from bs4 import BeautifulSoup
# --- CONFIGURATION ---
BATCH_SIZE = 500 # Lowered slightly to be safe
MIN_DELAY, MAX_DELAY = 12, 40
LONG_DELAY_CHANCE = 0.08 # 8% chance of a "coffee break"
LONG_DELAY_RANGE = (120, 300) # 2-5 minutes
REQUEST_TIMEOUT = 30
MAX_RETRIES = 3
SESSION_ROTATE_LIMIT = 50 # Rotate "browser" after this many requests
CHORD_URL_PATTERN = r'https://tabs\.ultimate-guitar\.com/tab/([^/]+)/([a-z0-9-]+)-chords-(\d+)'
CHORD_URLS_FILE = 'chord_urls.csv'
RESULTS_FILE = 'chord_results.csv'
COLUMN_NAMES = ['url', 'artist', 'track', 'id', 'content']
# Increase CSV field size for massive tabs
csv.field_size_limit(10**7)
# --- HELPER FUNCTIONS ---
def get_impersonate_ver():
    """Rotates between recent browser signatures for new sessions."""
    return random.choice(["chrome120", "chrome119", "safari17_0"])
def create_session():
    """Creates a fresh session with a consistent TLS fingerprint."""
    ver = get_impersonate_ver()
    session = requests.Session(impersonate=ver)
    # Set standard "human" headers that curl_cffi might not auto-populate fully
    session.headers.update({
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
    })
    return session
def warm_up_session(session):
    """Visits the homepage to get initial cookies/CSRF tokens."""
    try:
        print(" [~] Warming up cookies...", end='', flush=True)
        session.get("https://www.ultimate-guitar.com/", timeout=15)
        time.sleep(random.uniform(3, 7))
        print(" Done.")
        return True
    except Exception as e:
        print(f" Failed ({e})")
        return False
def find_in_nested(obj, key):
    """Recursively search for a key in a nested dict/list."""
    if isinstance(obj, dict):
        return obj.get(key) or next((v for v in (find_in_nested(val, key) for val in obj.values()) if v), None)
    if isinstance(obj, list):
        return next((v for v in (find_in_nested(item, key) for item in obj) if v), None)
    return None
def parse_chord_url(url: str):
    m = re.match(CHORD_URL_PATTERN, url)
    return (m.group(2).replace('-', ' ').lower(), m.group(3)) if m else (None, None)
def random_delay():
    """Simulates human reading/browsing time."""
    if random.random() < LONG_DELAY_CHANCE:
        delay = random.uniform(*LONG_DELAY_RANGE)
        print(f" [Coffee Break] Sleeping {int(delay)}s...", end='', flush=True)
    else:
        delay = random.uniform(MIN_DELAY, MAX_DELAY)
        print(f" {delay:.1f}s", end='', flush=True)
    # Jitter the sleep (break into small chunks)
    chunks = random.randint(4, 10)
    chunk_size = delay / chunks
    for _ in range(chunks):
        time.sleep(random.uniform(chunk_size * 0.8, chunk_size * 1.2))
        print('.', end='', flush=True)
    print()
def scrape_chord_page(url: str, session: requests.Session):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
            # --- Anti-Bot Detection Checks ---
            if r.status_code == 403 or r.status_code == 429:
                print(f"\n [!] Detected (Status {r.status_code}). Cooling down for 2 mins...")
                time.sleep(120)
                # If 403, our session might be poisoned. Raise error to trigger session rotation.
                raise Exception(f"Blocked with status {r.status_code}")
            if r.status_code != 200:
                print(f" [x] Status {r.status_code}, retrying...")
                time.sleep(10)
                continue
            # --- Parsing ---
            soup = BeautifulSoup(r.text, 'html.parser')
            store = soup.find(class_='js-store')
            if not store:
                # Sometimes they serve a "soft block" page without the data store
                if "Access denied" in r.text or "One more step" in r.text:
                    raise Exception("Cloudflare Challenge Page")
                return None
            data = json.loads(store['data-content'])
            # Harvest new URLs
            urls = set(m.group(0) for m in re.finditer(CHORD_URL_PATTERN, r.text))
            urls.update(l['href'] for l in soup.find_all('a', href=True)
                        if 'ultimate-guitar.com/tab/' in l['href']
                        and '-chords-' in l['href']
                        and re.match(CHORD_URL_PATTERN, l['href']))
            return {
                'artist': find_in_nested(data, 'artist_name'),
                'track': find_in_nested(data, 'song_name'),
                'content': find_in_nested(data, 'content'),
                'urls': urls
            }
        except Exception as e:
            print(f" [!] Retry {attempt}/{MAX_RETRIES}: {e}")
            time.sleep(random.uniform(5, 15))
    return None
def load_csv_column(file_path: str, col=0, skip_header=True):
    if not os.path.exists(file_path): return set()
    with open(file_path, encoding='utf-8') as f:
        reader = csv.reader(f)
        if skip_header: next(reader, None)
        return {row[col] for row in reader if row and len(row) > col and row[col].strip()}
def save_chord_result(url, artist, track, track_id, content):
    exists = os.path.exists(RESULTS_FILE) and os.path.getsize(RESULTS_FILE) > 0
    with open(RESULTS_FILE, 'a', newline='', encoding='utf-8') as f:
        w = csv.writer(f, quoting=csv.QUOTE_ALL)
        if not exists: w.writerow(COLUMN_NAMES)
        w.writerow([url, artist, track, track_id, content])
def append_urls(file_path, urls):
    if not urls: return
    existing = load_csv_column(file_path, skip_header=False)
    new_urls = [u for u in urls if u not in existing]
    if new_urls:
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write('\n'.join(new_urls) + '\n')
# --- MAIN ---
def main():
    print("Initializing TLS-Hardened Scraper...")
    all_urls = load_csv_column(CHORD_URLS_FILE, skip_header=True)
    scraped_urls = load_csv_column(RESULTS_FILE, skip_header=True)
    # Filter valid URLs
    to_scrape = [u for u in all_urls - scraped_urls if re.match(CHORD_URL_PATTERN, u)]
    if not to_scrape:
        print("✓ No URLs to scrape")
        return
    # Randomize queue to avoid hammering one artist/pattern
    random.shuffle(to_scrape)
    to_scrape = to_scrape[:BATCH_SIZE]
    seen_ids = load_csv_column(RESULTS_FILE, 3, skip_header=True)
    total_saved = len(seen_ids)
    print(f"Queue: {len(to_scrape):,} URLs to process.")
    session = create_session()
    warm_up_session(session)
    saved = failed = skipped = new_urls_count = 0
    new_urls_buffer = []
    try:
        for idx, url in enumerate(to_scrape, 1):
            track_name, track_id = parse_chord_url(url)
            # --- Skip Duplicates ---
            if not track_name:
                failed += 1
                continue
            if track_id in seen_ids:
                skipped += 1
                print(f"[{idx}/{len(to_scrape)}] ⊘ Duplicate: {track_name}")
                continue
            # --- Session Rotation ---
            if idx % SESSION_ROTATE_LIMIT == 0:
                print("\n[↻] Rotating Browser Session & IP Cool-down...")
                session.close()
                time.sleep(random.uniform(10, 20))
                session = create_session()
                warm_up_session(session)
            # --- Action ---
            random_delay()
            # Update Referer to look like we came from the last page (or home)
            if idx > 1:
                session.headers.update({"Referer": to_scrape[idx - 2]})
            data = scrape_chord_page(url, session)
            if data and data['content']:
                save_chord_result(url, data['artist'], data['track'], track_id, data['content'])
                seen_ids.add(track_id)
                saved += 1
                # Discovery
                discovered = data['urls'] - scraped_urls - all_urls
                valid_new = [u for u in discovered if parse_chord_url(u)[1] not in seen_ids]
                new_urls_buffer.extend(valid_new)
                all_urls.update(valid_new)
                new_urls_count += len(valid_new)
                print(f"[{idx}/{len(to_scrape)}] ✓ {data['track']} | +{len(valid_new)} URLs")
            else:
                failed += 1
                print(f"[{idx}/{len(to_scrape)}] ✗ Failed: {track_name}")
                # If we fail, clear referer to reset navigation "trail"
                session.headers.update({"Referer": "https://www.ultimate-guitar.com/"})
    except KeyboardInterrupt:
        print("\n[!] Stopped by user. Saving progress...")
    finally:
        session.close()
        append_urls(CHORD_URLS_FILE, new_urls_buffer)
        print(f"\nBATCH SUMMARY: {saved} saved | {skipped} skipped | {failed} failed | "
              f"{new_urls_count} new URLs found | {total_saved + saved} total results")

if __name__ == '__main__':
    main()
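
The scraper reads its queue from chord_urls.csv and skips the first row of that file (load_csv_column is called with skip_header=True), so the file needs a header line followed by one URL per line. Below is a minimal seeding sketch, assuming you already have a few chord-tab URLs to start from; the URL shown is a hypothetical placeholder in the format CHORD_URL_PATTERN expects, not a real tab.

# seed_urls.py -- hypothetical helper for creating the queue file the scraper reads.
import csv

SEED_URLS = [
    # Placeholder URL; replace with real chord-tab links before running the scraper.
    "https://tabs.ultimate-guitar.com/tab/some-artist/some-song-chords-1234567",
]

with open('chord_urls.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(['url'])  # header row; main() skips it when loading the queue
    for u in SEED_URLS:
        w.writerow([u])

From there, each run of the script processes at most BATCH_SIZE queued URLs and appends any newly discovered chord URLs back into chord_urls.csv, so subsequent runs continue the crawl from where the last one stopped.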