@benlacey57
Created May 24, 2025 07:40
Free Proxy Scraping and Testing Tool
# Version 2
# Install dependencies: pip install requests bs4 gspread oauth2client
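# To run (the filename is an assumption; save the script under any name):
#   python proxy_manager.py
# The script presents an interactive menu: scrape proxies, manage sources,
# test proxies, and sync results to Google Sheets.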
import csv
import logging
import os
import requests
import time
import datetime
import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import json
from pathlib import Path
import sys
# Configuration
PREFERRED_COUNTRIES = ['GB', 'US', 'CA', 'DE', 'FR', 'NL', 'ES']
CHECK_INTERVAL = {
    'WORKING': 7,
    'NOT_WORKING': 30
}
LOG_FILE = 'proxy_manager.log'
MAX_LOG_SIZE_MB = 10
MAX_REQUESTS_PER_SOURCE = 3  # Max requests before implementing delays
REQUEST_DELAY_RANGE = (2, 8)  # Random delay range in seconds

# User agents for rotation
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0'
]
def clear_screen():
    """Clear the terminal screen"""
    os.system('cls' if os.name == 'nt' else 'clear')
def setup_logging():
    """Setup logging with automatic log pruning"""
    # Check and prune log file if it's too large
    if Path(LOG_FILE).exists():
        log_size_mb = Path(LOG_FILE).stat().st_size / (1024 * 1024)
        if log_size_mb > MAX_LOG_SIZE_MB:
            # Keep only the last 1000 lines
            with open(LOG_FILE, 'r') as f:
                lines = f.readlines()
            with open(LOG_FILE, 'w') as f:
                f.writelines(lines[-1000:])
            print(f"Log file pruned (was {log_size_mb:.1f}MB)")

    # Configure logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(LOG_FILE),
            logging.StreamHandler()
        ]
    )
def get_random_headers():
    """Get random headers for requests"""
    return {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
def load_proxy_sources():
    """Load proxy sources from JSON file"""
    sources_file = 'proxy_sources.json'
    default_sources = [
        {
            'name': 'Free Proxy List',
            'url': 'https://free-proxy-list.net/',
            'scraper': 'free_proxy_list_scraper',
            'enabled': True,
            'request_count': 0
        },
        {
            'name': 'SSL Proxies',
            'url': 'https://www.sslproxies.org/',
            'scraper': 'free_proxy_list_scraper',
            'enabled': True,
            'request_count': 0
        },
        {
            'name': 'Geonode Proxy List',
            'url': 'https://proxylist.geonode.com/',
            'scraper': 'geonode_scraper',
            'enabled': True,
            'request_count': 0
        }
    ]

    if Path(sources_file).exists():
        try:
            with open(sources_file, 'r') as f:
                sources = json.load(f)
            # Add request_count if not present
            for source in sources:
                if 'request_count' not in source:
                    source['request_count'] = 0
            return sources
        except Exception as e:
            logging.error(f"Error loading proxy sources: {e}")
            return default_sources
    else:
        # Create default sources file
        save_proxy_sources(default_sources)
        return default_sources
def save_proxy_sources(sources):
    """Save proxy sources to JSON file"""
    with open('proxy_sources.json', 'w') as f:
        json.dump(sources, f, indent=2)
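# For reference, the proxy_sources.json file read and written above mirrors the
# default_sources structure, e.g. (a sketch of one entry, not a complete file):
# [
#   {
#     "name": "Free Proxy List",
#     "url": "https://free-proxy-list.net/",
#     "scraper": "free_proxy_list_scraper",
#     "enabled": true,
#     "request_count": 0
#   }
# ]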
def load_existing_proxies():
    """Load existing proxies from CSV"""
    proxies_db = {}
    if Path('all_proxies.csv').exists():
        try:
            with open('all_proxies.csv', 'r', newline='') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    key = f"{row['ip']}:{row['port']}"
                    proxies_db[key] = row
            logging.info(f"Loaded {len(proxies_db)} existing proxies")
        except Exception as e:
            logging.warning(f"Error reading existing proxies: {str(e)}")
    return proxies_db
def get_proxy_stats():
    """Get current proxy statistics"""
    working_count = 0
    total_sources = len(load_proxy_sources())

    if Path('working_proxies.csv').exists():
        try:
            with open('working_proxies.csv', 'r', newline='') as csvfile:
                reader = csv.DictReader(csvfile)
                working_count = sum(1 for row in reader)
        except Exception:
            pass

    return working_count, total_sources
def free_proxy_list_scraper(html_content):
    """Scrape proxies from free-proxy-list.net and similar sites"""
    soup = BeautifulSoup(html_content, 'html.parser')
    proxies = []
    try:
        table = soup.find('table', class_='table table-striped table-bordered')
        rows = table.tbody.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            if cols:
                ip = cols[0].text.strip()
                port = cols[1].text.strip()
                country_code = cols[2].text.strip()
                https = cols[6].text.strip() == 'yes'
                proxy = {
                    'ip': ip,
                    'port': port,
                    'country': country_code,
                    'https': https
                }
                proxies.append(proxy)
        return proxies
    except Exception as e:
        logging.error(f"Error parsing with free_proxy_list_scraper: {str(e)}")
        return []
def geonode_scraper(html_content):
    """Scrape proxies from proxylist.geonode.com"""
    soup = BeautifulSoup(html_content, 'html.parser')
    proxies = []
    try:
        # Look for the proxy table
        table = soup.find('table', class_='table')
        if table:
            rows = table.find('tbody').find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) >= 7:
                    ip = cols[0].text.strip()
                    port = cols[1].text.strip()
                    country = cols[4].text.strip()[:2]  # Get country code
                    https = 'yes' in cols[5].text.lower()
                    proxy = {
                        'ip': ip,
                        'port': port,
                        'country': country,
                        'https': https
                    }
                    proxies.append(proxy)
        return proxies
    except Exception as e:
        logging.error(f"Error parsing with geonode_scraper: {str(e)}")
        return []
def scrape_proxies():
    """Scrape proxies from all enabled sources"""
    clear_screen()
    print("\n=== Scraping Proxies ===")
    logging.info("Starting proxy scraping")

    sources = load_proxy_sources()
    enabled_sources = [s for s in sources if s.get('enabled', True)]

    if not enabled_sources:
        print("No enabled proxy sources found!")
        return

    existing_proxies = load_existing_proxies()
    all_proxies = []

    for source in enabled_sources:
        try:
            print(f"Scraping from {source['name']}...")
            logging.info(f"Scraping from {source['name']} ({source['url']})")

            # Check if we need to apply rate limiting
            if source['request_count'] >= MAX_REQUESTS_PER_SOURCE:
                delay = random.uniform(*REQUEST_DELAY_RANGE)
                print(f" Rate limiting: waiting {delay:.1f} seconds...")
                logging.info(f"Rate limiting for {source['name']}: {delay:.1f}s delay")
                time.sleep(delay)
                source['request_count'] = 0  # Reset counter

            headers = get_random_headers()
            logging.debug(f"Using User-Agent: {headers['User-Agent']}")

            response = requests.get(source['url'], headers=headers, timeout=15)
            source['request_count'] += 1

            if response.status_code == 200:
                scraper_func = globals()[source['scraper']]
                proxies = scraper_func(response.text)
                for proxy in proxies:
                    proxy['source'] = source['name']
                print(f" ✓ Found {len(proxies)} proxies from {source['name']}")
                logging.info(f"Found {len(proxies)} proxies from {source['name']}")
                all_proxies.extend(proxies)
            else:
                print(f" ✗ Failed to retrieve from {source['name']}: Status {response.status_code}")
                logging.error(f"Failed to retrieve from {source['name']}: Status {response.status_code}")

            # Add blank line between sources
            print()
        except Exception as e:
            print(f" ✗ Error scraping from {source['name']}: {str(e)}")
            logging.error(f"Error scraping from {source['name']}: {str(e)}")
            print()

    # Save updated request counts
    save_proxy_sources(sources)

    # Filter by preferred countries
    if PREFERRED_COUNTRIES:
        filtered_proxies = [p for p in all_proxies if p['country'] in PREFERRED_COUNTRIES]
        print(f"Filtered to {len(filtered_proxies)} proxies from preferred countries: {', '.join(PREFERRED_COUNTRIES)}")
        all_proxies = filtered_proxies

    # Merge with existing data
    for proxy in all_proxies:
        key = f"{proxy['ip']}:{proxy['port']}"
        if key in existing_proxies:
            proxy['last_checked'] = existing_proxies[key]['last_checked']
            proxy['status'] = existing_proxies[key]['status']
            proxy['speed'] = existing_proxies[key].get('speed', '')
            proxy['location'] = existing_proxies[key].get('location', '')
        else:
            proxy['last_checked'] = ''
            proxy['status'] = 'New'

    # Save to CSV
    if all_proxies:
        with open('proxies.csv', 'w', newline='') as csvfile:
            fieldnames = ['ip', 'port', 'country', 'https', 'source', 'status', 'speed', 'location', 'last_checked']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_proxies)
        print(f"\n✓ Saved {len(all_proxies)} proxies to proxies.csv")
        logging.info(f"Saved {len(all_proxies)} proxies to proxies.csv")
    else:
        print("\n✗ No proxies found!")
def manage_proxy_websites():
    """CRUD operations for proxy websites"""
    while True:
        clear_screen()
        print("\n=== Manage Proxy Websites ===")
        sources = load_proxy_sources()

        print("Current proxy sources:")
        for i, source in enumerate(sources, 1):
            status = "✓" if source.get('enabled', True) else "✗"
            req_count = source.get('request_count', 0)
            print(f"{i}. {status} {source['name']} - {source['url']} (Requests: {req_count})")

        print("\nOptions:")
        print("1. Add new source")
        print("2. Edit source")
        print("3. Enable/Disable source")
        print("4. Delete source")
        print("5. Reset request counters")
        print("6. Back to main menu")

        choice = input("\nChoose option (1-6): ").strip()

        if choice == '1':
            add_proxy_source(sources)
        elif choice == '2':
            edit_proxy_source(sources)
        elif choice == '3':
            toggle_proxy_source(sources)
        elif choice == '4':
            delete_proxy_source(sources)
        elif choice == '5':
            reset_request_counters(sources)
        elif choice == '6':
            break
        else:
            print("Invalid choice!")

        if choice != '6':
            input("\nPress Enter to continue...")
def reset_request_counters(sources):
    """Reset request counters for all sources"""
    for source in sources:
        source['request_count'] = 0
    save_proxy_sources(sources)
    print("All request counters reset!")
    logging.info("Reset request counters for all proxy sources")
def add_proxy_source(sources):
    """Add a new proxy source"""
    print("\n=== Add New Proxy Source ===")
    name = input("Enter source name: ").strip()
    url = input("Enter source URL: ").strip()
    scraper = input("Enter scraper function name (default: free_proxy_list_scraper): ").strip()
    if not scraper:
        scraper = 'free_proxy_list_scraper'

    new_source = {
        'name': name,
        'url': url,
        'scraper': scraper,
        'enabled': True,
        'request_count': 0
    }
    sources.append(new_source)
    save_proxy_sources(sources)
    print(f"Added new proxy source: {name}")
    logging.info(f"Added new proxy source: {name}")
def edit_proxy_source(sources):
    """Edit an existing proxy source"""
    if not sources:
        print("No sources to edit!")
        return

    try:
        index = int(input("Enter source number to edit: ")) - 1
        if 0 <= index < len(sources):
            source = sources[index]
            print(f"\nEditing: {source['name']}")

            name = input(f"Name ({source['name']}): ").strip()
            url = input(f"URL ({source['url']}): ").strip()
            scraper = input(f"Scraper ({source['scraper']}): ").strip()

            if name:
                source['name'] = name
            if url:
                source['url'] = url
            if scraper:
                source['scraper'] = scraper

            save_proxy_sources(sources)
            print("Source updated successfully!")
            logging.info(f"Updated proxy source: {source['name']}")
        else:
            print("Invalid source number!")
    except ValueError:
        print("Invalid input!")
def toggle_proxy_source(sources):
    """Enable/disable a proxy source"""
    if not sources:
        print("No sources to toggle!")
        return

    try:
        index = int(input("Enter source number to toggle: ")) - 1
        if 0 <= index < len(sources):
            source = sources[index]
            source['enabled'] = not source.get('enabled', True)
            status = "enabled" if source['enabled'] else "disabled"
            save_proxy_sources(sources)
            print(f"Source '{source['name']}' {status}")
            logging.info(f"Source '{source['name']}' {status}")
        else:
            print("Invalid source number!")
    except ValueError:
        print("Invalid input!")
def delete_proxy_source(sources):
    """Delete a proxy source"""
    if not sources:
        print("No sources to delete!")
        return

    try:
        index = int(input("Enter source number to delete: ")) - 1
        if 0 <= index < len(sources):
            source = sources[index]
            confirm = input(f"Delete '{source['name']}'? (y/N): ").strip().lower()
            if confirm == 'y':
                deleted_source = sources.pop(index)
                save_proxy_sources(sources)
                print(f"Deleted source: {deleted_source['name']}")
                logging.info(f"Deleted proxy source: {deleted_source['name']}")
            else:
                print("Deletion cancelled")
        else:
            print("Invalid source number!")
    except ValueError:
        print("Invalid input!")
def test_proxy(proxy):
    """Test a single proxy with live output"""
    proxy_str = f"{proxy['ip']}:{proxy['port']}"
    # Route both schemes through the proxy (requests tunnels HTTPS via CONNECT).
    # Leaving the 'https' entry as None would let HTTPS test requests bypass the
    # proxy entirely and report dead proxies as working.
    proxy_dict = {
        "http": f"http://{proxy_str}",
        "https": f"http://{proxy_str}"
    }
    proxy['last_checked'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Show testing progress
    print(f"Testing {proxy_str} ({proxy['country']})...", end=' ', flush=True)

    start_time = time.time()
    try:
        response = requests.get('https://ipinfo.io/json',
                                proxies=proxy_dict,
                                timeout=10)
        if response.status_code == 200:
            data = response.json()
            end_time = time.time()
            speed = round(end_time - start_time, 2)

            proxy['status'] = 'Working'
            proxy['speed'] = speed
            proxy['location'] = f"{data.get('city', 'Unknown')}, {data.get('country', 'Unknown')}"

            print(f"✓ Working ({speed}s) - {proxy['location']}")
            logging.info(f"Proxy {proxy_str} is working - Speed: {speed}s - Location: {proxy['location']}")
            return proxy
        else:
            print(f"✗ Status {response.status_code}")
            proxy['status'] = 'Not Working'
            return proxy
    except requests.exceptions.RequestException as e:
        print(f"✗ Failed ({str(e)[:30]}...)")
        logging.warning(f"Proxy {proxy_str} failed connection test: {str(e)}")
        proxy['status'] = 'Not Working'
        return proxy
def test_proxies():
    """Test all proxies with live output"""
    clear_screen()
    print("\n=== Testing Proxies ===")

    if not Path('proxies.csv').exists():
        print("No proxies file found! Please scrape proxies first.")
        return

    proxies = []
    with open('proxies.csv', 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        proxies = list(reader)

    if not proxies:
        print("No proxies to test!")
        return

    print(f"Testing {len(proxies)} proxies...\n")
    logging.info(f"Starting to test {len(proxies)} proxies")

    # Test proxies one by one to show progress
    tested_proxies = []
    working_count = 0

    for i, proxy in enumerate(proxies, 1):
        print(f"[{i}/{len(proxies)}] ", end='')
        tested_proxy = test_proxy(proxy)
        tested_proxies.append(tested_proxy)
        if tested_proxy['status'] == 'Working':
            working_count += 1

    print("\n=== Testing Complete ===")
    print(f"Working proxies: {working_count}/{len(proxies)}")

    # Save all proxies
    with open('all_proxies.csv', 'w', newline='') as csvfile:
        fieldnames = ['ip', 'port', 'country', 'https', 'source', 'status', 'speed', 'location', 'last_checked']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(tested_proxies)

    # Save only working proxies, sorted fastest first
    working_proxies = [p for p in tested_proxies if p['status'] == 'Working']
    working_proxies.sort(key=lambda x: float(x.get('speed', float('inf'))))

    if working_proxies:
        with open('working_proxies.csv', 'w', newline='') as csvfile:
            fieldnames = ['ip', 'port', 'country', 'https', 'source', 'status', 'speed', 'location', 'last_checked']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(working_proxies)
        print(f"\nSaved {len(working_proxies)} working proxies to working_proxies.csv")
        logging.info(f"Found {len(working_proxies)} working proxies out of {len(tested_proxies)}")

        print("\nTop 5 fastest proxies:")
        for i, proxy in enumerate(working_proxies[:5], 1):
            print(f"{i}. {proxy['ip']}:{proxy['port']} - {proxy['country']} - {proxy['speed']}s - {proxy['location']}")
    else:
        print("No working proxies found!")
def sync_to_google_sheets():
    """Sync working proxies to Google Sheets with debug logging"""
    clear_screen()
    print("\n=== Sync to Google Sheets ===")
    logging.info("Starting Google Sheets sync process")

    try:
        print("Checking for required packages...")
        logging.debug("Importing Google Sheets libraries")
        import gspread
        from oauth2client.service_account import ServiceAccountCredentials
        print("✓ Required packages found")
        logging.debug("Successfully imported gspread and oauth2client")

        # Check for credentials file
        creds_file = 'service_account.json'
        if not Path(creds_file).exists():
            print(f"✗ Credentials file '{creds_file}' not found!")
            print("\nTo set up Google Sheets sync:")
            print("1. Go to https://console.developers.google.com/")
            print("2. Create a new project or select existing")
            print("3. Enable Google Sheets API")
            print("4. Create service account credentials")
            print("5. Download JSON key file as 'service_account.json'")
            print("6. Share your Google Sheet with the service account email")
            logging.warning("Google Sheets credentials file not found")
            return

        print("✓ Credentials file found")
        logging.debug("Service account credentials file located")

        # Check for working proxies file
        if not Path('working_proxies.csv').exists():
            print("✗ No working proxies file found! Please test proxies first.")
            logging.warning("No working_proxies.csv file found")
            return

        print("✓ Working proxies file found")
        logging.debug("Working proxies CSV file located")

        # Load working proxies
        working_proxies = []
        with open('working_proxies.csv', 'r', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            working_proxies = list(reader)

        print(f"✓ Loaded {len(working_proxies)} working proxies")
        logging.debug(f"Loaded {len(working_proxies)} working proxies from CSV")

        # Set up Google Sheets connection
        print("Connecting to Google Sheets...")
        logging.debug("Setting up Google Sheets API connection")

        scope = ['https://spreadsheets.google.com/feeds',
                 'https://www.googleapis.com/auth/drive']
        creds = ServiceAccountCredentials.from_json_keyfile_name(creds_file, scope)
        client = gspread.authorize(creds)

        print("✓ Connected to Google Sheets API")
        logging.debug("Successfully authorized with Google Sheets API")

        # Get or create spreadsheet
        sheet_name = 'Proxy Manager'
        try:
            print(f"Opening spreadsheet '{sheet_name}'...")
            logging.debug(f"Attempting to open existing spreadsheet: {sheet_name}")
            spreadsheet = client.open(sheet_name)
            print("✓ Opened existing spreadsheet")
            logging.debug("Successfully opened existing spreadsheet")
        except gspread.SpreadsheetNotFound:
            print(f"Creating new spreadsheet '{sheet_name}'...")
            logging.debug(f"Creating new spreadsheet: {sheet_name}")
            spreadsheet = client.create(sheet_name)
            print("✓ Created new spreadsheet")
            logging.debug("Successfully created new spreadsheet")

        # Get or create worksheet
        worksheet_name = 'Working Proxies'
        try:
            worksheet = spreadsheet.worksheet(worksheet_name)
            print(f"✓ Using existing worksheet '{worksheet_name}'")
            logging.debug(f"Using existing worksheet: {worksheet_name}")
        except gspread.WorksheetNotFound:
            print(f"Creating new worksheet '{worksheet_name}'...")
            logging.debug(f"Creating new worksheet: {worksheet_name}")
            worksheet = spreadsheet.add_worksheet(title=worksheet_name, rows=1000, cols=10)
            print("✓ Created new worksheet")
            logging.debug("Successfully created new worksheet")

        # Prepare data for upload
        print("Preparing data for upload...")
        logging.debug("Formatting proxy data for Google Sheets")

        headers = ['IP', 'Port', 'Country', 'HTTPS', 'Source', 'Status', 'Speed (s)', 'Location', 'Last Checked']
        data = [headers]

        for proxy in working_proxies:
            row = [
                proxy['ip'],
                proxy['port'],
                proxy['country'],
                'Yes' if proxy['https'] == 'True' or proxy['https'] == True else 'No',
                proxy['source'],
                proxy['status'],
                proxy['speed'],
                proxy['location'],
                proxy['last_checked']
            ]
            data.append(row)

        print(f"✓ Prepared {len(data)-1} rows of data")
        logging.debug(f"Prepared {len(data)-1} rows of proxy data")

        # Clear existing data and upload new data
        print("Uploading data to Google Sheets...")
        logging.debug("Clearing existing worksheet data")
        worksheet.clear()

        logging.debug("Uploading new data to worksheet")
        worksheet.update('A1', data)

        print("✓ Data uploaded successfully!")
        logging.info(f"Successfully synced {len(working_proxies)} proxies to Google Sheets")

        # Format the sheet
        print("Formatting spreadsheet...")
        logging.debug("Applying formatting to spreadsheet")

        # Make header row bold
        worksheet.format('A1:I1', {'textFormat': {'bold': True}})

        # Auto-resize columns
        worksheet.columns_auto_resize(0, len(headers)-1)

        print("✓ Formatting applied")
        logging.debug("Successfully applied formatting")

        # Get spreadsheet URL
        spreadsheet_url = f"https://docs.google.com/spreadsheets/d/{spreadsheet.id}"
        print("\n✓ Sync complete! Spreadsheet URL:")
        print(f" {spreadsheet_url}")
        logging.info(f"Google Sheets sync completed successfully. URL: {spreadsheet_url}")

    except ImportError:
        print("✗ Required packages not installed!")
        print("Run: pip install gspread oauth2client")
        logging.error("Required Google Sheets packages not installed")
    except Exception as e:
        print(f"✗ Error during sync: {str(e)}")
        logging.error(f"Google Sheets sync failed: {str(e)}")
def show_menu():
    """Display the main menu"""
    clear_screen()
    working_count, source_count = get_proxy_stats()

    print("\n" + "="*50)
    print(" PROXY MANAGER")
    print("="*50)
    print(f"You have {working_count} Working Proxies from {source_count} sources.")
    print()
    print("1. Scrape Proxies")
    print("2. Manage Proxy Websites")
    print("3. Test Proxies")
    print("4. Sync To Google Sheets")
    print("E. Exit")
    print("="*50)
def main():
    """Main program loop"""
    setup_logging()
    logging.info("Proxy Manager started")

    while True:
        show_menu()
        choice = input("\nChoose an option: ").strip().upper()

        if choice == '1':
            scrape_proxies()
        elif choice == '2':
            manage_proxy_websites()
        elif choice == '3':
            test_proxies()
        elif choice == '4':
            sync_to_google_sheets()
        elif choice == 'E':
            clear_screen()
            print("\nExiting Proxy Manager...")
            logging.info("Proxy Manager exited")
            break
        else:
            print("Invalid choice! Please try again.")

        if choice != 'E':
            input("\nPress Enter to continue...")


if __name__ == "__main__":
    main()
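A minimal sketch of consuming the tool's output from a separate script, assuming working_proxies.csv has already been generated in the same directory (this consumer is not part of the gist itself):

import csv
import requests

# test_proxies() writes working_proxies.csv sorted fastest-first
with open('working_proxies.csv', newline='') as csvfile:
    proxies = list(csv.DictReader(csvfile))

if proxies:
    fastest = proxies[0]
    proxy_url = f"http://{fastest['ip']}:{fastest['port']}"
    # Route both schemes through the proxy and check the apparent IP
    response = requests.get('https://ipinfo.io/json',
                            proxies={'http': proxy_url, 'https': proxy_url},
                            timeout=10)
    print(response.json())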