import requests
import re
from urllib.parse import urljoin


def is_user_profile(url):
    """Check if the URL is a user profile"""
    try:
        response = requests.get(url)
        response.raise_for_status()

        # Check for user profile indicators in the HTML
        profile_indicators = [
            r'class="top-user-credentials"',
            r'class="top-user-avatar"',
            r'data-route="[^"]+"',  # data-route attribute often contains username
            r'Files by [^<]+</a>',  # "Files by username" text
            r'<div class="user-meta">',
            r'data-text="following-count"',
            r'data-text="followers-count"'
        ]

        # If any of these patterns are found, it's likely a user profile
        html_content = response.text
        for pattern in profile_indicators:
            if re.search(pattern, html_content):
                return True
        return False
    except requests.RequestException:
        return False
    except Exception:
        return False


def extract_username(url):
    """Extract username from user profile"""
    try:
        response = requests.get(url)
        response.raise_for_status()

        # Method 1: Look for username in JavaScript data
        username_pattern = r'username:\s*"([^"]+)"'
        match = re.search(username_pattern, response.text)
        if match:
            return match.group(1)

        # Method 2: Look for username in data-route attribute
        route_pattern = r'data-route="([^"]+)"'
        match = re.search(route_pattern, response.text)
        if match:
            return match.group(1)

        # Method 3: Extract from URL path
        path_username = url.rstrip('/').split('/')[-1]
        if path_username and not path_username.startswith('?'):
            return path_username

        return "Username not found"
    except requests.RequestException as e:
        return f"Error making request: {e}"
    except Exception as e:
        return f"Error: {e}"


def extract_next_page_url(html_content, base_url):
    """Extract the next page URL from pagination HTML"""
    next_page_pattern = r'<li class="pagination-next"><a data-pagination="next" href="([^"]+)"'
    match = re.search(next_page_pattern, html_content)
    if match:
        next_page_relative = match.group(1)
        return urljoin(base_url, next_page_relative)
    return None


def extract_image_urls_from_page(html_content):
    """Extract image/video URLs from a single page's HTML content"""
    pattern = r'<a href="([^"]+)" class="image-container --media">'
    return re.findall(pattern, html_content)


def extract_all_image_urls(start_url, max_pages=None):
    """Extract image/video URLs from all pages with pagination (only for user profiles)"""
    # First check if it's a user profile
    if not is_user_profile(start_url):
        print("This is not a user profile. Skipping pagination extraction.")
        return []

    username = extract_username(start_url)
    print(f"User profile detected: {username}")

    all_urls = []
    current_url = start_url
    page_count = 0

    try:
        while current_url and (max_pages is None or page_count < max_pages):
            response = requests.get(current_url)
            response.raise_for_status()

            print(f"Processing page {page_count + 1}: {current_url}")

            page_urls = extract_image_urls_from_page(response.text)
            all_urls.extend(page_urls)

            next_page_url = extract_next_page_url(response.text, current_url)
            if not next_page_url:
                print("No more pages found.")
                break

            current_url = next_page_url
            page_count += 1
    except requests.RequestException as e:
        print(f"Error making request: {e}")
    except Exception as e:
        print(f"Error: {e}")

    return all_urls


def get_full_urls(relative_urls, base_domain="https://imagepond.net"):
    """Convert relative URLs to full URLs"""
    return [urljoin(base_domain, url) for url in relative_urls]


# Example usage
url = "https://imagepond.net/ap000"

# Check if it's a user profile and extract accordingly
if is_user_profile(url):
    username = extract_username(url)
    print(f"Username: {username}")

    # Extract URLs with pagination
    relative_urls = extract_all_image_urls(url)
    full_urls = get_full_urls(relative_urls)

    print(f"\nFound {len(full_urls)} image/video URLs:")
    for i, full_url in enumerate(full_urls, 1):
        print(f"{i}. {full_url}")
else:
    print("This URL is not a user profile. Only extracting username if available.")
    username = extract_username(url)
    if username != "Username not found":
        print(f"Username: {username}")
    else:
        print("No username found.")