Created
December 2, 2025 11:01
-
-
Save ugovaretto/ddb050805eea8ab6b0bf44a4e94fb423 to your computer and use it in GitHub Desktop.
Headless Selenium for webscraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.chrome.service import Service | |
| from webdriver_manager.chrome import ChromeDriverManager | |
# Start Chrome through a driver binary resolved by webdriver_manager,
# so no manual ChromeDriver download is needed.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

try:
    # Open the page to scan (replace with your target URL).
    url = 'https://example-site.com'
    driver.get(url)

    # Collect every <a> element currently present in the document.
    anchors = driver.find_elements(By.TAG_NAME, 'a')

    # Read each anchor's href lazily, then keep only the ones that are
    # neither missing nor whitespace-only.
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    all_urls = [href for href in hrefs if href and href.strip()]

    # Report the result: a count header followed by one URL per line.
    print(f"Found {len(all_urls)} URLs:")
    for found_url in all_urls:
        print(found_url)
except Exception as exc:
    print(f"An error occurred: {exc}")
finally:
    # Shut the browser down whether or not the scan succeeded.
    driver.quit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.chrome.options import Options | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.common.keys import Keys | |
| from markdownify import markdownify as md | |
# 1. Set up Chrome options for headless mode
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # Use the new headless mode for full functionality
chrome_options.add_argument("--disable-gpu")  # Good practice for some systems
chrome_options.add_argument("--window-size=1920,1080")  # Set a default window size for consistent rendering

# 2. Initialize the WebDriver with the specified options
# Selenium Manager automatically handles the driver installation/location in modern versions
driver = webdriver.Chrome(options=chrome_options)

# Everything that talks to the browser runs inside try/finally so the
# headless Chrome process is released even when a check or lookup fails.
try:
    # 3. Open python.org and run a search for "pycon"
    driver.get("http://www.python.org")
    assert "Python" in driver.title
    elem = driver.find_element(By.NAME, "q")
    elem.clear()
    elem.send_keys("pycon")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source

    # 4. Keep a screenshot of the results page
    driver.save_screenshot("python.png")

    # 5. Get the full page source HTML and convert it to Markdown
    # You can configure markdownify to ignore certain tags if needed (e.g., strip=['script', 'style'])
    html_content = driver.page_source
    markdown_text = md(html_content, heading_style="ATX", strip=['script', 'style', 'noscript'])

    # 6. Save the Markdown text to a file
    filename = "python.md"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(markdown_text)
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.common.by import By | |
| from markdownify import markdownify as md | |
| import time | |
def save_page_as_markdown(url, filename="webpage.md"):
    """Load *url* in Chrome and write its HTML, converted to Markdown, to *filename*.

    Any exception raised while loading, converting or writing is caught and
    reported on stdout; the function returns None in every case. The browser
    is always shut down before returning.

    Args:
        url: Address passed to the browser's ``get`` call.
        filename: Path of the UTF-8 Markdown file to write.
    """
    browser = webdriver.Chrome()
    try:
        browser.get(url)

        # Fixed pause so dynamically inserted content has a chance to appear.
        time.sleep(3)

        # Snapshot the page HTML and convert it, dropping script/style/noscript tags.
        page_html = browser.page_source
        rendered = md(
            page_html,
            heading_style="ATX",
            strip=['script', 'style', 'noscript'],
        )

        with open(filename, "w", encoding="utf-8") as out:
            out.write(rendered)

        print(f"Successfully saved page content from {url} to {filename}")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    finally:
        # Runs on success and failure alike.
        browser.quit()
# --- Example Usage ---
# Replace with the URL you want to save.
# The URL must be absolute (include the scheme): WebDriver rejects a bare
# host name such as "en.wikipedia.org" with an invalid-argument error.
target_url = "https://en.wikipedia.org"
save_page_as_markdown(target_url, "wiki_page.md")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from selenium import webdriver | |
| from selenium.webdriver.chrome.service import Service | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| import time | |
# Resolve a ChromeDriver binary via webdriver-manager and start Chrome with it.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Load the page to capture.
    driver.get("https://www.python.org")

    # Short fixed pause before taking the capture.
    time.sleep(2)

    # Write a PNG of the visible viewport.
    # (driver.get_screenshot_as_file("python_org_screenshot.png") is an alternative.)
    driver.save_screenshot("python_org_screenshot.png")
    print("Screenshot saved as python_org_screenshot.png")
except Exception as exc:
    print(f"An error occurred: {exc}")
finally:
    # Shut the browser down regardless of the outcome.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment