Skip to content

Instantly share code, notes, and snippets.

@ugovaretto
Created December 2, 2025 11:01
Show Gist options
  • Select an option

  • Save ugovaretto/ddb050805eea8ab6b0bf44a4e94fb423 to your computer and use it in GitHub Desktop.

Select an option

Save ugovaretto/ddb050805eea8ab6b0bf44a4e94fb423 to your computer and use it in GitHub Desktop.
Headless Selenium for web scraping
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from markdownify import markdownify as md
# Headless demo: open python.org, search for "pycon", save a screenshot of the
# result page and a Markdown rendering of its HTML.

# 1. Set up Chrome options for headless mode
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # Use the new headless mode for full functionality
chrome_options.add_argument("--disable-gpu")  # Good practice for some systems
chrome_options.add_argument("--window-size=1920,1080")  # Set a default window size for consistent rendering

# 2. Initialize the WebDriver with the specified options
# Selenium Manager automatically handles the driver installation/location in modern versions
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("http://www.python.org")
    assert "Python" in driver.title

    # Submit a query through the site's search box (the input named "q").
    elem = driver.find_element(By.NAME, "q")
    elem.clear()
    elem.send_keys("pycon")
    elem.send_keys(Keys.RETURN)
    assert "No results found." not in driver.page_source
    driver.save_screenshot("python.png")

    # Get the full page source HTML and convert it to Markdown.
    # The `strip` list names the tags markdownify should not convert.
    html_content = driver.page_source
    markdown_text = md(html_content, heading_style="ATX", strip=['script', 'style', 'noscript'])

    # Save the Markdown text to a file
    filename = "python.md"
    with open(filename, "w", encoding="utf-8") as file:
        file.write(markdown_text)
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window. Running it in `finally`
    # guarantees cleanup even when an assert or the file write fails.
    driver.quit()
from selenium import webdriver
from selenium.webdriver.common.by import By
from markdownify import markdownify as md
import time
def save_page_as_markdown(url, filename="webpage.md", wait_seconds=3):
    """Load *url* in Chrome and write the page's HTML, converted to Markdown, to *filename*.

    Args:
        url: Address passed to ``driver.get``; should be a fully qualified
            URL including the scheme (e.g. ``"https://example.org"``).
        filename: Path of the UTF-8 text file to write.
        wait_seconds: Fixed delay, in seconds, between navigation and reading
            ``page_source`` so dynamic content has a moment to load.
            Defaults to 3, the previously hard-coded value.

    Returns:
        None. Errors raised while navigating, converting or writing are
        caught and reported on stdout rather than propagated. A failure to
        start the browser itself is not caught, because the driver is
        created before the ``try`` block.
    """
    # Setup Selenium WebDriver (using Chrome in this example)
    driver = webdriver.Chrome()
    try:
        # Navigate to the target page
        driver.get(url)

        # Give the page a moment to load dynamic content
        time.sleep(wait_seconds)

        # Get the full page source HTML
        html_content = driver.page_source

        # Convert HTML to Markdown; the `strip` list names the tags
        # markdownify should not convert.
        markdown_text = md(html_content, heading_style="ATX", strip=['script', 'style', 'noscript'])

        # Save the Markdown text to a file
        with open(filename, "w", encoding="utf-8") as file:
            file.write(markdown_text)

        print(f"Successfully saved page content from {url} to {filename}")
    except Exception as e:
        # Best-effort helper: report the problem instead of crashing the caller.
        print(f"An error occurred: {e}")
    finally:
        # Always shut the browser session down, success or failure.
        driver.quit()
# --- Example Usage ---
# Replace with the URL you want to save.
# The URL must be fully qualified (scheme included): driver.get() rejects a
# bare host name such as "en.wikipedia.org" as an invalid argument.
target_url = "https://en.wikipedia.org"
save_page_as_markdown(target_url, "wiki_page.md")
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
# Screenshot demo: open python.org in Chrome and save the visible viewport as a PNG.

# Setup the WebDriver (webdriver-manager simplifies driver installation)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
try:
    # Navigate to a webpage
    driver.get("https://www.python.org")

    # Wait for the page to load (optional, but a good practice)
    time.sleep(2)

    # Save a screenshot of the visible viewport.
    # save_screenshot() reports I/O failures by returning False instead of
    # raising, so only announce success when it actually returned True.
    # Alternatively: driver.get_screenshot_as_file("python_org_screenshot.png")
    if driver.save_screenshot("python_org_screenshot.png"):
        print("Screenshot saved as python_org_screenshot.png")
    else:
        print("An error occurred: could not write python_org_screenshot.png")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # Close the browser and end the WebDriver session
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment