How to Scrape Products from a Page with Infinite Scroll via a "Load More" Button
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import csv

# Set up the WebDriver (make sure you have the appropriate driver installed, e.g., ChromeDriver)
driver = webdriver.Chrome()

# Open the page
driver.get("https://www.scrapingcourse.com/button-click")

# Loop to click the "Load More" button until there are no more products to load
while True:
    try:
        # Find the "Load more" button by its ID and click it
        load_more_button = driver.find_element(By.ID, "load-more-btn")
        load_more_button.click()
        # Wait for the content to load (adjust time as necessary)
        time.sleep(2)
    except Exception:
        # If no "Load More" button is found (end of products), break out of the loop
        print("No more products to load.")
        break

# Get the updated page content after all products are loaded
html_content = driver.page_source

# Close the browser window
driver.quit()

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract product details
products = []

# Find all product items in the grid
product_items = soup.find_all('div', class_='product-item')

for product in product_items:
    # Extract the product name
    name = product.find('span', class_='product-name').get_text(strip=True)
    # Extract the product price
    price = product.find('span', class_='product-price').get_text(strip=True)
    # Extract the product link
    link = product.find('a')['href']
    # Extract the image URL
    image_url = product.find('img')['src']
    # Create a dictionary with the product details
    products.append({
        'name': name,
        'price': price,
        'link': link,
        'image_url': image_url
    })

# Print the extracted product details
for product in products[:2]:  # Modify the slice as needed to check more products
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print('-' * 30)

# Write the product information to a CSV file
with open("products.csv", mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["name", "image_url", "price", "link"])
    writer.writeheader()
    for product in products:
        writer.writerow(product)
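The fixed time.sleep(2) above can be flaky on slow connections. A more robust variant (a sketch, assuming the same load-more-btn ID) waits explicitly for the button to become clickable and treats a timeout as the end of the product list:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.scrapingcourse.com/button-click")

wait = WebDriverWait(driver, 10)  # wait up to 10 seconds per click
while True:
    try:
        # Wait until the button is actually clickable instead of sleeping blindly
        button = wait.until(EC.element_to_be_clickable((By.ID, "load-more-btn")))
        button.click()
    except TimeoutException:
        # No clickable button within the timeout: assume all products are loaded
        break

html_content = driver.page_source
driver.quit()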
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import csv

# Set up the WebDriver (make sure you have the appropriate driver installed, e.g., ChromeDriver)
driver = webdriver.Chrome()

# Open the page
driver.get("https://www.scrapingcourse.com/button-click")

# Loop to click the "Load More" button until there are no more products to load
while True:
    try:
        # Find the "Load more" button by its ID and click it
        load_more_button = driver.find_element(By.ID, "load-more-btn")
        load_more_button.click()
        # Wait for the content to load (adjust time as necessary)
        time.sleep(2)
    except Exception:
        # If no "Load More" button is found (end of products), break out of the loop
        print("No more products to load.")
        break

# Get the updated page content after all products are loaded
html_content = driver.page_source

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract product details
products = []

# Find all product items in the grid
product_items = soup.find_all('div', class_='product-item')

for product in product_items:
    # Extract the product name
    name = product.find('span', class_='product-name').get_text(strip=True)
    # Extract the product price
    price = product.find('span', class_='product-price').get_text(strip=True)
    # Convert price to a float for sorting (remove '$' or other symbols as needed)
    price_float = float(price.replace('$', '').replace(',', '').strip())
    # Extract the product link
    link = product.find('a')['href']
    # Extract the image URL
    image_url = product.find('img')['src']
    # Create a dictionary with the product details
    products.append({
        'name': name,
        'price': price,
        'price_float': price_float,  # store as float for sorting
        'link': link,
        'image_url': image_url
    })

# Sort products by price (descending order)
sorted_products = sorted(products, key=lambda x: x['price_float'], reverse=True)

# Get the top 5 highest-priced products
top_5_products = sorted_products[:5]

# Visit each of the top 5 product pages and extract extra data
for product in top_5_products:
    # Open the product page
    driver.get(product['link'])
    time.sleep(3)  # Wait for the page to load
    # Parse the page content of the product page
    product_page_soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Extract product description and SKU
    description = product_page_soup.find('div', class_='product-description')
    description_text = description.get_text(strip=True) if description else 'No description available'
    sku = product_page_soup.find('span', class_='sku')
    sku_code = sku.get_text(strip=True) if sku else 'No SKU available'
    # Add the extra data to the product details
    product['description'] = description_text
    product['sku'] = sku_code

# Close the browser window after scraping product pages
driver.quit()

# Print the extracted product details for the top 5 products
for product in top_5_products:
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print(f"Description: {product['description']}")
    print(f"SKU: {product['sku']}")
    print('-' * 30)

# Write the product information to a CSV file, including the extra data
with open("products.csv", mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["name", "image_url", "price", "link", "description", "sku"])
    writer.writeheader()
    for product in products:
        writer.writerow({
            'name': product['name'],
            'image_url': product['image_url'],
            'price': product['price'],
            'link': product['link'],
            'description': product.get('description', ''),
            'sku': product.get('sku', '')
        })
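If you are running this on a server or simply don't want a browser window popping up, Chrome can be started headless. A minimal sketch (the rest of the script is unchanged; the --headless=new flag assumes a reasonably recent Chrome build):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)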
import csv

# Write the product information to a CSV file
with open("products.csv", mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["name", "image_url", "price", "link"])
    writer.writeheader()
    for product in products:
        writer.writerow(product)
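If pandas happens to be installed, the same export is a one-liner; this is just an alternative to the csv module above, not a requirement:

import pandas as pd

# Equivalent CSV export; the column list matches the fieldnames above
pd.DataFrame(products)[["name", "image_url", "price", "link"]].to_csv("products.csv", index=False)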
import requests

# URL of the demo page with products
url = "https://www.scrapingcourse.com/button-click"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    html_content = response.text
    print(html_content)  # Optional: Preview the HTML
else:
    print(f"Failed to retrieve content: {response.status_code}")
from selenium import webdriver
from bs4 import BeautifulSoup
import time

# Sort products by price in descending order (strip '$' and any thousands separators)
sorted_products = sorted(products, key=lambda x: float(x['price'].replace('$', '').replace(',', '')), reverse=True)

# Scrape extra details for the top 5 products
driver = webdriver.Chrome()
for product in sorted_products[:5]:
    driver.get(product['link'])
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    description = soup.find('div', class_='product-description')
    product['description'] = description.get_text(strip=True) if description else "No description"
    sku = soup.find('span', class_='sku')
    product['sku'] = sku.get_text(strip=True) if sku else "No SKU"
driver.quit()
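One caveat: if the hrefs in the product grid turn out to be relative (e.g. /products/...), driver.get() will fail on them. A defensive tweak, applied to the products list before the loop above, resolves each link against the page URL; urljoin leaves absolute URLs untouched:

from urllib.parse import urljoin

base_url = "https://www.scrapingcourse.com/button-click"
for product in products:
    # Resolve relative links; absolute URLs pass through unchanged
    product['link'] = urljoin(base_url, product['link'])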
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Set up the WebDriver (make sure you have the appropriate driver installed, e.g., ChromeDriver)
driver = webdriver.Chrome()

# Open the page
driver.get("https://www.scrapingcourse.com/button-click")

# Loop to click the "Load More" button until there are no more products
while True:
    try:
        # Find the "Load more" button by its ID and click it
        load_more_button = driver.find_element(By.ID, "load-more-btn")
        load_more_button.click()
        # Wait for the content to load (adjust time as necessary)
        time.sleep(2)
    except Exception:
        # If no "Load More" button is found (end of products), break out of the loop
        print("No more products to load.")
        break

# Get the updated page content after all products are loaded
html_content = driver.page_source

# Close the browser window
driver.quit()

from bs4 import BeautifulSoup

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract product details
products = []

# Find all product items in the grid
product_items = soup.find_all('div', class_='product-item')

for product in product_items:
    # Extract the product name
    name = product.find('span', class_='product-name').get_text(strip=True)
    # Extract the product price
    price = product.find('span', class_='product-price').get_text(strip=True)
    # Extract the product link
    link = product.find('a')['href']
    # Extract the image URL
    image_url = product.find('img')['src']
    # Create a dictionary with the product details
    products.append({
        'name': name,
        'price': price,
        'link': link,
        'image_url': image_url
    })

# Print the extracted product details
for product in products[:2]:
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print('-' * 30)
from selenium import webdriver
driver = webdriver.Chrome() # Ensure ChromeDriver is installed and in PATH
driver.get("https://www.scrapingcourse.com/button-click")
print(driver.title)
driver.quit()
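If ChromeDriver is not on your PATH, Selenium 4.6+ will normally resolve a matching driver on its own via Selenium Manager. Alternatively, the third-party webdriver-manager package can download and cache one for you; a sketch, assuming pip install webdriver-manager:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Downloads (and caches) a ChromeDriver build matching the installed Chrome
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.scrapingcourse.com/button-click")
print(driver.title)
driver.quit()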