Last active
December 4, 2024 15:14
-
-
Save PragatiVerma18/e1f3b48204a816ef55269311c9dbbc53 to your computer and use it in GitHub Desktop.
How to Scrape Products from a Page with infinite scroll via "Load more" button
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape products from a "Load more" paginated page and save them to CSV.

Clicks the "Load more" button until it disappears, parses the fully loaded
page with BeautifulSoup, and writes name/price/link/image for every product.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from bs4 import BeautifulSoup
import time
import csv

# Set up the WebDriver (ChromeDriver must be installed and on PATH).
driver = webdriver.Chrome()
try:
    # Open the page
    driver.get("https://www.scrapingcourse.com/button-click")

    # Click "Load more" until the button is gone (all products loaded).
    while True:
        try:
            load_more_button = driver.find_element(By.ID, "load-more-btn")
            load_more_button.click()
            # Give the newly loaded products time to render (adjust as needed).
            time.sleep(2)
        except (NoSuchElementException, ElementNotInteractableException):
            # Narrow catch: only "button gone/unclickable" ends the loop;
            # any other failure (e.g. a crashed driver) still surfaces.
            print("No more products to load.")
            break

    # Snapshot the fully loaded page before closing the browser.
    html_content = driver.page_source
finally:
    # Always release the browser, even if scraping raised.
    driver.quit()

# Parse the page content with BeautifulSoup.
soup = BeautifulSoup(html_content, 'html.parser')

# Extract product details from every product card in the grid.
products = []
for product in soup.find_all('div', class_='product-item'):
    name_tag = product.find('span', class_='product-name')
    price_tag = product.find('span', class_='product-price')
    link_tag = product.find('a')
    img_tag = product.find('img')
    # Skip malformed cards instead of crashing on a missing tag (None).
    if not (name_tag and price_tag and link_tag and img_tag):
        continue
    products.append({
        'name': name_tag.get_text(strip=True),
        'price': price_tag.get_text(strip=True),
        'link': link_tag.get('href', ''),
        'image_url': img_tag.get('src', ''),
    })

# Preview the first couple of products (modify the slice to check more).
for product in products[:2]:
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print('-' * 30)

# Persist everything; explicit utf-8 avoids platform-default encoding crashes.
with open("products.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["name", "image_url", "price", "link"])
    writer.writeheader()
    writer.writerows(products)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape all products, then enrich the 5 most expensive with detail-page data.

Loads every product via the "Load more" button, sorts by numeric price,
visits the top-5 product pages for description/SKU, and writes one CSV row
per product (description/SKU blank for products outside the top 5).
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
from bs4 import BeautifulSoup
import time
import csv

# Set up the WebDriver (ChromeDriver must be installed and on PATH).
driver = webdriver.Chrome()
try:
    driver.get("https://www.scrapingcourse.com/button-click")

    # Click "Load more" until the button disappears (all products loaded).
    while True:
        try:
            driver.find_element(By.ID, "load-more-btn").click()
            time.sleep(2)  # let the new batch render
        except (NoSuchElementException, ElementNotInteractableException):
            # Only "button gone/unclickable" ends the loop; other errors surface.
            print("No more products to load.")
            break

    # Parse the fully loaded listing page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    products = []
    for product in soup.find_all('div', class_='product-item'):
        name_tag = product.find('span', class_='product-name')
        price_tag = product.find('span', class_='product-price')
        link_tag = product.find('a')
        img_tag = product.find('img')
        # Skip malformed cards instead of crashing on a missing tag (None).
        if not (name_tag and price_tag and link_tag and img_tag):
            continue
        price = price_tag.get_text(strip=True)
        try:
            # Numeric price for sorting; tolerate "$1,299.00"-style text.
            price_float = float(price.replace('$', '').replace(',', '').strip())
        except ValueError:
            price_float = 0.0  # unparsable price sorts last rather than aborting
        products.append({
            'name': name_tag.get_text(strip=True),
            'price': price,
            'price_float': price_float,  # numeric key used only for sorting
            'link': link_tag.get('href', ''),
            'image_url': img_tag.get('src', ''),
        })

    # Sort by price, most expensive first, and take the top 5.
    sorted_products = sorted(products, key=lambda p: p['price_float'], reverse=True)
    top_5_products = sorted_products[:5]

    # Visit each top-5 product page and pull description + SKU.
    for product in top_5_products:
        driver.get(product['link'])
        time.sleep(3)  # wait for the detail page to load
        product_page_soup = BeautifulSoup(driver.page_source, 'html.parser')
        description = product_page_soup.find('div', class_='product-description')
        product['description'] = (
            description.get_text(strip=True) if description else 'No description available'
        )
        sku = product_page_soup.find('span', class_='sku')
        product['sku'] = sku.get_text(strip=True) if sku else 'No SKU available'
finally:
    # Release the browser even if any page load or parse raised.
    driver.quit()

# Print the extracted details for the top 5 products.
for product in top_5_products:
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print(f"Description: {product['description']}")
    print(f"SKU: {product['sku']}")
    print('-' * 30)

# Write every product; explicit utf-8 avoids platform-default encoding crashes.
with open("products.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(
        file,
        fieldnames=["name", "image_url", "price", "link", "description", "sku"],
    )
    writer.writeheader()
    for product in products:
        writer.writerow({
            'name': product['name'],
            'image_url': product['image_url'],
            'price': product['price'],
            'link': product['link'],
            # Only the top 5 were enriched; the rest get empty fields.
            'description': product.get('description', ''),
            'sku': product.get('sku', ''),
        })
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv

# Persist the scraped products; explicit utf-8 keeps non-ASCII product names
# from crashing on platforms whose default encoding is not UTF-8.
with open("products.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["name", "image_url", "price", "link"])
    writer.writeheader()
    # writerows replaces the manual per-row loop.
    writer.writerows(products)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests

# URL of the demo page with products
url = "https://www.scrapingcourse.com/button-click"

# Send a GET request; a timeout prevents the script from hanging forever
# on a stalled or unreachable server.
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    html_content = response.text
    print(html_content)  # Optional: preview the HTML
else:
    print(f"Failed to retrieve content: {response.status_code}")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sort products by numeric price, most expensive first.
# Strip '$' AND thousands separators so "$1,299.00" parses — consistent with
# the main scraper's price handling.
sorted_products = sorted(
    products,
    key=lambda p: float(p['price'].replace('$', '').replace(',', '').strip()),
    reverse=True,
)

# Scrape description and SKU for the top 5 products.
driver = webdriver.Chrome()
try:
    for product in sorted_products[:5]:
        driver.get(product['link'])
        time.sleep(3)  # wait for the detail page to render
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        description = soup.find('div', class_='product-description')
        product['description'] = (
            description.get_text(strip=True) if description else "No description"
        )
        sku = soup.find('span', class_='sku')
        product['sku'] = sku.get_text(strip=True) if sku else "No SKU"
finally:
    # Release the browser even if a page load or parse fails mid-loop.
    driver.quit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Click "Load more" until all products are loaded, then capture the HTML."""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
import time

# Set up the WebDriver (ChromeDriver must be installed and on PATH).
driver = webdriver.Chrome()
try:
    driver.get("https://www.scrapingcourse.com/button-click")

    # Click "Load more" until the button disappears (end of products).
    while True:
        try:
            driver.find_element(By.ID, "load-more-btn").click()
            # Wait for the new content to load (adjust time as necessary).
            time.sleep(2)
        except (NoSuchElementException, ElementNotInteractableException):
            # Narrow catch: only a missing/unclickable button ends the loop;
            # any other failure (e.g. crashed driver) still surfaces.
            print("No more products to load.")
            break

    # Capture the fully loaded page before closing the browser.
    html_content = driver.page_source
finally:
    # Always close the browser, even if navigation or clicking raised.
    driver.quit()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup

# Parse the fully loaded page content (html_content comes from the
# Selenium step) and pull each product card's fields.
soup = BeautifulSoup(html_content, 'html.parser')

products = []
for item in soup.find_all('div', class_='product-item'):
    name_tag = item.find('span', class_='product-name')
    price_tag = item.find('span', class_='product-price')
    link_tag = item.find('a')
    img_tag = item.find('img')
    # Guard against malformed cards: .find() returns None when a tag is
    # missing, and calling get_text()/indexing on None would raise.
    if not (name_tag and price_tag and link_tag and img_tag):
        continue
    products.append({
        'name': name_tag.get_text(strip=True),
        'price': price_tag.get_text(strip=True),
        'link': link_tag.get('href', ''),
        'image_url': img_tag.get('src', ''),
    })

# Preview the first two products.
for product in products[:2]:
    print(f"Name: {product['name']}")
    print(f"Price: {product['price']}")
    print(f"Link: {product['link']}")
    print(f"Image URL: {product['image_url']}")
    print('-' * 30)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver

# Smoke test: open the demo page and print its title.
driver = webdriver.Chrome()  # Ensure ChromeDriver is installed and in PATH
try:
    driver.get("https://www.scrapingcourse.com/button-click")
    print(driver.title)
finally:
    # Close the browser even if the page load raises (avoids a leaked process).
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment