Skip to content

Instantly share code, notes, and snippets.

@dobrosketchkun
Created July 12, 2023 12:08
Show Gist options
  • Select an option

  • Save dobrosketchkun/733672321bf468a9d2c4fa6889b46a5c to your computer and use it in GitHub Desktop.

Select an option

Save dobrosketchkun/733672321bf468a9d2c4fa6889b46a5c to your computer and use it in GitHub Desktop.
Fetch images from Pinterest boards
import os
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
# Configure WebDriver: Chrome command-line switches for an unattended run.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--log-level=3')
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Setup WebDriver.
# The chromedriver path must be wrapped in a Service object: passing it as the
# first positional argument of webdriver.Chrome is deprecated in Selenium 4 and
# no longer accepted by recent releases.
wd = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
# Pinterest board to scrape.
url = 'https://www.pinterest.com/meanwhileinm/brutalism/'

# Seconds to sleep after each scroll so newly requested pins can render.
SCROLL_PAUSE_TIME = 2
# Seconds to sleep after opening an individual pin page.
PIN_PAUSE_TIME = 2
# Vertical distance passed to window.scrollBy on each step.
SCROLL_HEIGHT = 800
# Directory that receives the downloaded images and label files.
IMAGE_DIR = 'meanwhileinm-brutalism'

# Open the board page.
wd.get(url)

# Create a directory for the images. exist_ok=True already tolerates an
# existing directory, so no separate existence check is needed; if the path
# exists but is a regular file this fails right away instead of on first write.
os.makedirs(IMAGE_DIR, exist_ok=True)

# Set of (pin URL, aria-label) tuples collected while scrolling.
pin_data = set()
new_pins_found = True
# Number of scroll steps performed so far (used for progress output only).
counter = 0
# Scroll the board step by step and harvest every pin link that appears.
#
# A single scroll that reveals nothing new does not prove the end of the board
# (content may still be loading, or the step may not have reached the next row),
# so we only stop after several consecutive scrolls without new pins.
MAX_IDLE_SCROLLS = 3
idle_scrolls = 0
while idle_scrolls < MAX_IDLE_SCROLLS:
    # Scroll down by the specified scroll height
    wd.execute_script(f"window.scrollBy(0, {SCROLL_HEIGHT});")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    counter += 1
    print(f"========= SCROLLED {counter} TIMES, PINS {len(pin_data)}=========")

    # Remember the size of the set so we can tell whether this pass added anything
    num_pins_before = len(pin_data)

    # Add each pin's URL and aria-label to the set (duplicates collapse automatically)
    for pin in wd.find_elements(By.XPATH, '//a[contains(@href, "/pin/")]'):
        try:
            href = pin.get_attribute('href')
            aria_label = pin.get_attribute('aria-label')
        except StaleElementReferenceException:
            # The page re-rendered and dropped this node between lookup and
            # read; skip it, it will be picked up again if it is still shown.
            continue
        if href:
            pin_data.add((href, aria_label))

    # Reset the patience counter whenever this scroll produced new pins
    if len(pin_data) > num_pins_before:
        idle_scrolls = 0
    else:
        idle_scrolls += 1

# Keep the flag consistent with its meaning once scrolling has finished.
new_pins_found = False
# Visit every collected pin, download its image and store its aria-label next
# to it, then shut the browser down.

# Seconds to wait on the image server before giving up on a download.
REQUEST_TIMEOUT = 30

# (image src, aria-label) for every pin whose image element was found; kept so
# the results can be inspected after the run.
imgs_list = []

# The same pin URL can be collected more than once with different aria-labels.
# Keep one entry per URL, preferring a non-empty label, so each pin is fetched
# only once and its label file is not overwritten by an empty one.
labels_by_url = {}
for pin_url, aria_label in pin_data:
    if not pin_url:
        continue
    if aria_label or pin_url not in labels_by_url:
        labels_by_url[pin_url] = aria_label or ''

try:
    # For each pin URL...
    for pin_url, aria_label in tqdm(labels_by_url.items()):
        # The pin id is the last path segment, with or without a trailing slash
        pin_id = pin_url.rstrip('/').split('/')[-1]
        try:
            # Go to the pin page and let it load completely
            wd.get(pin_url)
            time.sleep(PIN_PAUSE_TIME)

            # Take the first <img> element on the page as the pin image
            img = wd.find_element(By.TAG_NAME, 'img')
            img_src = img.get_attribute('src')
            if not img_src:
                print("Image element has no src attribute. Moving to next page.")
                continue
            imgs_list.append((img_src, aria_label))

            # Download the image; the context manager releases the streamed
            # connection even if writing the file fails.
            with requests.get(img_src, stream=True, timeout=REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                img_filename = f'{IMAGE_DIR}/{pin_id}.jpg'
                with open(img_filename, 'wb') as fd:
                    for chunk in response.iter_content(chunk_size=8192):
                        fd.write(chunk)

            # Save the aria label to a txt file (UTF-8 so non-ASCII labels
            # do not depend on the platform's default encoding)
            with open(f'{IMAGE_DIR}/{pin_id}.txt', 'w', encoding='utf-8') as f:
                f.write(aria_label)
        except NoSuchElementException:
            print("No image element found on this page. Moving to next page.")
        except Exception as e:
            # Best-effort scrape: report the failure and carry on with the next pin
            print(f"An error occurred: {e}")
finally:
    # Always close the browser, even if the loop is interrupted
    wd.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment