Created
July 12, 2023 12:08
-
-
Save dobrosketchkun/733672321bf468a9d2c4fa6889b46a5c to your computer and use it in GitHub Desktop.
Fetch images from Pinterest boards
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import time
from pathlib import Path

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
# Pinterest board to scrape.
url = 'https://www.pinterest.com/meanwhileinm/brutalism/'

SCROLL_PAUSE_TIME = 2   # seconds to wait after each scroll so new pins can load
PIN_PAUSE_TIME = 2      # seconds to wait after opening a pin page
SCROLL_HEIGHT = 800     # pixels scrolled per step
IMAGE_DIR = 'meanwhileinm-brutalism'  # output directory for images and labels
REQUEST_TIMEOUT = 30    # seconds before an image download is abandoned


def create_driver():
    """Start and return a headless Chrome WebDriver.

    The driver binary is resolved through ``webdriver_manager`` and handed to
    Selenium via a ``Service`` object; Selenium 4 no longer accepts the
    executable path as a positional argument of ``webdriver.Chrome``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Keep Chrome's own console logging out of the script's output.
    options.add_argument('--log-level=3')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])

    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)


def pin_id_from_url(pin_url):
    """Return the last path segment of *pin_url*, used as the file stem.

    Query strings, fragments and a trailing slash are ignored, so
    ``.../pin/123/``, ``.../pin/123`` and ``.../pin/123/?x=1`` all give
    ``'123'``.
    """
    path = pin_url.split('?', 1)[0].split('#', 1)[0]
    return path.rstrip('/').rsplit('/', 1)[-1]


def collect_pins(wd):
    """Scroll the currently loaded page and gather its pin links.

    The page is scrolled by ``SCROLL_HEIGHT`` pixels at a time until a scroll
    step yields no pin URL that has not been seen before.

    Returns:
        A dict mapping each pin URL to the ``aria-label`` of its anchor
        (``None`` when no anchor for that URL carried a label). Keying on the
        URL guarantees every pin is visited only once, even if several
        anchors with different labels point to it.
    """
    pins = {}
    scroll_count = 0
    while True:
        wd.execute_script(f"window.scrollBy(0, {SCROLL_HEIGHT});")
        time.sleep(SCROLL_PAUSE_TIME)
        scroll_count += 1
        print(f"========= SCROLLED {scroll_count} TIMES, PINS {len(pins)}=========")

        known_before = len(pins)
        for anchor in wd.find_elements(By.XPATH, '//a[contains(@href, "/pin/")]'):
            href = anchor.get_attribute('href')
            if not href:
                continue
            # Record the URL; upgrade a missing label if a later anchor has one.
            if not pins.get(href):
                pins[href] = anchor.get_attribute('aria-label')

        # No new pin appeared after this scroll: stop scrolling.
        if len(pins) == known_before:
            return pins


def download_pin(wd, pin_url, aria_label, image_dir):
    """Open one pin page and save its first ``<img>`` plus its label.

    Writes ``<pin id>.jpg`` and ``<pin id>.txt`` into *image_dir*.

    Args:
        wd: The WebDriver used to open the pin page.
        pin_url: URL of the pin page.
        aria_label: Label text to store next to the image; may be ``None``,
            in which case an empty text file is written.
        image_dir: ``Path`` of the existing output directory.

    Raises:
        NoSuchElementException: If the page contains no ``<img>`` element.
        ValueError: If the image element has no ``src`` attribute.
        requests.RequestException: If the image download fails.
    """
    wd.get(pin_url)
    time.sleep(PIN_PAUSE_TIME)  # give the pin page time to render

    img_src = wd.find_element(By.TAG_NAME, 'img').get_attribute('src')
    if not img_src:
        raise ValueError(f'image element on {pin_url} has no src attribute')

    pin_id = pin_id_from_url(pin_url)
    # The context manager releases the streamed connection even on errors.
    with requests.get(img_src, stream=True, timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        with open(image_dir / f'{pin_id}.jpg', 'wb') as fd:
            for chunk in response.iter_content(chunk_size=8192):
                fd.write(chunk)

    label_path = image_dir / f'{pin_id}.txt'
    label_path.write_text(aria_label or '', encoding='utf-8')


def main():
    """Scrape every pin of the board at ``url`` into ``IMAGE_DIR``."""
    image_dir = Path(IMAGE_DIR)
    image_dir.mkdir(parents=True, exist_ok=True)

    wd = create_driver()
    try:
        wd.get(url)
        pins = collect_pins(wd)

        for pin_url, aria_label in tqdm(pins.items()):
            # Best effort: a failing pin is reported and skipped.
            try:
                download_pin(wd, pin_url, aria_label, image_dir)
            except NoSuchElementException:
                print("No image element found on this page. Moving to next page.")
            except Exception as e:
                print(f"An error occurred: {e}")
    finally:
        # Always shut the browser down, even if scraping aborted.
        wd.quit()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment