dobrosketchkun · July 12, 2023 12:08
diff --git a/pinterest_download.py b/pinterest_download.py
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium.webdriver.common.by import By
 from selenium.common.exceptions import NoSuchElementException
 import requests
 import os
 import time
 from tqdm import tqdm

 # Configure WebDriver: run Chrome without a visible window and keep its
 # console logging quiet.
 options = webdriver.ChromeOptions()
 options.add_argument('--headless')
 # NOTE(review): --no-sandbox / --disable-dev-shm-usage are presumably here so
 # Chrome starts inside containers or other restricted environments - confirm.
 options.add_argument('--no-sandbox')
 options.add_argument('--disable-dev-shm-usage')
 options.add_argument('--log-level=3')
 options.add_experimental_option('excludeSwitches', ['enable-logging'])


 # Setup WebDriver
 # Selenium 4 expects the chromedriver path wrapped in a Service object;
 # passing it as the first positional argument is no longer supported.
 wd = webdriver.Chrome(
     service=Service(ChromeDriverManager().install()),
     options=options,
 )


 # Pinterest board to scrape
 url = 'https://www.pinterest.com/meanwhileinm/brutalism/'

 SCROLL_PAUSE_TIME = 2   # seconds to wait after each scroll so new pins can load
 PIN_PAUSE_TIME = 2      # seconds to wait after opening a pin page
 SCROLL_HEIGHT = 800     # pixels scrolled per step
 IMAGE_DIR = 'meanwhileinm-brutalism'
 REQUEST_TIMEOUT = 30    # seconds before an image download is abandoned

 # Create a directory for the images (no-op if it already exists)
 os.makedirs(IMAGE_DIR, exist_ok=True)

 # Maps pin URL -> aria-label. Keying on the URL alone means a pin that shows
 # up under several anchors (with different or missing labels) is only
 # visited once.
 pin_data = {}

 # (image source, aria-label) pairs of every pin whose image element was
 # found; kept for inspection/debugging after the run.
 imgs_list = []

 counter = 0

 # The try/finally guarantees the browser is shut down even if scraping
 # fails part-way or the user interrupts the run.
 try:
     wd.get(url)

     # Keep scrolling until a scroll step reveals no pin we have not seen yet.
     new_pins_found = True
     while new_pins_found:
         # Scroll down by the specified scroll height
         wd.execute_script(f"window.scrollBy(0, {SCROLL_HEIGHT});")

         # Wait to load page
         time.sleep(SCROLL_PAUSE_TIME)
         counter += 1
         print(f"========= SCROLLED {counter} TIMES, PINS {len(pin_data)}=========")

         num_pins_before = len(pin_data)

         # Record every pin link currently present in the DOM
         for pin in wd.find_elements(By.XPATH, '//a[contains(@href, "/pin/")]'):
             href = pin.get_attribute('href')
             if not href:
                 continue
             aria_label = pin.get_attribute('aria-label') or ''
             # Keep the first non-empty label seen for this pin
             if not pin_data.get(href):
                 pin_data[href] = aria_label

         # If we found new pins on this scroll, keep going
         new_pins_found = len(pin_data) > num_pins_before

     # For each pin URL...
     for pin_url, aria_label in tqdm(pin_data.items()):
         # Go to the pin page
         wd.get(pin_url)

         # Let the page load completely
         time.sleep(PIN_PAUSE_TIME)

         # Pin URLs end in ".../pin/<id>/", so the id is the second-to-last
         # "/"-separated component.
         pin_id = pin_url.split("/")[-2]

         # Best effort: a failure on one pin is reported and the run continues
         try:
             # NOTE(review): this takes the first <img> on the page, which is
             # assumed to be the pin image - confirm against the page markup.
             img = wd.find_element(By.TAG_NAME, 'img')
             img_src = img.get_attribute('src')
             imgs_list.append((img_src, aria_label))

             # Download the image; the context manager releases the streamed
             # connection, and the timeout keeps a stalled server from
             # hanging the whole run.
             with requests.get(img_src, stream=True, timeout=REQUEST_TIMEOUT) as response:
                 response.raise_for_status()
                 with open(os.path.join(IMAGE_DIR, f'{pin_id}.jpg'), 'wb') as fd:
                     for chunk in response.iter_content(chunk_size=8192):
                         fd.write(chunk)

             # Save the aria label to a txt file (UTF-8 so non-ASCII labels
             # are written regardless of the platform default encoding)
             with open(os.path.join(IMAGE_DIR, f'{pin_id}.txt'), 'w', encoding='utf-8') as f:
                 f.write(aria_label)
         except NoSuchElementException:
             print("No image element found on this page. Moving to next page.")
         except Exception as e:
             print(f"An error occurred: {e}")
 finally:
     wd.quit()
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from webdriver_manager.chrome import ChromeDriverManager
	from selenium.webdriver.common.by import By
	from selenium.common.exceptions import NoSuchElementException
	import requests
	import os
	import time
	from tqdm import tqdm

	# Configure WebDriver: run Chrome without a visible window and keep its
	# console logging quiet.
	options = webdriver.ChromeOptions()
	options.add_argument('--headless')
	# NOTE(review): --no-sandbox / --disable-dev-shm-usage are presumably here so
	# Chrome starts inside containers or other restricted environments - confirm.
	options.add_argument('--no-sandbox')
	options.add_argument('--disable-dev-shm-usage')
	options.add_argument('--log-level=3')
	options.add_experimental_option('excludeSwitches', ['enable-logging'])


	# Setup WebDriver
	# Selenium 4 expects the chromedriver path wrapped in a Service object;
	# passing it as the first positional argument is no longer supported.
	wd = webdriver.Chrome(
	    service=Service(ChromeDriverManager().install()),
	    options=options,
	)


	# Pinterest board to scrape
	url = 'https://www.pinterest.com/meanwhileinm/brutalism/'

	SCROLL_PAUSE_TIME = 2   # seconds to wait after each scroll so new pins can load
	PIN_PAUSE_TIME = 2      # seconds to wait after opening a pin page
	SCROLL_HEIGHT = 800     # pixels scrolled per step
	IMAGE_DIR = 'meanwhileinm-brutalism'
	REQUEST_TIMEOUT = 30    # seconds before an image download is abandoned

	# Create a directory for the images (no-op if it already exists)
	os.makedirs(IMAGE_DIR, exist_ok=True)

	# Maps pin URL -> aria-label. Keying on the URL alone means a pin that shows
	# up under several anchors (with different or missing labels) is only
	# visited once.
	pin_data = {}

	# (image source, aria-label) pairs of every pin whose image element was
	# found; kept for inspection/debugging after the run.
	imgs_list = []

	counter = 0

	# The try/finally guarantees the browser is shut down even if scraping
	# fails part-way or the user interrupts the run.
	try:
	    wd.get(url)

	    # Keep scrolling until a scroll step reveals no pin we have not seen yet.
	    new_pins_found = True
	    while new_pins_found:
	        # Scroll down by the specified scroll height
	        wd.execute_script(f"window.scrollBy(0, {SCROLL_HEIGHT});")

	        # Wait to load page
	        time.sleep(SCROLL_PAUSE_TIME)
	        counter += 1
	        print(f"========= SCROLLED {counter} TIMES, PINS {len(pin_data)}=========")

	        num_pins_before = len(pin_data)

	        # Record every pin link currently present in the DOM
	        for pin in wd.find_elements(By.XPATH, '//a[contains(@href, "/pin/")]'):
	            href = pin.get_attribute('href')
	            if not href:
	                continue
	            aria_label = pin.get_attribute('aria-label') or ''
	            # Keep the first non-empty label seen for this pin
	            if not pin_data.get(href):
	                pin_data[href] = aria_label

	        # If we found new pins on this scroll, keep going
	        new_pins_found = len(pin_data) > num_pins_before

	    # For each pin URL...
	    for pin_url, aria_label in tqdm(pin_data.items()):
	        # Go to the pin page
	        wd.get(pin_url)

	        # Let the page load completely
	        time.sleep(PIN_PAUSE_TIME)

	        # Pin URLs end in ".../pin/<id>/", so the id is the second-to-last
	        # "/"-separated component.
	        pin_id = pin_url.split("/")[-2]

	        # Best effort: a failure on one pin is reported and the run continues
	        try:
	            # NOTE(review): this takes the first <img> on the page, which is
	            # assumed to be the pin image - confirm against the page markup.
	            img = wd.find_element(By.TAG_NAME, 'img')
	            img_src = img.get_attribute('src')
	            imgs_list.append((img_src, aria_label))

	            # Download the image; the context manager releases the streamed
	            # connection, and the timeout keeps a stalled server from
	            # hanging the whole run.
	            with requests.get(img_src, stream=True, timeout=REQUEST_TIMEOUT) as response:
	                response.raise_for_status()
	                with open(os.path.join(IMAGE_DIR, f'{pin_id}.jpg'), 'wb') as fd:
	                    for chunk in response.iter_content(chunk_size=8192):
	                        fd.write(chunk)

	            # Save the aria label to a txt file (UTF-8 so non-ASCII labels
	            # are written regardless of the platform default encoding)
	            with open(os.path.join(IMAGE_DIR, f'{pin_id}.txt'), 'w', encoding='utf-8') as f:
	                f.write(aria_label)
	        except NoSuchElementException:
	            print("No image element found on this page. Moving to next page.")
	        except Exception as e:
	            print(f"An error occurred: {e}")
	finally:
	    wd.quit()
No results found