Last active
August 25, 2022 11:22
-
-
Save xdanielc/967b35af486fcf726f0bace805fcaf7c to your computer and use it in GitHub Desktop.
Buscador en el archivo municipal de Valladolid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import selenium | |
| from selenium import webdriver | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.chrome.options import Options | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| from colorama import Fore, Back, Style | |
| import pandas as pd | |
| import chime | |
| import math | |
# * VARIABLES AND CONFIGURATION
pag_municipal = "http://www10.ava.es/amv/"  # base URL of the municipal archive
busqueda = 'plaza mayor noche'              # search term typed into the free-text field
counter_pages = 0                           # total result count; set by find_lenght()
current_entry = 1                           # 1-based index of the entry being scraped
df2 = pd.DataFrame()                        # accumulates one row per scraped image

options = Options()
options.add_argument("disable-extensions")
# BUG FIX: `options` was built but never passed to the driver, so the
# "disable-extensions" argument silently had no effect.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 30)  # default explicit wait used across the script
def wait_loading_overlay():
    """Block until the site's loading overlay has disappeared (up to 30 s)."""
    overlay = (By.CSS_SELECTOR, '.loading-overlay-content')
    wait.until(EC.invisibility_of_element_located(overlay))
def buttons_present():
    """Assert the document pagination buttons are present within 1 second.

    Raises selenium's TimeoutException otherwise (the caller catches it to
    detect a page that failed to load).

    BUG FIX: the original rebound a *local* name `wait` to a 1-second wait
    and then "restored" it with `wait = WebDriverWait(driver, 30)` — but that
    assignment was also local, i.e. dead code.  A separately named short wait
    makes clear the module-level 30 s `wait` is never touched.
    """
    short_wait = WebDriverWait(driver, 1)
    short_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#document .pagination')))
def start_and_search():
    """Open the archive home page and launch a search for `busqueda`."""
    # * INITIALIZE
    driver.get(pag_municipal)
    dropdown = (By.XPATH, '//*[@id="dropdownMenuButton"]')
    wait.until(EC.presence_of_element_located(dropdown))
    wait_loading_overlay()
    wait.until(EC.element_to_be_clickable(dropdown)).click()
    # Pick the sixth category checkbox from the dropdown.
    category_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[1]/div/div[2]/div[2]/div[6]/label'
    wait.until(EC.element_to_be_clickable((By.XPATH, category_xpath))).click()
    # * SEARCH INPUT
    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="fieldValueFreeText"]'))).send_keys(busqueda)
    # Toggle the search option, then submit the form.
    option_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[1]/div[2]/label'
    submit_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[2]/button[1]'
    wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath))).click()
    wait.until(EC.element_to_be_clickable((By.XPATH, submit_xpath))).click()
def find_lenght():
    """Read the result-count badge and store it in the global `counter_pages`.

    NOTE: the name keeps its original (misspelled) form because it is called
    elsewhere in this script.
    """
    global counter_pages
    wait_loading_overlay()
    badge = driver.find_element(By.CSS_SELECTOR, '.badge-primary')
    counter_pages = int(badge.text)
    print(f"Se han encontrado {counter_pages} elementos")
def select_first():
    """Open the first search result by clicking the second cell of its row."""
    first_result = (By.CSS_SELECTOR, 'td:nth-of-type(2)')
    wait.until(EC.element_to_be_clickable(first_result)).click()
def get_data_single_page(current_entry):
    """Scrape one document page and append its rows to the global `df2`.

    Extracts the identifier list, title, creation date and every image URL
    from the currently open document, aligns the four columns, and concats
    them onto the accumulator DataFrame.

    :param current_entry: 1-based index of the entry (only used by the
        commented-out per-entry CSV export at the bottom).
    """
    print(Fore.YELLOW, 'pag datos inicio')
    contenedor = driver.find_element(By.CSS_SELECTOR, '#body_document')
    # Identifier field is free text; normalise ':' and ',' to ';' and split.
    txt_identificadores = contenedor.find_element(By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[3]/div/div[2]/div[2]/div/div[1]/div[2]').text.replace(' ', '').replace(':', ';').replace(',', ';').split(';')
    titulo = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='TITULO']/..//P").text
    try:
        fecha = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='FECHA CREACION INICIAL']/..//P").text
    except Exception:  # was a bare except: the date field is optional on some documents
        fecha = 'desconocida'
    imagenes = contenedor.find_elements(By.CSS_SELECTOR, 'img')
    # * CHECK IF THERE ARE MULTIPLE IMAGES
    src_imagenes = [pag_municipal + str(img.get_dom_attribute('src')) for img in imagenes]
    titulos = []
    fechas = []
    # * HANDLES MULTIPLE IMAGES MISMATCH
    if len(src_imagenes) > len(txt_identificadores):
        for _ in range(len(src_imagenes)):
            titulos.append(titulo)
            fechas.append(fecha)
        # Pad the identifier column with the extra image URLs so lengths match.
        for i in range(1, len(src_imagenes)):
            txt_identificadores.append(src_imagenes[i])
    elif len(src_imagenes) == 1:
        print('dentro de elif')
        titulos.append(titulo)
        fechas.append(fecha)
    else:
        print('dentro de else')
        for _ in range(len(src_imagenes)):
            titulos.append(titulo)
            fechas.append(fecha)
    print("{} | {} | {} | {}".format(len(src_imagenes), len(txt_identificadores), len(titulos), len(fechas)))
    # * SAVE TO CSV
    print('Appending to complete list')
    try:
        df = pd.DataFrame({'Id': txt_identificadores, 'URL': src_imagenes, 'Titulos': titulos, 'Fecha': fechas})
    except ValueError:  # was a bare except: columns of unequal length — let pd.Series pad with NaN
        df = pd.DataFrame({'Id': pd.Series(txt_identificadores), 'URL': pd.Series(src_imagenes), 'Titulos': pd.Series(titulos), 'Fecha': pd.Series(fechas)})
    global df2
    df2 = pd.concat([df2, df], ignore_index=True)
    # * UNCOMMENT TO SAVE EACH ONE
    # df.to_csv(f'lista_imagenes_{current_entry}.csv', index=False)
def back_to_list_and_continue():
    """Return to the results list, jump to the page holding `current_entry`,
    and re-open that entry.

    Used as the recovery path when a document page fails to load.
    """
    global current_entry  # read-only here; declared for symmetry with the rest of the script
    # Click the "results list" tab.
    wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/section/div/div/div/div[1]/div/div/ul/li[2]/a'))).click()
    page_togo = math.ceil(current_entry / 5)  # the list shows 5 results per page
    script = "searchPage('9FBA3EC3CB10', '{}', '', '')".format(page_togo)
    driver.execute_script(script)
    modulo = current_entry % 5
    print(str(modulo) + ' es el modulo del archivo actual')
    css_first = "tbody tr:nth-child(5) td:nth-of-type(2)"
    css_other = "tbody tr:nth-child({}) td:nth-of-type(2)".format(modulo)
    # * HANDLE SELECTING N ELEMENT
    if modulo == 0:
        css_other = css_first  # entries 5, 10, 15, ... sit in the last row of their page
    print("{} element selected".format(css_other))
    wait_loading_overlay()
    # BUG FIX: the original always clicked css_first (row 5), ignoring the
    # computed css_other — it re-opened the wrong entry whenever
    # current_entry % 5 != 0.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_other))).click()
    wait_loading_overlay()
    buttons_present()
# * MAIN SCRIPT: search, then walk every result page and scrape it.
start_and_search()
find_lenght()  # * SETS counter_pages
select_first()
while current_entry <= counter_pages:
    wait_loading_overlay()  # * coming from the list or from the previous page
    print(Fore.BLACK, Back.WHITE, "Ahora estamos en {}".format(current_entry), end='')
    print(Style.RESET_ALL, end='')
    try:
        # * PAGE IS OK
        buttons_present()
    except Exception:  # was a bare except (TimeoutException from buttons_present)
        # * PAGE DIDN'T LOAD
        print('La página no cargó, volviendo al listado')
        back_to_list_and_continue()
    get_data_single_page(current_entry)
    # * NEXT PAGE
    buttons = driver.find_elements(By.CSS_SELECTOR, '.page-link')
    for boton in buttons:
        try:
            if str(boton.get_attribute('title')) == 'Documento siguiente':
                print(Fore.CYAN, 'Buscando boton de siguiente')
                wait.until(EC.element_to_be_clickable(boton))
                wait_loading_overlay()
                boton.click()
                # BUG FIX: stop scanning once "next" is clicked — the page is
                # navigating and the remaining elements are stale, which
                # previously raised and spuriously fired chime.error().
                break
        except Exception:  # was a bare except; best-effort skip of a broken button
            chime.error()
            print('Item saltado')
    current_entry += 1
df2.to_csv("complete_list_{}.csv".format(busqueda), index=False)
chime.success()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment