Skip to content

Instantly share code, notes, and snippets.

@xdanielc
Last active August 25, 2022 11:22
Show Gist options
  • Select an option

  • Save xdanielc/967b35af486fcf726f0bace805fcaf7c to your computer and use it in GitHub Desktop.

Select an option

Save xdanielc/967b35af486fcf726f0bace805fcaf7c to your computer and use it in GitHub Desktop.
Buscador en el Archivo Municipal de Valladolid
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from colorama import Fore, Back, Style
import pandas as pd
import chime
import math
# * VARIABLES AND CONFIGURATION
# Entry URL of the archive site; also used as the prefix when building image URLs.
pag_municipal = "http://www10.ava.es/amv/"
# Free-text query typed into the search form (also used in the output CSV name).
busqueda = 'plaza mayor noche'
# Number of results reported by the site; set by find_lenght().
counter_pages = 0
# Index of the entry currently being scraped (starts at 1).
current_entry = 1
# Accumulates the rows collected from every scraped entry.
df2 = pd.DataFrame()
options = Options()
options.add_argument("disable-extensions")
# The options object must be handed to the driver explicitly; otherwise Chrome
# starts with default settings and the argument above has no effect.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
# Default explicit wait shared by the helpers below: up to 30 seconds.
wait = WebDriverWait(driver, 30)
def wait_loading_overlay():
    """Block until the '.loading-overlay-content' element is no longer visible.

    Uses the module-level ``wait``, so its timeout applies.
    """
    overlay_locator = (By.CSS_SELECTOR, '.loading-overlay-content')
    overlay_gone = EC.invisibility_of_element_located(overlay_locator)
    wait.until(overlay_gone)
def buttons_present():
    """Check that the document pagination controls are present in the DOM.

    A dedicated 1-second wait is used instead of the module-level ``wait`` so
    that a page which failed to load is detected quickly. The module-level
    ``wait`` is left untouched.

    Raises:
        selenium.common.exceptions.TimeoutException: if the pagination element
            does not appear within one second.
    """
    short_wait = WebDriverWait(driver, 1)
    pagination_locator = (By.CSS_SELECTOR, '#document .pagination')
    short_wait.until(EC.presence_of_element_located(pagination_locator))
def start_and_search():
    """Open the archive site, choose a dropdown option and submit the query.

    The steps run strictly in order: load the page, open the dropdown menu,
    click one of its labels, type ``busqueda`` into the free-text field, click
    a label in the search form and finally press the form's first button.
    """
    menu_button = (By.XPATH, '//*[@id="dropdownMenuButton"]')
    # Absolute paths into the page layout; which option each label stands for
    # is not visible from this script.
    menu_option_label = (By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[1]/div/div[2]/div[2]/div[6]/label')
    free_text_input = (By.XPATH, '//*[@id="fieldValueFreeText"]')
    form_option_label = (By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[1]/div[2]/label')
    submit_button = (By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[2]/button[1]')

    def when_clickable(locator):
        # Wait for the element to become clickable and hand it back.
        return wait.until(EC.element_to_be_clickable(locator))

    # * INITIALIZE
    driver.get(pag_municipal)
    wait.until(EC.presence_of_element_located(menu_button))
    wait_loading_overlay()
    when_clickable(menu_button).click()
    when_clickable(menu_option_label).click()

    # * SEARCH INPUT
    when_clickable(free_text_input).send_keys(busqueda)
    when_clickable(form_option_label).click()
    when_clickable(submit_button).click()
def find_lenght():
    """
    Read the number of results of the query.

    Waits for the loading overlay to go away, parses the text of the
    '.badge-primary' element as an integer, stores it in the module-level
    ``counter_pages`` and prints it.
    """
    global counter_pages
    wait_loading_overlay()
    badge = driver.find_element(By.CSS_SELECTOR, '.badge-primary')
    counter_pages = int(badge.text)
    print("Se han encontrado {} elementos".format(counter_pages))
def select_first():
    """Click the first clickable element matching 'td:nth-of-type(2)'."""
    second_cell = (By.CSS_SELECTOR, 'td:nth-of-type(2)')
    cell = wait.until(EC.element_to_be_clickable(second_cell))
    cell.click()
def get_data_single_page(current_entry):
    """
    Scrape the document page that is currently open and append it to ``df2``.

    Collects the identifier list, the title, the creation date (or
    'desconocida' when the date field is missing) and the URL of every image
    inside '#body_document'. The title and date are repeated once per image.
    The rows are then concatenated onto the module-level ``df2``.

    Args:
        current_entry: Index of the entry being scraped. Only needed by the
            optional per-entry CSV export at the end of the function.
    """
    # Local import keeps this function self-contained.
    from selenium.common.exceptions import NoSuchElementException

    global df2

    print(Fore.YELLOW, 'pag datos inicio')
    contenedor = driver.find_element(By.CSS_SELECTOR, '#body_document')

    # Normalise the identifier text to a ';'-separated list: drop spaces and
    # treat ':' and ',' as separators too.
    txt_identificadores = (
        contenedor.find_element(By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[3]/div/div[2]/div[2]/div/div[1]/div[2]')
        .text.replace(' ', '').replace(':', ';').replace(',', ';').split(';')
    )
    titulo = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='TITULO']/..//P").text
    try:
        fecha = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='FECHA CREACION INICIAL']/..//P").text
    except NoSuchElementException:
        # Only a missing date field is expected here; anything else should surface.
        fecha = 'desconocida'

    imagenes = contenedor.find_elements(By.CSS_SELECTOR, 'img')
    src_imagenes = [
        pag_municipal + str(imagen.get_dom_attribute('src'))
        for imagen in imagenes
    ]

    # One title and one date per image, whichever case applies below.
    n_imagenes = len(src_imagenes)
    titulos = [titulo] * n_imagenes
    fechas = [fecha] * n_imagenes

    # * HANDLES MULTIPLE IMAGES MISMATCH
    if n_imagenes > len(txt_identificadores):
        # NOTE(review): the identifier list is padded with the image URLs from
        # the second image onwards; lengths may still differ afterwards, which
        # the Series-based fallback below absorbs.
        txt_identificadores.extend(src_imagenes[1:])
    elif n_imagenes == 1:
        print('dentro de elif')
    else:
        print('dentro de else')
    print("{} | {} | {} | {}".format(len(src_imagenes), len(txt_identificadores), len(titulos), len(fechas)))

    # * SAVE TO CSV
    print('Appending to complete list')
    try:
        df = pd.DataFrame({'Id': txt_identificadores, 'URL': src_imagenes, 'Titulos': titulos, 'Fecha': fechas})
    except ValueError:
        # Columns of unequal length: wrapping each one in a Series lets pandas
        # pad the shorter columns with NaN instead of failing.
        df = pd.DataFrame({'Id': pd.Series(txt_identificadores), 'URL': pd.Series(src_imagenes), 'Titulos': pd.Series(titulos), 'Fecha': pd.Series(fechas)})
    df2 = pd.concat([df2, df], ignore_index=True)
    # * UNCOMMENT TO SAVE EACH ONE
    # df.to_csv(f'lista_imagenes_{current_entry}.csv', index=False)
def back_to_list_and_continue():
    """Return to the result list and reopen the entry given by ``current_entry``.

    Clicks the link back to the list, jumps to the result page containing the
    current entry through the site's ``searchPage`` JavaScript function, and
    clicks the row of that entry. The row is ``current_entry`` modulo the
    page size, with a remainder of 0 meaning the last row of the page.
    Finally it waits for the document page and checks that its pagination
    buttons are present.

    Raises:
        selenium.common.exceptions.TimeoutException: if any awaited element
            does not show up in time.
    """
    # The row arithmetic below assumes five results per list page.
    rows_per_page = 5

    wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/section/div/div/div/div[1]/div/div/ul/li[2]/a'))).click()
    page_togo = math.ceil(current_entry / rows_per_page)
    # NOTE(review): the first argument is a hard-coded identifier; confirm it
    # stays valid across sessions and searches.
    script = "searchPage('9FBA3EC3CB10', '{}', '', '')".format(page_togo)
    driver.execute_script(script)

    modulo = current_entry % rows_per_page
    print(str(modulo) + ' es el modulo del archivo actual')

    # * HANDLE SELECTING N ELEMENT
    # A remainder of 0 means the entry sits on the last row of its page.
    row_index = modulo if modulo != 0 else rows_per_page
    css_target = "tbody tr:nth-child({}) td:nth-of-type(2)".format(row_index)
    print("{} element selected".format(css_target))

    wait_loading_overlay()
    # Click the computed row; always clicking the fifth row would reopen the
    # wrong entry whenever the remainder is not 0.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_target))).click()
    wait_loading_overlay()
    buttons_present()
# * MAIN SCRIPT
# Runs the search, walks through every result with the "next document" button,
# scrapes each page and writes everything to one CSV named after the query.
from selenium.common.exceptions import TimeoutException, WebDriverException

try:
    start_and_search()
    find_lenght()  # * SETS counter_pages
    select_first()
    while current_entry <= counter_pages:
        wait_loading_overlay()  # * WE ARRIVE FROM THE LIST OR FROM THE PREVIOUS PAGE
        print(Fore.BLACK, Back.WHITE, "Ahora estamos en {}".format(current_entry), end='')
        print(Style.RESET_ALL, end='')
        try:
            # * PAGE IS OK
            buttons_present()
        except TimeoutException:
            # * PAGE DIDN'T LOAD
            print('La página no cargó, volviendo al listado')
            back_to_list_and_continue()
        get_data_single_page(current_entry)
        # * NEXT PAGE
        buttons = driver.find_elements(By.CSS_SELECTOR, '.page-link')
        for boton in buttons:
            try:
                if str(boton.get_attribute('title')) == 'Documento siguiente':
                    print(Fore.CYAN, 'Buscando boton de siguiente')
                    wait.until(EC.element_to_be_clickable(boton))
                    wait_loading_overlay()
                    boton.click()
                    # Stop after the first successful click: the remaining
                    # handles belong to the page we just left, and probing
                    # them would only produce spurious errors.
                    break
            except WebDriverException:
                # Best effort: report the failure and try the next button.
                chime.error()
                print('Item saltado')
        current_entry += 1
    df2.to_csv("complete_list_{}.csv".format(busqueda), index=False)
    chime.success()
finally:
    # Always close the browser and the driver process, even after a failure.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment