Last active
August 25, 2022 11:22
-
-
Save xdanielc/967b35af486fcf726f0bace805fcaf7c to your computer and use it in GitHub Desktop.
Buscador en el archivo municipal de Valladolid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import selenium | |
| from selenium import webdriver | |
| from selenium.webdriver.support.ui import WebDriverWait | |
| from selenium.webdriver.support import expected_conditions as EC | |
| from selenium.webdriver.common.by import By | |
| from selenium.webdriver.chrome.service import Service | |
| from selenium.webdriver.chrome.options import Options | |
| from webdriver_manager.chrome import ChromeDriverManager | |
| from colorama import Fore, Back, Style | |
| import pandas as pd | |
| import chime | |
| import math | |
# * VARIABLES AND CONFIGURATION
pag_municipal = "http://www10.ava.es/amv/"  # base URL of the municipal archive
busqueda = 'plaza mayor noche'              # search term typed into the free-text field
counter_pages = 0                           # total result count; set by find_lenght()
current_entry = 1                           # 1-based index of the entry being scraped
df2 = pd.DataFrame()                        # accumulates one row per scraped image

options = Options()
options.add_argument("disable-extensions")
# BUG FIX: `options` was built but never passed to the driver, so the
# "disable-extensions" argument silently had no effect.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 30)  # default explicit wait used across the script
def wait_loading_overlay():
    """Block until the site's loading overlay has disappeared (up to 30 s)."""
    overlay = (By.CSS_SELECTOR, '.loading-overlay-content')
    wait.until(EC.invisibility_of_element_located(overlay))
def buttons_present():
    """Assert the document pagination buttons are present within 1 second.

    Raises selenium's TimeoutException otherwise (the caller catches it to
    detect a page that failed to load).

    BUG FIX: the original rebound a *local* name `wait` to a 1-second wait
    and then "restored" it with `wait = WebDriverWait(driver, 30)` — but that
    assignment was also local, i.e. dead code.  A separately named short wait
    makes clear the module-level 30 s `wait` is never touched.
    """
    short_wait = WebDriverWait(driver, 1)
    short_wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#document .pagination')))
def start_and_search():
    """Open the archive home page and launch a search for `busqueda`."""
    # * INITIALIZE
    driver.get(pag_municipal)
    dropdown = (By.XPATH, '//*[@id="dropdownMenuButton"]')
    wait.until(EC.presence_of_element_located(dropdown))
    wait_loading_overlay()
    wait.until(EC.element_to_be_clickable(dropdown)).click()
    # Pick the sixth category checkbox from the dropdown.
    category_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[1]/div/div[2]/div[2]/div[6]/label'
    wait.until(EC.element_to_be_clickable((By.XPATH, category_xpath))).click()
    # * SEARCH INPUT
    wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="fieldValueFreeText"]'))).send_keys(busqueda)
    # Toggle the search option, then submit the form.
    option_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[1]/div[2]/label'
    submit_xpath = '/html/body/div[1]/section/div/div/div/div[2]/div/div[1]/div/div[2]/div/div[2]/form/div[2]/button[1]'
    wait.until(EC.element_to_be_clickable((By.XPATH, option_xpath))).click()
    wait.until(EC.element_to_be_clickable((By.XPATH, submit_xpath))).click()
def find_lenght():
    """Read the result-count badge and store it in the global `counter_pages`.

    NOTE: the name keeps its original (misspelled) form because it is called
    elsewhere in this script.
    """
    global counter_pages
    wait_loading_overlay()
    badge = driver.find_element(By.CSS_SELECTOR, '.badge-primary')
    counter_pages = int(badge.text)
    print(f"Se han encontrado {counter_pages} elementos")
def select_first():
    """Open the first search result by clicking the second cell of its row."""
    first_result = (By.CSS_SELECTOR, 'td:nth-of-type(2)')
    wait.until(EC.element_to_be_clickable(first_result)).click()
def get_data_single_page(current_entry):
    """Scrape one document page and append its rows to the global `df2`.

    Extracts the identifier list, title, creation date and every image URL
    from the currently open document, aligns the four columns, and concats
    them onto the accumulator DataFrame.

    :param current_entry: 1-based index of the entry (only used by the
        commented-out per-entry CSV export at the bottom).
    """
    print(Fore.YELLOW, 'pag datos inicio')
    contenedor = driver.find_element(By.CSS_SELECTOR, '#body_document')
    # Identifier field is free text; normalise ':' and ',' to ';' and split.
    txt_identificadores = contenedor.find_element(By.XPATH, '/html/body/div[1]/section/div/div/div/div[2]/div/div[3]/div/div[2]/div[2]/div/div[1]/div[2]').text.replace(' ', '').replace(':', ';').replace(',', ';').split(';')
    titulo = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='TITULO']/..//P").text
    try:
        fecha = contenedor.find_element(By.XPATH, "//DIV[@class='field col-sm-2 col-xs-12'][text()='FECHA CREACION INICIAL']/..//P").text
    except Exception:  # was a bare except: the date field is optional on some documents
        fecha = 'desconocida'
    imagenes = contenedor.find_elements(By.CSS_SELECTOR, 'img')
    # * CHECK IF THERE ARE MULTIPLE IMAGES
    src_imagenes = [pag_municipal + str(img.get_dom_attribute('src')) for img in imagenes]
    titulos = []
    fechas = []
    # * HANDLES MULTIPLE IMAGES MISMATCH
    if len(src_imagenes) > len(txt_identificadores):
        for _ in range(len(src_imagenes)):
            titulos.append(titulo)
            fechas.append(fecha)
        # Pad the identifier column with the extra image URLs so lengths match.
        for i in range(1, len(src_imagenes)):
            txt_identificadores.append(src_imagenes[i])
    elif len(src_imagenes) == 1:
        print('dentro de elif')
        titulos.append(titulo)
        fechas.append(fecha)
    else:
        print('dentro de else')
        for _ in range(len(src_imagenes)):
            titulos.append(titulo)
            fechas.append(fecha)
    print("{} | {} | {} | {}".format(len(src_imagenes), len(txt_identificadores), len(titulos), len(fechas)))
    # * SAVE TO CSV
    print('Appending to complete list')
    try:
        df = pd.DataFrame({'Id': txt_identificadores, 'URL': src_imagenes, 'Titulos': titulos, 'Fecha': fechas})
    except ValueError:  # was a bare except: columns of unequal length — let pd.Series pad with NaN
        df = pd.DataFrame({'Id': pd.Series(txt_identificadores), 'URL': pd.Series(src_imagenes), 'Titulos': pd.Series(titulos), 'Fecha': pd.Series(fechas)})
    global df2
    df2 = pd.concat([df2, df], ignore_index=True)
    # * UNCOMMENT TO SAVE EACH ONE
    # df.to_csv(f'lista_imagenes_{current_entry}.csv', index=False)
def back_to_list_and_continue():
    """Return to the results list, jump to the page holding `current_entry`,
    and re-open that entry.

    Used as the recovery path when a document page fails to load.
    """
    global current_entry  # read-only here; declared for symmetry with the rest of the script
    # Click the "results list" tab.
    wait.until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/section/div/div/div/div[1]/div/div/ul/li[2]/a'))).click()
    page_togo = math.ceil(current_entry / 5)  # the list shows 5 results per page
    script = "searchPage('9FBA3EC3CB10', '{}', '', '')".format(page_togo)
    driver.execute_script(script)
    modulo = current_entry % 5
    print(str(modulo) + ' es el modulo del archivo actual')
    css_first = "tbody tr:nth-child(5) td:nth-of-type(2)"
    css_other = "tbody tr:nth-child({}) td:nth-of-type(2)".format(modulo)
    # * HANDLE SELECTING N ELEMENT
    if modulo == 0:
        css_other = css_first  # entries 5, 10, 15, ... sit in the last row of their page
    print("{} element selected".format(css_other))
    wait_loading_overlay()
    # BUG FIX: the original always clicked css_first (row 5), ignoring the
    # computed css_other — it re-opened the wrong entry whenever
    # current_entry % 5 != 0.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, css_other))).click()
    wait_loading_overlay()
    buttons_present()
# * MAIN SCRIPT: search, then walk every result page and scrape it.
start_and_search()
find_lenght()  # * SETS counter_pages
select_first()
while current_entry <= counter_pages:
    wait_loading_overlay()  # * coming from the list or from the previous page
    print(Fore.BLACK, Back.WHITE, "Ahora estamos en {}".format(current_entry), end='')
    print(Style.RESET_ALL, end='')
    try:
        # * PAGE IS OK
        buttons_present()
    except Exception:  # was a bare except (TimeoutException from buttons_present)
        # * PAGE DIDN'T LOAD
        print('La página no cargó, volviendo al listado')
        back_to_list_and_continue()
    get_data_single_page(current_entry)
    # * NEXT PAGE
    buttons = driver.find_elements(By.CSS_SELECTOR, '.page-link')
    for boton in buttons:
        try:
            if str(boton.get_attribute('title')) == 'Documento siguiente':
                print(Fore.CYAN, 'Buscando boton de siguiente')
                wait.until(EC.element_to_be_clickable(boton))
                wait_loading_overlay()
                boton.click()
                # BUG FIX: stop scanning once "next" is clicked — the page is
                # navigating and the remaining elements are stale, which
                # previously raised and spuriously fired chime.error().
                break
        except Exception:  # was a bare except; best-effort skip of a broken button
            chime.error()
            print('Item saltado')
    current_entry += 1
df2.to_csv("complete_list_{}.csv".format(busqueda), index=False)
chime.success()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment