Created
January 22, 2026 19:08
-
-
Save jtrecenti/7c8771600a060b91581acb5aae18a2d8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import juscraper as jus | |
| import pandas as pd | |
| import numpy as np | |
| import requests | |
| import os | |
| import json | |
# Scraper object for the 'datajud' source; the download loop below calls
# its listar_processos method.
datajud = jus.scraper("datajud")
# Subject ("assunto") codes to query; each one is combined with every court.
assuntos = [
    12480, 15218, 15221, 15514, 12520, 12507,
    12508, 12509, 12510, 12481, 12485, 12498,
    12497, 12499, 12484, 12496, 12492, 12495,
    12494, 12493, 12483, 12505, 12506, 12511,
    12518, 12512, 12513, 12514, 12515, 12516,
    12517, 14759, 12491, 12501, 12502, 12503,
    12500, 12504, 12519, 12482, 12486, 12490,
    12487, 12488, 12489, 14760,
    # To review: the codes below are listed under the civil section.
    10503, 9995, 10434, 10440,
]

# Court acronyms passed as the `tribunal` argument: the TJ* entries first,
# followed by TRF1 through TRF6.
tribunais = [
    "TJAC", "TJAL", "TJAM", "TJAP", "TJBA", "TJCE",
    "TJDFT", "TJES", "TJGO", "TJMA", "TJMG", "TJMS",
    "TJMT", "TJPA", "TJPB", "TJPE", "TJPI", "TJPR",
    "TJRJ", "TJRN", "TJRO", "TJRR", "TJRS", "TJSC",
    "TJSE", "TJSP", "TJTO",
    *(f"TRF{numero}" for numero in range(1, 7)),
]
# Download one parquet file per (subject, court) pair. Pairs whose file is
# already on disk are skipped, so an interrupted run can simply be restarted.
#
# The output directory is created up front: without it every to_parquet call
# below would fail and the error would only surface as a printed message.
os.makedirs("datajud", exist_ok=True)

for assunto in assuntos:
    for tribunal in tribunais:
        f = f"datajud/processos_{assunto}_{tribunal}.parquet"
        if os.path.exists(f):
            continue
        try:
            print(f"Assunto: {assunto}, Tribunal: {tribunal}...")
            processos = datajud.listar_processos(
                assuntos=[assunto],
                tribunal=tribunal,
                mostrar_movs=False,
            )
            print(f"Processos encontrados: {len(processos)}")
            # NOTE: pairs with zero results write no file, so they are
            # queried again on the next run and show up as "missing" later.
            if len(processos) > 0:
                # These columns are stored as JSON text; presumably they hold
                # nested objects that should not go to parquet as-is
                # (TODO confirm against the listar_processos output).
                for coluna in ("assuntos", "classe", "formato"):
                    processos[coluna] = processos[coluna].apply(json.dumps)
                processos.to_parquet(f)
        except Exception as e:
            # Deliberate best-effort: report the failing pair and move on so
            # one bad request does not abort the whole crawl.
            print(f"Erro ao baixar processos para assunto {assunto} e tribunal {tribunal}: {e}")
# Work out which (subject, court) files were not downloaded.
lista_arquivos = os.listdir("datajud")

# Every file name the download step could have produced.
todas_combinacoes = [
    f"processos_{assunto}_{tribunal}.parquet"
    for assunto in assuntos
    for tribunal in tribunais
]

# Expected names that are absent from the output directory.
arquivos_faltantes = [
    nome for nome in todas_combinacoes if nome not in lista_arquivos
]

# Bare expression: the count is only displayed in an interactive session.
len(arquivos_faltantes)
# Read every downloaded parquet file and concatenate them into one table.
from tqdm import tqdm

# Only *.parquet entries are read (a stray file in the directory would make
# read_parquet fail); sorting makes the row order reproducible across runs.
arquivos = sorted(
    nome for nome in os.listdir("datajud") if nome.endswith(".parquet")
)

# Collect the frames first and concatenate once: calling pd.concat inside
# the loop recopies the accumulated data on every iteration.
partes = [pd.read_parquet(f"datajud/{arquivo}") for arquivo in tqdm(arquivos)]

# pd.concat raises on an empty list, so fall back to an empty frame.
processos = pd.concat(partes, ignore_index=True) if partes else pd.DataFrame()
processos.to_parquet("lista_judsaude.parquet")
| import numpy as np | |
def safe_serialize(x):
    """Return *x* as a JSON string if it is a container, otherwise unchanged.

    Dicts, lists and NumPy arrays are encoded with ``json.dumps`` (arrays are
    converted with ``tolist()`` first). Any other value, such as a string
    that is already serialized or ``None``, is returned as-is.

    NumPy arrays and NumPy scalars nested inside the container are converted
    to plain Python values during encoding; without that, ``json.dumps``
    raises ``TypeError`` on them.

    Raises:
        TypeError: if the container holds a value that is neither JSON
            serializable nor a NumPy array/scalar.
    """

    def _to_builtin(obj):
        # Called by json.dumps only for objects it cannot encode itself.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable"
        )

    if isinstance(x, np.ndarray):
        payload = x.tolist()
    elif isinstance(x, (dict, list)):
        payload = x
    else:
        return x
    return json.dumps(payload, default=_to_builtin)
# Run safe_serialize over the three nested columns, then rewrite the
# combined file.
for coluna in ("assuntos", "classe", "formato"):
    processos[coluna] = processos[coluna].apply(safe_serialize)

processos.to_parquet("lista_judsaude.parquet")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment