# Gist jtrecenti/7c8771600a060b91581acb5aae18a2d8, created by @jtrecenti on January 22, 2026 19:08
import json
import os

import juscraper as jus
import numpy as np
import pandas as pd
from tqdm import tqdm

# DataJud scraper from juscraper
datajud = jus.scraper('datajud')

# directory that will hold one parquet file per (subject, court) pair
os.makedirs("datajud", exist_ok=True)
# CNJ subject (assunto) codes to download
assuntos = [
    12480, 15218, 15221, 15514, 12520, 12507, 12508, 12509, 12510,
    12481, 12485, 12498, 12497, 12499, 12484, 12496, 12492, 12495,
    12494, 12493, 12483, 12505, 12506, 12511, 12518, 12512, 12513,
    12514, 12515, 12516, 12517, 14759, 12491, 12501, 12502, 12503,
    12500, 12504, 12519, 12482, 12486, 12490, 12487, 12488, 12489,
    14760,
    10503, 9995, 10434, 10440  # to review: these codes sit under the civil section
]
# state courts (TJ) and federal regional courts (TRF) covered by the query
tribunais = [
    "TJAC", "TJAL", "TJAM", "TJAP", "TJBA", "TJCE", "TJDFT", "TJES",
    "TJGO", "TJMA", "TJMG", "TJMS", "TJMT", "TJPA", "TJPB", "TJPE",
    "TJPI", "TJPR", "TJRJ", "TJRN", "TJRO", "TJRR", "TJRS", "TJSC",
    "TJSE", "TJSP", "TJTO",
    "TRF1", "TRF2", "TRF3", "TRF4", "TRF5", "TRF6"
]
# download: one parquet file per (subject, court) pair, skipping files that already exist
for assunto in assuntos:
    for tribunal in tribunais:
        try:
            f = f"datajud/processos_{assunto}_{tribunal}.parquet"
            if not os.path.exists(f):
                print(f"Subject: {assunto}, court: {tribunal}...")
                processos = datajud.listar_processos(
                    assuntos=[assunto],
                    tribunal=tribunal,
                    mostrar_movs=False
                )
                print(f"Cases found: {len(processos)}")
                if len(processos) > 0:
                    # nested columns need to be JSON strings before writing to parquet
                    processos['assuntos'] = processos['assuntos'].apply(json.dumps)
                    processos['classe'] = processos['classe'].apply(json.dumps)
                    processos['formato'] = processos['formato'].apply(json.dumps)
                    processos.to_parquet(f)
        except Exception as e:
            print(f"Error downloading cases for subject {assunto} and court {tribunal}: {e}")
# list the (subject, court) combinations that were not downloaded
lista_arquivos = os.listdir("datajud")
todas_combinacoes = []
for assunto in assuntos:
    for tribunal in tribunais:
        todas_combinacoes.append(f"processos_{assunto}_{tribunal}.parquet")
arquivos_faltantes = [arquivo for arquivo in todas_combinacoes if arquivo not in lista_arquivos]
print(f"Missing files: {len(arquivos_faltantes)}")
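# Optional retry sketch (not in the original gist): re-run the download only for the
# missing combinations, recovering (assunto, tribunal) from the filename pattern used
# above. It reuses the same listar_processos call; treat it as an illustration.
for arquivo in arquivos_faltantes:
    nome = arquivo.removesuffix(".parquet")  # "processos_<assunto>_<tribunal>"
    _, assunto, tribunal = nome.split("_")
    try:
        processos = datajud.listar_processos(
            assuntos=[int(assunto)],
            tribunal=tribunal,
            mostrar_movs=False
        )
        if len(processos) > 0:
            for col in ['assuntos', 'classe', 'formato']:
                processos[col] = processos[col].apply(json.dumps)
            processos.to_parquet(f"datajud/{arquivo}")
    except Exception as e:
        print(f"Retry failed for {arquivo}: {e}")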
# read every downloaded parquet file and concatenate into a single data frame
arquivos = os.listdir("datajud")
dfs = []
for arquivo in tqdm(arquivos):
    dfs.append(pd.read_parquet(f"datajud/{arquivo}"))
processos = pd.concat(dfs, ignore_index=True)
# parquet cannot store nested python objects directly, so serialize anything that is
# still a dict, list, or numpy array (lists come back as arrays from parquet) to JSON
def safe_serialize(x):
    if isinstance(x, (dict, list)):
        return json.dumps(x)
    elif isinstance(x, np.ndarray):
        return json.dumps(x.tolist())
    else:
        return x

processos['assuntos'] = processos['assuntos'].apply(safe_serialize)
processos['classe'] = processos['classe'].apply(safe_serialize)
processos['formato'] = processos['formato'].apply(safe_serialize)

processos.to_parquet("lista_judsaude.parquet")
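# Quick sanity check (illustrative addition, not in the original gist): read the
# consolidated file back and decode one JSON-encoded column; column names follow
# the script above.
check = pd.read_parquet("lista_judsaude.parquet")
print(check.shape)
print(check['assuntos'].dropna().head().apply(json.loads))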