Skip to content

Instantly share code, notes, and snippets.

@mesailde
Last active December 18, 2020 15:44
Show Gist options
  • Select an option

  • Save mesailde/dd6d24d0c27ddcb903ed to your computer and use it in GitHub Desktop.

Select an option

Save mesailde/dd6d24d0c27ddcb903ed to your computer and use it in GitHub Desktop.
Script para extração de CVs Lattes a partir de um arquivo CSV
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os, sys, re, time, traceback, csv, base64, io, zipfile
import suds
import suds.client
from optparse import OptionParser
from retry import retry
class WSCurriculo(suds.client.Client):
    """suds SOAP client bound to the WSCurriculo WSDL endpoint.

    Each public method wraps one remote operation exposed through
    ``self.service``.  Remote calls are retried up to three times, one
    second apart, via the ``retry`` decorator.
    """

    def __init__(self):
        """Create the client from the hard-coded WSDL URL."""
        suds.client.Client.__init__(
            self,
            'https://cnpqwsproxy.ufscar.br:7443/srvcurriculo/WSCurriculo?wsdl')

    @retry(tries=3, delay=1)
    def obterCV(self, idCNPq):
        """Return the raw bytes of the curriculum identified by ``idCNPq``.

        The response of ``getCurriculoCompactado`` is base64-decoded and
        opened as a ZIP archive; the content of the first archive member
        is returned.
        """
        compactado = base64.b64decode(
            self.service.getCurriculoCompactado(id=idCNPq))
        xmlz = zipfile.ZipFile(io.BytesIO(compactado))
        try:
            return xmlz.read(xmlz.namelist()[0])
        finally:
            # Release the archive even when reading the member fails.
            xmlz.close()

    def obterIdCNPq(self, *args):
        """Look up a person's CNPq identifier.

        Call as ``obterIdCNPq(cpf)`` or as
        ``obterIdCNPq(nomeCompleto, dataNascimento)``.

        Raises:
            ValueError: if called with any other number of arguments.
                Raised immediately: the argument check deliberately sits
                outside the retried remote call, since retrying cannot
                fix a caller error.
        """
        if len(args) == 1:
            cpf, nomeCompleto, dataNascimento = args[0], '', ''
        elif len(args) == 2:
            cpf = ''
            nomeCompleto, dataNascimento = args
        else:
            raise ValueError('obterIdCNPq deve receber 1 ou 2 parâmetros (cpf ou nomeCompleto e dataNascimento)')
        return self._consultarIdCNPq(cpf, nomeCompleto, dataNascimento)

    @retry(tries=3, delay=1)
    def _consultarIdCNPq(self, cpf, nomeCompleto, dataNascimento):
        """Perform the (retried) remote ``getIdentificadorCNPq`` call."""
        return self.service.getIdentificadorCNPq(
            cpf=cpf, nomeCompleto=nomeCompleto, dataNascimento=dataNascimento)

    @retry(tries=3, delay=1)
    def obterOcorrencia(self, idCNPq):
        """Return the result of the remote ``getOcorrenciaCV`` call for ``idCNPq``.

        NOTE(review): presumably a status/issue description for the CV;
        confirm against the service documentation.
        """
        return self.service.getOcorrenciaCV(id=idCNPq)
def onlyNumbers(s):
    """Return ``s`` with every non-digit character removed."""
    digits = re.findall(r'\d', s)
    return ''.join(digits)
def process(filename, destdir='.', outfile=None, verbose=False):
    """Resolve the idCNPq of each person in a CSV file and save their CV.

    The input is read as a ';'-delimited CSV with a header row.  It must
    contain a ``cpf`` column and/or both ``nome`` and ``nascimento``
    columns; otherwise a message is written to stderr and the process
    exits with status 1.

    For every row the identifier is looked up by CPF (digits only) or,
    when the CPF is empty, by name and birth date.  Rows whose identifier
    cannot be obtained are reported on stderr and skipped.  For the
    remaining rows the CV is downloaded and written to
    ``<destdir>/lattes<idCNPq>.xml``.

    Args:
        filename: path of the input CSV file.
        destdir: directory where the CV files are written.
        outfile: optional path of an output CSV; when given, each row with
            a resolved identifier is copied there with an extra ``idCNPq``
            column (all fields quoted, ';' delimiter).
        verbose: when true, progress messages (and the traceback of failed
            identifier lookups) are written to stderr.
    """
    idCNPqField = 'idCNPq'
    cpfField = 'cpf'
    nomeNascimentoFields = ('nome', 'nascimento')

    ws = WSCurriculo()
    csvout = open(outfile, 'w') if outfile is not None else None
    try:
        with open(filename, 'r') as csvin:
            csvreader = csv.DictReader(csvin, delimiter=';')
            # fieldnames is None when the input file is empty; treat that
            # as "no columns" so the usage message below is shown.
            fieldnames = csvreader.fieldnames or []
            if cpfField not in fieldnames and not set(nomeNascimentoFields).issubset(fieldnames):
                sys.stderr.write(u'Forneça um arquivo de entrada contendo:\n')
                sys.stderr.write(u' * Uma coluna chamada "%s" com o CPF de cada pessoa; e/ou\n' % cpfField)
                sys.stderr.write(u' * Colunas "%s" e "%s" com o nome completo e data de nascimento\n' % nomeNascimentoFields)
                sys.exit(1)

            csvwriter = None
            if csvout:
                csvwriter = csv.DictWriter(csvout, fieldnames=fieldnames + [idCNPqField], delimiter=';', quoting=csv.QUOTE_ALL)
                csvwriter.writeheader()

            for row in csvreader:
                # Short rows yield None for the missing columns; normalise
                # to '' so onlyNumbers() and the lookup get strings.
                persondata = (onlyNumbers(row.get(cpfField) or ''),)
                if persondata == ('',):
                    # No usable CPF: fall back to name + birth date.
                    persondata = tuple(row.get(x) or '' for x in nomeNascimentoFields)

                if verbose:
                    sys.stderr.write(u'=> Obtendo idCNPq de %s\n' % repr(persondata))
                try:
                    row[idCNPqField] = ws.obterIdCNPq(*persondata)
                except Exception:
                    # Best effort: a failed lookup is handled like "not
                    # found", but Ctrl-C / SystemExit still propagate.
                    if verbose:
                        traceback.print_exc()
                    row[idCNPqField] = None
                if row[idCNPqField] is None:
                    sys.stderr.write(u'=> idCNPQ não encontrado para %s!\n' % repr(persondata))
                    continue

                if csvwriter:
                    csvwriter.writerow(row)

                if verbose:
                    sys.stderr.write(u'=> Obtendo CV do idCNPq %s\n' % row[idCNPqField])
                try:
                    cv = ws.obterCV(row[idCNPqField])
                except Exception:
                    # Ask the service why the CV is unavailable, so the
                    # message below can include it.
                    ocorrencia = None
                    try:
                        ocorrencia = ws.obterOcorrencia(row[idCNPqField])
                    except Exception:
                        traceback.print_exc()
                    sys.stderr.write(u'Impossível obter CV do idCNPq %s: %s\n' % (row[idCNPqField], repr(ocorrencia)))
                    continue

                with open(os.path.join(destdir, 'lattes%s.xml' % row[idCNPqField]), 'wb') as xmlout:
                    xmlout.write(cv)
    finally:
        # Close the output CSV on every exit path, including sys.exit().
        if csvout:
            csvout.close()
def main():
    """Command-line entry point: parse the options and run ``process``.

    Exactly one positional argument (the input CSV path) is required;
    otherwise the parser reports an error and exits.
    """
    parser = OptionParser(usage=u'uso: %prog [opções] entrada.csv')

    # (flags, keyword settings) for each option, registered in order.
    option_specs = (
        (('-o', '--output'),
         dict(dest='outfile',
              help=u'Grava um CSV com idCNPq no ARQUIVO',
              metavar='ARQUIVO')),
        (('-d', '--destdir'),
         dict(dest='destdir',
              help=u'Grava os CVs no DIRETORIO',
              metavar='DIRETORIO',
              default='.')),
        (('-v', '--verbose'),
         dict(action='store_true', dest='verbose', default=False)),
    )
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    opts, positional = parser.parse_args()
    if len(positional) != 1:
        parser.error(u'Forneça um arquivo CSV de entrada')

    infile = positional[0]
    process(infile, opts.destdir, opts.outfile, opts.verbose)
# Run the command-line interface only when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment