rflamary · July 26, 2019 14:13
diff --git a/parse_scolar.py b/parse_scolar.py

 from bs4 import BeautifulSoup
 import urllib



 def parse_scholar_page(url, paper_list=False):
     """Download a scholar profile page and extract its citation statistics.

     The page is fetched with ``urllib.request.urlopen`` and parsed with
     BeautifulSoup's ``html.parser`` backend. Elements are looked up by the
     ``gsc_*`` ids/classes used below (presumably the Google Scholar profile
     layout -- verify against the live page if results look wrong).

     Args:
         url: Address of the profile page, passed straight to ``urlopen``.
         paper_list: When true, also collect one entry per ``gsc_a_tr``
             table row and return them under ``'papers_list'``.

     Returns:
         dict with the keys ``'name'``, ``'citations'``, ``'h_index'``,
         ``'years_list'`` and ``'citations_list'``. When ``paper_list`` is
         true it also has ``'papers_list'``, a list of dicts with the keys
         ``'title'``, ``'authors'``, ``'published_in'``, ``'year'`` and
         ``'citations'``. Numeric fields that cannot be parsed are 0.

     Raises:
         AttributeError, IndexError: If an expected element is missing from
             the page (the lookups are not guarded).
         urllib.error.URLError: If the page cannot be fetched.
     """
     # A bare ``import urllib`` does not load the ``request`` submodule, so
     # import it explicitly instead of relying on some other module having
     # already done so.
     import urllib.request

     def toint(v):
         """Convert text to int, returning 0 when it is not a valid integer."""
         try:
             return int(v)
         except ValueError:
             return 0

     # Close the HTTP response as soon as the body has been read.
     with urllib.request.urlopen(url) as response:
         web_content = response.read()
     soup = BeautifulSoup(web_content, 'html.parser')

     #%% Citations stats
     name = soup.find('div', {"id": "gsc_prf_in"}).text

     stats_rows = soup.find('table', {"id": "gsc_rsb_st"}).find_all('tr')
     # Row 0 is skipped (presumably the table header); the first value cell
     # of rows 1 and 2 is read as the citation count and the h-index.
     citations = toint(stats_rows[1].find('td', {"class": "gsc_rsb_std"}).text)
     h_index = toint(stats_rows[2].find('td', {"class": "gsc_rsb_std"}).text)

     #%% citation graph
     years_list = [toint(tag.text)
                   for tag in soup.find_all("span", {"class": "gsc_g_t"})]
     citations_list = [toint(tag.text)
                       for tag in soup.find_all("span", {"class": "gsc_g_al"})]

     res = {'name': name, 'citations': citations, 'h_index': h_index,
            'years_list': years_list, 'citations_list': citations_list}

     #%% papers list
     if paper_list:
         papers_list = []
         for row in soup.find_all("tr", {"class": "gsc_a_tr"}):
             title = row.find('a', {"class": "gsc_a_at"}).text
             infos = row.find_all('div', {"class": "gs_gray"})
             # Everything before the last comma is the venue, the remainder
             # is the year; without a comma the venue is empty.
             venue, _, year = infos[1].text.rpartition(',')
             cited = row.find('a', {"class": "gsc_a_ac gs_ibl"}).text
             papers_list.append({
                 'title': title,
                 'authors': infos[0].text,
                 'published_in': venue,
                 'year': toint(year),
                 'citations': toint(cited),
             })
         res['papers_list'] = papers_list

     return res

	from bs4 import BeautifulSoup
	import urllib



	def parse_scholar_page(url, paper_list=False):
	    """Fetch a scholar profile page and return its citation statistics.

	    The page is downloaded with ``urllib.request.urlopen`` and parsed with
	    BeautifulSoup's ``html.parser`` backend. Elements are located through
	    the ``gsc_*`` ids/classes used below (presumably the Google Scholar
	    profile layout -- verify against the live page).

	    Args:
	        url: Address of the profile page, passed straight to ``urlopen``.
	        paper_list: When true, also collect one entry per ``gsc_a_tr``
	            table row and return them under ``'papers_list'``.

	    Returns:
	        dict with the keys ``'name'``, ``'citations'``, ``'h_index'``,
	        ``'years_list'`` and ``'citations_list'``. When ``paper_list`` is
	        true it also has ``'papers_list'``, a list of dicts with the keys
	        ``'title'``, ``'authors'``, ``'published_in'``, ``'year'`` and
	        ``'citations'``. Numeric fields that cannot be parsed are 0.

	    Raises:
	        AttributeError, IndexError: If an expected element is missing from
	            the page (the lookups are not guarded).
	        urllib.error.URLError: If the page cannot be fetched.
	    """
	    # ``import urllib`` on its own does not make ``urllib.request``
	    # available, so load the submodule explicitly.
	    import urllib.request

	    def toint(v):
	        """Convert text to int, returning 0 when it is not a valid integer."""
	        try:
	            return int(v)
	        except ValueError:
	            return 0

	    # The context manager closes the HTTP response once the body is read.
	    with urllib.request.urlopen(url) as response:
	        web_content = response.read()
	    soup = BeautifulSoup(web_content, 'html.parser')

	    #%% Citations stats
	    name = soup.find('div', {"id": "gsc_prf_in"}).text

	    stats_rows = soup.find('table', {"id": "gsc_rsb_st"}).find_all('tr')
	    # Row 0 is skipped (presumably the table header); the first value cell
	    # of rows 1 and 2 is read as the citation count and the h-index.
	    citations = toint(stats_rows[1].find('td', {"class": "gsc_rsb_std"}).text)
	    h_index = toint(stats_rows[2].find('td', {"class": "gsc_rsb_std"}).text)

	    #%% citation graph
	    years_list = [toint(tag.text)
	                  for tag in soup.find_all("span", {"class": "gsc_g_t"})]
	    citations_list = [toint(tag.text)
	                      for tag in soup.find_all("span", {"class": "gsc_g_al"})]

	    res = {'name': name, 'citations': citations, 'h_index': h_index,
	           'years_list': years_list, 'citations_list': citations_list}

	    #%% papers list
	    if paper_list:
	        papers_list = []
	        for row in soup.find_all("tr", {"class": "gsc_a_tr"}):
	            title = row.find('a', {"class": "gsc_a_at"}).text
	            infos = row.find_all('div', {"class": "gs_gray"})
	            # Text before the last comma is the venue, the rest is the
	            # year; without a comma the venue is empty.
	            venue, _, year = infos[1].text.rpartition(',')
	            cited = row.find('a', {"class": "gsc_a_ac gs_ibl"}).text
	            papers_list.append({
	                'title': title,
	                'authors': infos[0].text,
	                'published_in': venue,
	                'year': toint(year),
	                'citations': toint(cited),
	            })
	        res['papers_list'] = papers_list

	    return res
No results found