Skip to content

Instantly share code, notes, and snippets.

@rflamary
Created July 26, 2019 14:13
Show Gist options
  • Select an option

  • Save rflamary/4d92e7c03657523584783a2d409b4e3f to your computer and use it in GitHub Desktop.

Select an option

Save rflamary/4d92e7c03657523584783a2d409b4e3f to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib
def parse_scholar_page(url, paper_list=False):
    """Download a scholar profile page and extract its citation statistics.

    The page at *url* is fetched with ``urllib.request.urlopen`` and parsed
    with BeautifulSoup's ``html.parser``. The element ids/classes looked up
    (``gsc_prf_in``, ``gsc_rsb_st``, ...) are presumably those of a Google
    Scholar profile page -- verify against the live markup.

    Args:
        url: Address of the profile page to download.
        paper_list: When true, also extract one entry per ``tr.gsc_a_tr``
            row and return them under the ``'papers_list'`` key.

    Returns:
        A dict with the keys ``'name'`` (str), ``'citations'`` (int),
        ``'h_index'`` (int), ``'years_list'`` (list of int) and
        ``'citations_list'`` (list of int). If *paper_list* is true it also
        has ``'papers_list'``: a list of dicts with the keys ``'title'``,
        ``'authors'``, ``'published_in'``, ``'year'`` and ``'citations'``.
        Any numeric field whose text is not a valid integer is reported as 0.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError: If an expected element is missing from the page
            (``find`` returns ``None`` in that case).
        IndexError: If the statistics table or a paper row has fewer
            entries than expected.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so it
    # is imported explicitly here to keep this function self-contained.
    import urllib.request

    def toint(v):
        """Return ``int(v)``, or 0 when *v* is not a valid integer literal."""
        try:
            return int(v)
        except ValueError:
            # Empty or non-numeric cell text is treated as zero.
            return 0

    # Read the whole body, then release the connection right away.
    with urllib.request.urlopen(url) as response:
        web_content = response.read()
    soup = BeautifulSoup(web_content, 'html.parser')

    #%% Citations stats
    name = soup.find('div', {"id": "gsc_prf_in"}).text
    stats_rows = soup.find('table', {"id": "gsc_rsb_st"}).find_all('tr')
    # Rows 1 and 2 of the stats table are read as citations and h-index;
    # only the first ``gsc_rsb_std`` cell of each row is used.
    citations = toint(stats_rows[1].find('td', {"class": "gsc_rsb_std"}).text)
    h_index = toint(stats_rows[2].find('td', {"class": "gsc_rsb_std"}).text)

    #%% citation graph
    years_list = [
        toint(span.text) for span in soup.find_all("span", {"class": "gsc_g_t"})
    ]
    citations_list = [
        toint(span.text) for span in soup.find_all("span", {"class": "gsc_g_al"})
    ]

    res = {
        'name': name,
        'citations': citations,
        'h_index': h_index,
        'years_list': years_list,
        'citations_list': citations_list,
    }

    #%% papers list
    if paper_list:
        papers_list = []
        for row in soup.find_all("tr", {"class": "gsc_a_tr"}):
            title = row.find('a', {"class": "gsc_a_at"}).text
            infos = row.find_all('div', {"class": "gs_gray"})
            # The second info line is split at its last comma: everything
            # before it is the venue, the remainder is parsed as the year.
            venue, _, year_text = infos[1].text.rpartition(',')
            papers_list.append({
                'title': title,
                'authors': infos[0].text,
                'published_in': venue,
                'year': toint(year_text),
                'citations': toint(
                    row.find('a', {"class": "gsc_a_ac gs_ibl"}).text
                ),
            })
        res['papers_list'] = papers_list

    return res
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment