#Resume Phrase Matcher code

#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher
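# Note (not in the original gist): the en_core_web_sm import above assumes the small
# English spaCy model is already installed; if it is not, it can typically be fetched with:
#     python -m spacy download en_core_web_sm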
#Function to read resumes from the folder one by one
mypath = 'D:/NLP_Resume/Candidate Resume'  #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

def pdfextract(file):
    fileReader = PyPDF2.PdfFileReader(open(file, 'rb'))
    countpage = fileReader.getNumPages()
    count = 0
    text = []
    while count < countpage:
        pageObj = fileReader.getPage(count)
        count += 1
        t = pageObj.extractText()
        print(t)
        text.append(t)
    return text

#function to read resume ends
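# Optional alternative, not part of the original gist: PdfFileReader, getNumPages, getPage
# and extractText were removed in PyPDF2 3.x / pypdf. On a newer install, a roughly
# equivalent extractor could look like the sketch below; it is only defined when pypdf
# is importable, so the original script keeps working unchanged.
try:
    from pypdf import PdfReader

    def pdfextract_pypdf(file):
        reader = PdfReader(file)       # newer reader object, takes a path or file object
        text = []
        for page in reader.pages:      # iterate pages directly instead of getPage(i)
            t = page.extract_text()    # replaces the old extractText()
            print(t)
            text.append(t)
        return text
except ImportError:
    pass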
#function that does phrase matching and builds a candidate profile
def create_profile(file):
    text = pdfextract(file)
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
    stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis=0)]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis=0)]
    ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)]
    DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis=0)]
    python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)]
    Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
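    # Note, not in the original gist: the positional-None calls above use the spaCy 2.x
    # PhraseMatcher API. On spaCy 3.x the patterns are passed as a list and on_match is
    # keyword-only, e.g.:
    #     matcher.add('Stats', stats_words)
    #     matcher.add('NLP', NLP_words)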
    doc = nlp(text)

    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the string name of the matched rule, e.g. 'Stats'
        span = doc[start:end]                  # get the matched slice of the doc
        d.append((rule_id, span.text))
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())

    ## converting the string of keywords to a dataframe
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(), columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(), columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])

    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis=1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace=True)
    return dataf

#function ends
#code to execute/call the above functions
final_database = pd.DataFrame()
i = 0
while i < len(onlyfiles):
    file = onlyfiles[i]
    dat = create_profile(file)
    final_database = final_database.append(dat)
    i += 1
print(final_database)
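# Note, not in the original gist: DataFrame.append was removed in pandas 2.0. On newer
# pandas, the same table can be built by collecting the per-resume frames and
# concatenating them once, e.g.:
#     frames = [create_profile(f) for f in onlyfiles]
#     final_database = pd.concat(frames)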
#code to count words under each category and visualize it through matplotlib
final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace=True)
final_database2.fillna(0, inplace=True)
new_data = final_database2.iloc[:, 1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2 = new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25, 7), stacked=True)
labels = []
for j in new_data.columns:
    for i in new_data.index:
        label = str(j) + ": " + str(new_data.loc[i][j])  #label each bar segment as "category: count"
        labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
    width = rect.get_width()
    if width > 0:
        x = rect.get_x()
        y = rect.get_y()
        height = rect.get_height()
        ax.text(x + width/2., y + height/2., label, ha='center', va='center')
plt.show()
OSError: Expected file path name or file-like object, got <class 'bytes'> type
I'm getting this error while reading the csv file :(
Hi Mili, did you manage to solve it? I got the same error as well and got stuck. TT
Hey! I'm sorry... you might be using Windows! I heard to use utf-8 encoding if it is an encoding issue, but it didn't work for me; you can still try!
Good luck :)
Nope, I'm using a Mac. Anyway, thanks for replying! You too! :]
Hi Mili and hueyyi, did you manage to solve it? I am getting a utf-8 encoding issue. Please let me know the solution. Many thanks in advance.
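A suggestion for the OSError above, offered only as a guess: that message is what pandas raises when read_csv is handed bytes rather than a path string or a text buffer. A minimal thing to try, assuming the keyword csv sits at the path used in the gist, is to pass the path as a plain string with an explicit encoding:

    import pandas as pd

    # pass the path as a plain string and be explicit about the encoding
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv', encoding='utf-8')
    # if utf-8 still fails (e.g. a csv exported from Excel), latin-1 will often read it
    # keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv', encoding='latin-1')

This is a sketch of one possible cause, not a confirmed fix.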
Working perfectly for me
PDF starts with '{
"c', but '%PDF-' expected
Any idea how to solve this?
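A hedged guess: that message means PyPDF2 opened a file whose first bytes look like JSON ('{ "c...') rather than a real PDF ('%PDF-'). Because the script reads every file in mypath, a stray non-PDF file in the resume folder will trigger it. One workaround is to keep only .pdf files in the list, for example:

    onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath)
                 if os.path.isfile(os.path.join(mypath, f)) and f.lower().endswith('.pdf')]

If the file really is supposed to be a PDF, re-exporting or re-downloading it may be needed.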