Last active
December 14, 2020 11:34
-
-
Save songmw90/d016e8084624fa544bef to your computer and use it in GitHub Desktop.
Simple word count
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # Author : Myeong-Uk ([email protected]) | |
| # Date : 2015. 03. 24 | |
| # Desc : Count the frequency of alphabetic words | |
| # Command : <python wordcount.py filename> e.g. rfc3615.txt | |
| # OUTPUT : filename_word_count.out e.g. rfc3615.txt_word_count.out | |
| import sys,re,string,operator,csv | |
def read_file(filename):
    """Read *filename* and return its contents as a list of words.

    The file is opened in text mode and read in one go; the raw text is
    then passed through ``prepare_raw_text``, so the result is a list of
    lowercase, letters-only words.

    Raises whatever ``open`` raises when the file cannot be opened.
    """
    # The context manager closes the handle even if reading or the
    # normalisation step raises, which the manual close() did not.
    with open(filename, 'r') as fp:
        return prepare_raw_text(fp.read())
def prepare_raw_text(rawtext):
    """Normalise *rawtext* into a list of lowercase alphabetic words.

    Every run of characters outside A-Z / a-z is collapsed into a single
    space, the result is lowercased, and the string is split on
    whitespace. Input with no ASCII letters yields an empty list.
    """
    # Anything that is not an ASCII letter acts as a word separator.
    letters_only = re.sub('[^A-Za-z]+', ' ', rawtext)
    lowered = letters_only.lower()
    return lowered.split()
def group_words(words):
    """Count how often each word occurs in *words*.

    Returns a list of ``(word, count)`` tuples sorted by count, highest
    first. Words with equal counts are left in the dictionary's
    iteration order. An empty input yields an empty list.
    """
    # Named `counts` rather than `dict` so the builtin is not shadowed.
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    # Sort on the count (tuple index 1), most frequent first.
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
def record_frequency(filename, list):
    """Write word frequencies to ``<filename>_word_count.out``.

    *list* is a sequence of items whose first element is the word and
    whose second element is its count; each is written as one line of
    the form ``word count``. An existing output file is overwritten.

    The parameter name ``list`` shadows the builtin but is kept so that
    callers passing it by keyword keep working.
    """
    # The context manager closes the output file even if a write fails.
    with open(filename + "_word_count.out", "w") as fp:
        for item in list:
            fp.write(item[0] + " " + str(item[1]) + "\n")
if __name__ == '__main__':
    # Fail with a usage hint instead of an IndexError traceback when the
    # input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python wordcount.py filename")
    filename = sys.argv[1]
    words = read_file(filename)
    # Named `frequencies` rather than `list` so the builtin is not shadowed.
    frequencies = group_words(words)
    record_frequency(filename, frequencies)
    # The parenthesised single-argument form prints the same text on
    # Python 2 and Python 3; the bare print statement is Python-2-only.
    print("done!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment