Skip to content

Instantly share code, notes, and snippets.

@logicx24
Last active August 29, 2015 14:18
Show Gist options
  • Select an option

  • Save logicx24/b70681bb76c028744995 to your computer and use it in GitHub Desktop.

Select an option

Save logicx24/b70681bb76c028744995 to your computer and use it in GitHub Desktop.
from sys import argv
import os
def inputFunc():
    """Return the text to analyse, from a file argument or a prompt.

    When a command-line argument is present, ``argv[1]`` is treated as a
    path and the contents of that file are returned. Without an argument
    the user is prompted with ``"Enter text > "``.

    Returns:
        The input text as a string.

    Raises:
        SystemExit: with status 1 when ``argv[1]`` is not an existing file.
    """
    if len(argv) < 2:
        return input("Enter text > ")

    path = argv[1]
    if not os.path.isfile(path):
        # There is no text to return; stop explicitly instead of falling
        # through to a ``return`` of a variable that was never assigned.
        print('not a file')
        raise SystemExit(1)

    # The context manager guarantees the handle is closed after reading.
    with open(path) as source:
        return source.read()
def splitIntoSentences(text):
    """Split *text* into lower-cased sentences without their terminators.

    A sentence ends at any ``?`` or ``!``. A ``.`` ends a sentence only when
    it is the last character of the text or is followed by a single space
    and an upper-case character, so periods inside abbreviations such as
    ``e.g.`` stay part of the sentence.

    All whitespace runs (including newlines) are collapsed to one space
    before scanning. Empty sentences (for example between ``?!``) are
    skipped, and any text after the last terminator is returned as a final
    sentence.

    Args:
        text: The raw input string.

    Returns:
        A list of sentence strings, in order of appearance; empty when
        *text* contains no words.
    """
    # Normalising whitespace keeps paragraph breaks from gluing two
    # sentences together and makes the "period, space, capital" test
    # reliable.
    text = " ".join(text.split())
    lastIndex = len(text) - 1

    sentenceList = []
    start = 0
    for ind, char in enumerate(text):
        if char not in ".?!":
            continue
        if char == ".":
            atEnd = ind == lastIndex
            # Slices never raise, so no explicit bounds checks are needed.
            beforeCapital = (
                text[ind + 1:ind + 2] == " "
                and text[ind + 2:ind + 3].isupper()
            )
            if not (atEnd or beforeCapital):
                continue
        # Slice from the untouched text so indices never drift.
        sentence = text[start:ind].strip().lower()
        if sentence:
            sentenceList.append(sentence)
        start = ind + 1

    # Keep an unterminated trailing sentence instead of dropping its words.
    remainder = text[start:].strip().lower()
    if remainder:
        sentenceList.append(remainder)
    return sentenceList
def generateWordTuples(sentenceList):
    """Pair every word with the 1-based number of the sentence it is in.

    Each sentence is stripped of every character that is not alphanumeric,
    a period, or a space, and is then split on whitespace.

    Args:
        sentenceList: Sentence strings in document order. The list is not
            modified.

    Returns:
        A list of ``(sentence_number, word)`` tuples in reading order.
    """
    wordTups = []
    for number, sentence in enumerate(sentenceList, start=1):
        # Periods are kept so abbreviations such as "e.g." stay one word.
        cleaned = ''.join(
            char for char in sentence if char.isalnum() or char in '. '
        )
        wordTups.extend((number, word) for word in cleaned.split())
    return wordTups
def frequencyCount(wordTups):
    """Tally word occurrences and record the sentences each word is in.

    Args:
        wordTups: Sequence of ``(sentence_number, word)`` pairs.

    Returns:
        A ``(wordFrequency, wordToSentences)`` pair of dicts: the first maps
        each word to its occurrence count, the second maps each word to the
        list of sentence numbers it was seen in (one entry per occurrence,
        in input order).
    """
    counts = {}
    locations = {}
    for entry in wordTups:
        sentenceNumber, word = entry[0], entry[1]
        counts[word] = counts.get(word, 0) + 1
        locations.setdefault(word, []).append(sentenceNumber)
    return counts, locations
def sortAndWrite(wordFrequency, wordToSentences):
    """Build the report rows, one per word, in alphabetical order.

    Despite the name, nothing is written here; the rows are returned for
    the caller to output.

    Each row has the form ``"<n>. <word> {<count>:<s1>,<s2>,...}"`` where
    ``n`` is the 1-based position in the sorted word list and ``s1, s2, ...``
    are the entries recorded for that word in *wordToSentences*.

    Args:
        wordFrequency: Mapping of word to occurrence count.
        wordToSentences: Mapping of word to a list of sentence numbers.

    Returns:
        A list of formatted row strings.

    Raises:
        KeyError: if a word in *wordFrequency* is missing from
            *wordToSentences*.
    """
    rows = []
    for position, word in enumerate(sorted(wordFrequency), start=1):
        # Join the numbers directly rather than editing the list's repr.
        sentences = ",".join(str(number) for number in wordToSentences[word])
        rows.append(
            "{0}. {1} {{{2}:{3}}}".format(
                position, word, wordFrequency[word], sentences
            )
        )
    return rows
def writeToFile(lst):
    """Write the given rows to ``count.txt`` in the working directory.

    The rows are joined with newlines (no trailing newline) and any
    existing ``count.txt`` is overwritten.

    Args:
        lst: Iterable of strings, one per output line.
    """
    with open('count.txt', 'w') as handle:
        payload = "\n".join(lst)
        handle.write(payload)
def main():
    """Run the pipeline: read text, count words, write ``count.txt``."""
    rawText = inputFunc()
    sentences = splitIntoSentences(rawText)
    wordTuples = generateWordTuples(sentences)
    wordFrequency, wordToSentences = frequencyCount(wordTuples)
    rows = sortAndWrite(wordFrequency, wordToSentences)
    writeToFile(rows)
    print("Written to file count.txt in this directory.")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment