Last active
December 10, 2019 11:18
-
-
Save light-bringer/eaf6ab769b1f61c32331c00a6b14f9fd to your computer and use it in GitHub Desktop.
Python 3 program to count number of words in txt or doc or docx files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Have a python3 setup ready! | |
| Install python-docx from pip (it provides the `docx` module) | |
| ~/Desktop/awesome-performance-test-framework master ✗ 6h8m ✖ ⚑ ◒ | |
| ▶ pip install --pre python-docx | |
| Collecting python-docx | |
| Downloading https://files.pythonhosted.org/packages/e4/83/c66a1934ed5ed8ab1dbb9931f1779079f8bca0f6bbc5793c06c4b5e7d671/python-docx-0.8.10.tar.gz (5.5MB) | |
| |████████████████████████████████| 5.5MB 9.2MB/s | |
| Requirement already satisfied: lxml>=2.3.2 in ./venv/lib/python3.7/site-packages (from python-docx) (4.4.2) | |
| Building wheels for collected packages: python-docx | |
| WARNING: Building wheel for python-docx failed: [Errno 13] Permission denied: '/Users/efi/Library/Caches/pip/wheels/18' | |
| Failed to build python-docx | |
| Installing collected packages: python-docx | |
| Running setup.py install for python-docx ... done | |
| Successfully installed python-docx-0.8.10 | |
| Kindly try to provide full paths as a parameter to the Script. | |
| Example - /home/debaprid/example | |
| UNSUCCESSFUL RUN : | |
| ▶ python configspec/test.py | |
| Provide directory path eg: | |
| ./ /home/debaprid/example | |
| # Successful RUN : | |
| ~/Desktop/awesome-performance-test-framework master ✗ 6h10m ✖ ⚑ ◒ | |
| ▶ python configspec/test.py configspec/test | |
| configspec/test [] ['1.txt', '2.txt'] | |
| ['configspec/test/1.txt', 'configspec/test/2.txt'] | |
| WordCount for configspec/test/1.txt : 3 | |
| WordCount for configspec/test/2.txt : 3 | |
| (venv) | |
| ~/Desktop/awesome-performance-test-framework master ✗ 6h11m ✖ ⚑ ◒ | |
| ▶ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import sys | |
| import docx | |
| import re | |
def get_all_files(path):
    """Recursively collect the text/Word files found under *path*.

    Walks the directory tree rooted at ``path`` with ``os.walk`` and keeps
    every file whose extension is ``.txt``, ``.doc`` or ``.docx``. The
    comparison is case-insensitive, so ``REPORT.DOCX`` is kept as well.

    Args:
        path: Directory to search.

    Returns:
        A list of file paths, each built by joining the walked directory
        with the file name. Empty when nothing matches.
    """
    wanted_extensions = {'.txt', '.doc', '.docx'}
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(path):
        # Trace of every visited directory (part of the script's output).
        print(r, d, f)
        for file in f:
            # Compare the real extension; the previous
            # `'.txt' or '.doc' or '.docx' in file` test was always true
            # because the non-empty literal '.txt' is truthy.
            if os.path.splitext(file)[1].lower() in wanted_extensions:
                files.append(os.path.join(r, file))
    return files
def count_docx(file_name):
    """Count the words in a Word document.

    Opens ``file_name`` with python-docx (``docx.Document``), joins the
    text of the document's paragraphs with newlines and counts the runs
    of word characters in the result.

    Args:
        file_name: Path of the document to read.

    Returns:
        The number of words found, or -1 if the document could not be
        opened (a message is printed in that case).
    """
    try:
        document = docx.Document(file_name)
    except Exception:
        # Best-effort boundary: report the failure and return the -1
        # sentinel callers already expect instead of aborting the run.
        # NOTE(review): the main script also routes '.doc' files here;
        # python-docx targets .docx packages, so legacy binary .doc files
        # will presumably end up on this path - confirm.
        print('Cannot open file to read.')
        return -1
    # Paragraph text is already `str`; encoding it to bytes (as the old
    # code did) makes '\n'.join() raise TypeError on Python 3.
    # NOTE(review): only `document.paragraphs` is read; text held in other
    # containers (e.g. table cells) is presumably not included - confirm
    # whether it should be counted.
    text = '\n'.join(paragraph.text for paragraph in document.paragraphs)
    return len(re.findall(r'\w+', text))
def count_txt(file_name):
    """Count the words in a plain-text file.

    Reads ``file_name`` line by line and counts the runs of word
    characters on each line.

    Args:
        file_name: Path of the text file to read.

    Returns:
        The total number of words, or -1 if the file could not be opened
        or decoded (a message is printed in that case).
    """
    word_pattern = re.compile(r'\w+')
    try:
        # The context manager closes the handle even if reading fails;
        # the previous version never closed the file.
        with open(file_name) as document:
            # Iterating the file streams it line by line, so large files
            # are never loaded into memory at once.
            return sum(len(word_pattern.findall(line)) for line in document)
    except (OSError, UnicodeDecodeError):
        print('Cannot open file to read')
        return -1
if __name__ == '__main__':
    # Script entry point: print the word count of every supported file
    # found under the directory given as the first command-line argument.

    # Maps a lower-cased extension (without the dot) to its counter.
    extensions = {
        'txt': count_txt,
        'docx': count_docx,
        'doc': count_docx,
    }

    # Only a missing argument is a usage error. The previous blanket
    # `except:` around the whole run reported *every* failure (including
    # a KeyError for an unsupported extension) as a usage problem.
    if len(sys.argv) < 2:
        print("Provide directory path eg:\n./ /home/debaprid/example")
        sys.exit(1)

    all_files = get_all_files(sys.argv[1])
    print(all_files)
    for new_file in all_files:
        file_extension = os.path.splitext(new_file)[1].lower().lstrip('.')
        counter = extensions.get(file_extension)
        if counter is None:
            # No counter registered for this file type; skip it instead
            # of aborting the remaining files.
            continue
        print("WordCount for {0} : {1}".format(new_file, counter(new_file)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment