Last active
June 3, 2025 19:19
-
-
Save alexfriant/3d1bc1102d770361d21e66850a1146aa to your computer and use it in GitHub Desktop.
This Python script will provide a visual summary of alphanumeric patterns which exist in a list of values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ##################################################################################### | |
| # | |
| # This script will provide you a basic understanding of the alphanumeric patterns | |
| # which exist in a list. You might get this list from a SQL query or something like | |
| # that. | |
| # | |
| # INPUT: Give this script a file that has a single column of ID type strings. | |
| # EXAMPLES: | |
| # (from windows command line): | |
| # > python patternEyes.py "c:\temp\id_list.txt" | |
| # | |
| # (from Python Console in PyCharm): | |
| # >>> from patternEyes import * | |
| # >>> patternEyes(r"C:\LocalData\temp\parcels.csv") | |
| # | |
| # OUTPUT: A printed report in which digits are converted to "#" and alphabetic | |
| # characters to "X". All punctuation is left unchanged. | |
| # | |
| # For example, if you want to see if all records are phone numbers, you might expect | |
| # to see something like this: | |
| # ###-###-#### | |
| # But if you also see something like this, you know the data isn't as "clean" as | |
| # you were hoping, requiring further investigation: | |
| # ##-XXX-###### | |
| # Spaces are now displayed as '·', so something with spaces might look like this: | |
| # ###·XXXX·· | |
| # | |
| ##################################################################################### | |
| import re, os.path, sys | |
| from collections import defaultdict | |
| from pathlib import Path | |
def patternEyes(filePath=r'H:\IGS\GIS\GISADMIN\Parcel Tax System Update\Automation Integration\sql\PCSalesComp_research\all_docnum_values.txt'):
    """Print a ranked summary of the alphanumeric patterns found in a file.

    Every line of the file is split on commas. In each resulting value,
    spaces become '·', digits become '#' and letters become 'X'; every
    other character is left untouched. Identical patterns are counted and
    printed most-frequent first. Empty values are reported as 'No Data'.

    Args:
        filePath: Path to the text file to analyse.

    Returns:
        None. The report, or a "no file here" message when the path is
        not an existing file, is written to stdout.
    """
    # Function-scope import: Counter is not among the file-level imports.
    from collections import Counter

    input_file = filePath
    if not os.path.isfile(input_file):
        print("\nSorry, there is no file here: {}".format(input_file))
        return

    space_re = re.compile(r' ')
    digit_re = re.compile(r'\d')
    alpha_re = re.compile(r'[a-z]', re.IGNORECASE)

    pattern_counts = Counter()
    # 'with' guarantees the handle is closed even if reading fails part-way.
    with open(input_file, 'r') as handle:
        for line in handle:
            for value in line.strip('\n').split(','):
                masked = space_re.sub('·', value)  # make spaces visible as '·'
                masked = digit_re.sub('#', masked)
                masked = alpha_re.sub('X', masked)
                # An empty value has no pattern; tally it under a readable label.
                pattern_counts[masked or 'No Data'] += 1

    print("\nREPORT FOR: {}".format(Path(input_file).resolve()))
    print("\n{0:40} | {1:10}".format("PATTERN", "COUNT"))
    print("-" * 50)
    # most_common() sorts by count, descending; ties keep first-seen order.
    for pattern, count in pattern_counts.most_common():
        print("{0:40} | {1:10}".format(pattern, str(count)))
def main(inputs):
    """Run patternEyes on the path given in ``inputs``, if there is one.

    Args:
        inputs: Argument list laid out like ``sys.argv``. Element 0 is
            not read; element 1, when present, is the file to analyse.
            With no element 1, patternEyes runs with its default path.
    """
    # Nothing beyond the first element: fall back to the built-in default.
    if len(inputs) <= 1:
        patternEyes()
        return

    target = inputs[1]
    print('okay, gonna try and run this: ' + target)
    patternEyes(target)
# Script entry point: hand the raw command-line arguments to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment