Created
June 10, 2013 07:07
-
-
Save jcantrell-trulia/5747016 to your computer and use it in GitHub Desktop.
Learning about the best ways to find file names given a partial match. So regex =bad, .find = okay-ish, PartialTrie = awesome but unholy ram usuage.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import timeit | |
| global cache_list | |
| class PartialTrie: | |
| def __init__(self): | |
| self.leaf = False | |
| self.tries = {} | |
| self.data = [] | |
| def get(self, key): | |
| if key == '': | |
| return self.data | |
| if key[0] in self.tries: | |
| return self.tries[key[0]].get(key[1:]) | |
| else: | |
| return [] | |
| def add(self, key, data): | |
| if key[0] in self.tries: | |
| pass | |
| else: | |
| self.tries[key[0]] = PartialTrie() | |
| self.tries[key[0]].addData(data) | |
| if len(key) > 1: | |
| self.tries[key[0]].add(key[1:], data) | |
| def addData(self, data): | |
| self.data.append(data) | |
| #really dumb way to do this | |
| def regex_search(needle): | |
| regex=re.compile(".*(%s).*" % needle) | |
| return [m.group(0) for l in cache_list for m in [regex.search(l)] if m] | |
| #simple way | |
| def simple_search(needle): | |
| return [s for s in cache_list if s.find(needle) != -1] | |
| def trie_search(needle): | |
| return root.get(needle) | |
| if __name__ == "__main__": | |
| global cache_list | |
| global root | |
| f = open('files.txt', 'r') | |
| cache_list = f.read().splitlines() | |
| print 'Building Partial Trie' | |
| root = PartialTrie() | |
| for f in cache_list: | |
| parts = f.split('/') | |
| parts.remove('.') | |
| path = parts.pop() | |
| add = True | |
| for x in range(0, 1): | |
| for y in range(0, len(path)): | |
| root.add(path[y:], f); | |
| if len(parts): | |
| path = '%s/%s' % (parts.pop(), path) | |
| else: | |
| break | |
| #print 'Testing regex search' | |
| #print timeit.timeit("regex_search('test')", "from __main__ import regex_search", number=50) | |
| print 'Simple output' | |
| print '******************************************************' | |
| print simple_search('test') | |
| print "\nTrie output" | |
| print '******************************************************' | |
| print trie_search('test') | |
| print 'Testing simple search' | |
| print timeit.timeit("simple_search('test')", "from __main__ import simple_search", number=250) | |
| print 'Testing trie search' | |
| print timeit.timeit("trie_search('test')", "from __main__ import trie_search", number=330000) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment