Skip to content

Instantly share code, notes, and snippets.

@jcantrell-trulia
Created June 10, 2013 07:07
Show Gist options
  • Select an option

  • Save jcantrell-trulia/5747016 to your computer and use it in GitHub Desktop.

Select an option

Save jcantrell-trulia/5747016 to your computer and use it in GitHub Desktop.
Learning about the best ways to find file names given a partial match. So regex =bad, .find = okay-ish, PartialTrie = awesome but unholy ram usuage.
import re
import timeit
global cache_list
class PartialTrie:
def __init__(self):
self.leaf = False
self.tries = {}
self.data = []
def get(self, key):
if key == '':
return self.data
if key[0] in self.tries:
return self.tries[key[0]].get(key[1:])
else:
return []
def add(self, key, data):
if key[0] in self.tries:
pass
else:
self.tries[key[0]] = PartialTrie()
self.tries[key[0]].addData(data)
if len(key) > 1:
self.tries[key[0]].add(key[1:], data)
def addData(self, data):
self.data.append(data)
#really dumb way to do this
def regex_search(needle):
regex=re.compile(".*(%s).*" % needle)
return [m.group(0) for l in cache_list for m in [regex.search(l)] if m]
#simple way
def simple_search(needle):
return [s for s in cache_list if s.find(needle) != -1]
def trie_search(needle):
return root.get(needle)
if __name__ == "__main__":
global cache_list
global root
f = open('files.txt', 'r')
cache_list = f.read().splitlines()
print 'Building Partial Trie'
root = PartialTrie()
for f in cache_list:
parts = f.split('/')
parts.remove('.')
path = parts.pop()
add = True
for x in range(0, 1):
for y in range(0, len(path)):
root.add(path[y:], f);
if len(parts):
path = '%s/%s' % (parts.pop(), path)
else:
break
#print 'Testing regex search'
#print timeit.timeit("regex_search('test')", "from __main__ import regex_search", number=50)
print 'Simple output'
print '******************************************************'
print simple_search('test')
print "\nTrie output"
print '******************************************************'
print trie_search('test')
print 'Testing simple search'
print timeit.timeit("simple_search('test')", "from __main__ import simple_search", number=250)
print 'Testing trie search'
print timeit.timeit("trie_search('test')", "from __main__ import trie_search", number=330000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment