unendin · August 15, 2017 16:38
diff --git a/ppdb_re.py b/ppdb_re.py
 '''
 Purpose-built regular expression to convert ppdb v2.0 entries into dicts

 PPDB is an automatically extracted database containing millions of paraphrases
 http://paraphrase.org/#/download

 Code uses enhanced regex module for convenience of groupdict 
 https://pypi.python.org/pypi/regex/

 The pattern matches every line of the XXXL-size, all-English PPDB pack.
 It has not been tested on other packs.
 '''

 import regex as re

 # Ordered regex fragments, one per field of a PPDB entry.  They are joined
 # with ' ?' and compiled into `ppdb_re` further down, so the order here is
 # the order the fields must appear in on a line.
 #
 # `{real}`, `{smallint}` and `{boolean}` are str.format placeholders that are
 # filled from `values`; the brace quantifiers (e.g. \d{1,5}) therefore live
 # in `values`, not in these fragments.
 # `(?<name>...)` is the named-group syntax of the `regex` module imported
 # above; each name becomes a key of `groupdict()`.
 features = [
    # --- header fields, separated by a literal `|||` ---
    # bracketed tag, captured as `pos`
    r'\[(?<pos>[A-Z#$:/.\\\-\']+)\]', 
    r'\|\|\|', 
    # source phrase: everything up to the space before the next `|||`
    r'(?<phrase>[^|]+) ', 
    r'\|\|\|', 
    # paraphrase: same shape as the source phrase
    r'(?<paraphrase>[^|]+) ', 
    r'\|\|\|', 
    # --- name=value feature pairs ---
    r'PPDB2\.0Score=(?<PPDB2_0Score>{real})',
    r'PPDB1\.0Score=(?<PPDB1_0Score>{real})',
    r'-logp\(LHS\|e1\)=(?<logp_LHS_e1>{real})',
    r'-logp\(LHS\|e2\)=(?<logp_LHS_e2>{real})',
    r'-logp\(e1\|LHS\)=(?<logp_e1_LHS>{real})',
    r'-logp\(e1\|e2\)=(?<logp_e1_e2>{real})',
    r'-logp\(e1\|e2,LHS\)=(?<logp_e1_e2_LHS>{real})',
    r'-logp\(e2\|LHS\)=(?<logp_e2_LHS>{real})',
    r'-logp\(e2\|e1\)=(?<logp_e2_e1>{real})',
    r'-logp\(e2\|e1,LHS\)=(?<logp_e2_e1_LHS>{real})',
    # optional: the whole pair may be absent (group is then None)
    r'(AGigaSim=(?<AGigaSim>{real}))?',
    r'Abstract=(?<Abstract>{boolean})',
    r'Adjacent=(?<Adjacent>{boolean})',
    r'CharCountDiff=(?<CharCountDiff>{smallint})',
    r'CharLogCR=(?<CharLogCR>{real})',
    r'ContainsX=(?<ContainsX>{boolean})',
    r'Equivalence=(?<Equivalence>{real})',
    r'Exclusion=(?<Exclusion>{real})',
    r'GlueRule=(?<GlueRule>{boolean})',
    # optional
    r'(GoogleNgramSim=(?<GoogleNgramSim>{real}))?',
    r'Identity=(?<Identity>{boolean})',
    r'Independent=(?<Independent>{real})',
    r'Lex\(e1\|e2\)=(?<Lex_e1_e2>{real})',
    r'Lex\(e2\|e1\)=(?<Lex_e2_e1>{real})',
    r'Lexical=(?<Lexical>{boolean})',
    r'LogCount=(?<LogCount>{real})',
    # value is either the literal NA or arbitrary text (lazy match)
    r'MVLSASim=(?<MVLSASim>NA|.*?)',
    r'Monotonic=(?<Monotonic>{boolean})',
    r'OtherRelated=(?<OtherRelated>{real})',
    r'PhrasePenalty=(?<PhrasePenalty>{boolean})',
    r'RarityPenalty=(?<RarityPenalty>{real})',
    # exactly one of the two entailment features is matched here;
    # the group of the other one stays None
    r'(ReverseEntailment=(?<ReverseEntailment>{real})|ForwardEntailment=(?<ForwardEntailment>{real}))',
    r'SourceTerminalsButNoTarget=(?<SourceTerminalsButNoTarget>{boolean})',
    r'SourceWords=(?<SourceWords>{smallint})',
    # optional
    r'(TargetComplexity=(?<TargetComplexity>{real}))?',
    # optional
    r'(TargetFormality=(?<TargetFormality>{real}))?',
    r'TargetTerminalsButNoSource=(?<TargetTerminalsButNoSource>{boolean})',
    r'TargetWords=(?<TargetWords>{smallint})',
    r'UnalignedSource=(?<UnalignedSource>{smallint})',
    r'UnalignedTarget=(?<UnalignedTarget>{smallint})',
    r'WordCountDiff=(?<WordCountDiff>{smallint})',
    r'WordLenDiff=(?<WordLenDiff>{real})',
    r'WordLogCR=(?<WordLogCR>{real})',
    # --- trailing fields ---
    r'\|\|\|',
    # `digit-digit...` field: matched but not captured
    # (looks like a word-alignment field -- TODO confirm)
    r'\d-\d[^|]*',
    r'\|\|\|', 
    # final alphabetic label (may be empty)
    r'(?<Semantics>[A-Za-z]*)'
    ]

 # Sub-patterns substituted for the {real}/{smallint}/{boolean} placeholders
 # used by the feature fragments.
 values = dict(
    # optional sign, up to 16 digits, optional dot, up to 16 digits
    # (every part is optional, so this also matches an empty string)
    real=r'-?\d{0,16}\.?\d{0,16}',
    # optional sign followed by one to five digits
    smallint=r'-?\d{1,5}',
    # a single 0 or 1
    boolean=r'[01]',
 )

 # Fill the placeholders of every fragment, then glue the fragments together
 # allowing at most one space between neighbouring fields.
 ppdb_re = re.compile(
    ' ?'.join(fragment.format(**values) for fragment in features)
 )

 '''
 then you can iterate through lines of ppdb v2.0 file
 retrieving feature-keyed dict for each phrase, like this:
 '''
 # The file name has to be a string: written bare, ppdb-2.0-s-all is parsed as
 # the arithmetic expression `ppdb - 2.0 - s - all` and raises NameError.
 with open('ppdb-2.0-s-all', 'r', encoding='utf-8') as f:
    for line in f:
        m = ppdb_re.search(line)
        if m is None:
            # search() returns None when the pattern does not fit the line
            # (e.g. a blank line); skip it instead of failing on
            # None.groupdict().
            continue
        # feature-keyed dict for this entry; optional features that are
        # absent from the line map to None
        data = m.groupdict()
	'''
	Purpose-built regular expression to convert ppdb v2.0 entries into dicts

	PPDB is an automatically extracted database containing millions of paraphrases
	http://paraphrase.org/#/download

	Code uses enhanced regex module for convenience of groupdict
	https://pypi.python.org/pypi/regex/

	The pattern matches every line of the XXXL-size, all-English PPDB pack.
	It has not been tested on other packs.
	'''

	import regex as re

	# Ordered regex fragments, one per field of a PPDB entry; they are joined
	# with ' ?' and compiled into `ppdb_re` below.
	#
	# Escaping repaired: a literal pipe is written `\|` (a doubled backslash
	# turns `\\|` into "backslash OR ..."), and the alternations in the
	# MVLSASim and Reverse/ForwardEntailment fragments use an unescaped `|`.
	#
	# `{real}`, `{smallint}` and `{boolean}` are str.format placeholders filled
	# from `values`; `(?<name>...)` is the named-group syntax of the `regex`
	# module imported above.
	features = [
	    # --- header fields, separated by a literal `|||` ---
	    r'\[(?<pos>[A-Z#$:/.\\\-\']+)\]',
	    r'\|\|\|',
	    r'(?<phrase>[^|]+) ',
	    r'\|\|\|',
	    r'(?<paraphrase>[^|]+) ',
	    r'\|\|\|',
	    # --- name=value feature pairs ---
	    r'PPDB2\.0Score=(?<PPDB2_0Score>{real})',
	    r'PPDB1\.0Score=(?<PPDB1_0Score>{real})',
	    r'-logp\(LHS\|e1\)=(?<logp_LHS_e1>{real})',
	    r'-logp\(LHS\|e2\)=(?<logp_LHS_e2>{real})',
	    r'-logp\(e1\|LHS\)=(?<logp_e1_LHS>{real})',
	    r'-logp\(e1\|e2\)=(?<logp_e1_e2>{real})',
	    r'-logp\(e1\|e2,LHS\)=(?<logp_e1_e2_LHS>{real})',
	    r'-logp\(e2\|LHS\)=(?<logp_e2_LHS>{real})',
	    r'-logp\(e2\|e1\)=(?<logp_e2_e1>{real})',
	    r'-logp\(e2\|e1,LHS\)=(?<logp_e2_e1_LHS>{real})',
	    # optional: the whole pair may be absent (group is then None)
	    r'(AGigaSim=(?<AGigaSim>{real}))?',
	    r'Abstract=(?<Abstract>{boolean})',
	    r'Adjacent=(?<Adjacent>{boolean})',
	    r'CharCountDiff=(?<CharCountDiff>{smallint})',
	    r'CharLogCR=(?<CharLogCR>{real})',
	    r'ContainsX=(?<ContainsX>{boolean})',
	    r'Equivalence=(?<Equivalence>{real})',
	    r'Exclusion=(?<Exclusion>{real})',
	    r'GlueRule=(?<GlueRule>{boolean})',
	    # optional
	    r'(GoogleNgramSim=(?<GoogleNgramSim>{real}))?',
	    r'Identity=(?<Identity>{boolean})',
	    r'Independent=(?<Independent>{real})',
	    r'Lex\(e1\|e2\)=(?<Lex_e1_e2>{real})',
	    r'Lex\(e2\|e1\)=(?<Lex_e2_e1>{real})',
	    r'Lexical=(?<Lexical>{boolean})',
	    r'LogCount=(?<LogCount>{real})',
	    # value is either the literal NA or arbitrary text (lazy match)
	    r'MVLSASim=(?<MVLSASim>NA|.*?)',
	    r'Monotonic=(?<Monotonic>{boolean})',
	    r'OtherRelated=(?<OtherRelated>{real})',
	    r'PhrasePenalty=(?<PhrasePenalty>{boolean})',
	    r'RarityPenalty=(?<RarityPenalty>{real})',
	    # exactly one of the two entailment features is matched here
	    r'(ReverseEntailment=(?<ReverseEntailment>{real})|ForwardEntailment=(?<ForwardEntailment>{real}))',
	    r'SourceTerminalsButNoTarget=(?<SourceTerminalsButNoTarget>{boolean})',
	    r'SourceWords=(?<SourceWords>{smallint})',
	    # optional
	    r'(TargetComplexity=(?<TargetComplexity>{real}))?',
	    # optional
	    r'(TargetFormality=(?<TargetFormality>{real}))?',
	    r'TargetTerminalsButNoSource=(?<TargetTerminalsButNoSource>{boolean})',
	    r'TargetWords=(?<TargetWords>{smallint})',
	    r'UnalignedSource=(?<UnalignedSource>{smallint})',
	    r'UnalignedTarget=(?<UnalignedTarget>{smallint})',
	    r'WordCountDiff=(?<WordCountDiff>{smallint})',
	    r'WordLenDiff=(?<WordLenDiff>{real})',
	    r'WordLogCR=(?<WordLogCR>{real})',
	    # --- trailing fields ---
	    r'\|\|\|',
	    # `digit-digit...` field: matched but not captured
	    r'\d-\d[^|]*',
	    r'\|\|\|',
	    # final alphabetic label (may be empty)
	    r'(?<Semantics>[A-Za-z]*)',
	]

	# Sub-patterns substituted for the {real}/{smallint}/{boolean} placeholders
	# used by the feature fragments.
	values = dict(
	    # optional sign, up to 16 digits, optional dot, up to 16 digits
	    # (every part is optional, so this also matches an empty string)
	    real=r'-?\d{0,16}\.?\d{0,16}',
	    # optional sign followed by one to five digits
	    smallint=r'-?\d{1,5}',
	    # a single 0 or 1
	    boolean=r'[01]',
	)

	# Fill the placeholders of every fragment, then glue the fragments together
	# allowing at most one space between neighbouring fields.
	ppdb_re = re.compile(
	    ' ?'.join(fragment.format(**values) for fragment in features)
	)

	'''
	then you can iterate through lines of ppdb v2.0 file
	retrieving feature-keyed dict for each phrase, like this:
	'''
	# The file name has to be a string: written bare, ppdb-2.0-s-all is parsed
	# as the arithmetic expression `ppdb - 2.0 - s - all` and raises NameError.
	# The bodies of `with` and `for` are indented again (the nesting had been
	# flattened).
	with open('ppdb-2.0-s-all', 'r', encoding='utf-8') as f:
	    for line in f:
	        m = ppdb_re.search(line)
	        if m is None:
	            # search() returns None when the pattern does not fit the
	            # line; skip it instead of failing on None.groupdict().
	            continue
	        # feature-keyed dict for this entry; optional features that are
	        # absent from the line map to None
	        data = m.groupdict()
No results found