Last active
August 28, 2018 16:37
-
-
Save thecolorblue/f9c23f8e107f6b03a5b22d168a18e293 to your computer and use it in GitHub Desktop.
nlp example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import unicode_literals, print_function | |
| import spacy | |
| import json | |
| from spacy.matcher import PhraseMatcher | |
| from spacy.tokens import Doc, Span, Token | |
| import gi | |
| gi.require_version('Gst', '1.0') | |
| from gi.repository import GObject, Gst | |
def _first_with_arc(nodes, arc):
    """Return the first node in *nodes* whose 'arc' equals *arc*, or None."""
    for node in nodes:
        if node['arc'] == arc:
            return node
    return None


def getActionAndTarget(root):
    """Print the action and target described by a dependency-tree node.

    *root* is a dict with the keys 'word' and 'modifiers'; every entry of
    'modifiers' is a dict with 'word', 'arc', 'POS_coarse' and 'modifiers'
    (presumably the node shape produced by spaCy's ``Doc.print_tree()``).

    The action is the node's word, followed by its particle ('prt') when it
    has one.  The target is the direct object ('dobj') if present, otherwise
    the particle's prepositional object ('pobj'), otherwise the empty string.

    Modifiers tagged as verbs are processed recursively *before* this node's
    own lines are printed, so nested commands appear first in the output.

    Returns None; the results are only printed.
    """
    modifiers = root['modifiers']
    particle = _first_with_arc(modifiers, 'prt')
    direct_object = _first_with_arc(modifiers, 'dobj')

    action = root['word']
    if particle is not None:
        action = action + ' ' + particle['word']

    target = ''
    if direct_object is not None:
        target = direct_object['word']
    elif particle is not None:
        # A particle may have no prepositional object (e.g. "Shut up");
        # fall back to an empty target instead of raising IndexError.
        prep_object = _first_with_arc(particle['modifiers'], 'pobj')
        if prep_object is not None:
            target = prep_object['word']

    for child in modifiers:
        if child['POS_coarse'] == 'VERB':
            getActionAndTarget(child)

    print('action: ', action)
    print('target: ', target)
# Ready-to-use pipelines keyed by their tuple of tech terms, so the spaCy model
# is loaded once per vocabulary instead of once per utterance.
_PIPELINE_CACHE = {}


def translate(text="Turn on the tv", *tech):
    """Parse *text* and print the action/target of each tree root.

    text -- the sentence(s) to analyse.
    tech -- optional phrases to recognise as tech entities; when none are
            given, "tv", "lights" and "speakers" are used.

    The pipeline is 'en_core_web_sm' with a TechRecognizer appended as the
    last component.  It is built on first use for a given set of phrases and
    then reused, because loading the model on every call is expensive.

    Prints the input text, then hands every entry of ``doc.print_tree()`` to
    getActionAndTarget().  Returns None.
    """
    terms = tuple(tech) if tech else ("tv", "lights", "speakers")

    nlp = _PIPELINE_CACHE.get(terms)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(TechRecognizer(nlp, terms), last=True)
        _PIPELINE_CACHE[terms] = nlp

    doc = nlp(text)
    tree = doc.print_tree()
    print('text: ', text)
    for root in tree:
        # Debug aid: print(json.dumps(tree, indent=2))
        getActionAndTarget(root)
| # print('chunks:', [chunk for chunk in doc.noun_chunks]) | |
| # print('ents: ', [ent for ent in doc.ents]) | |
| # for s in doc.sents: | |
| # print('types:', [t.dep_ for t in s]) | |
| # print('action:', s.root) | |
| # print('action head:', s.root.head) | |
| # print('action:', ) | |
| # print([t.text for t in s.root.children]) | |
| # print([(t.text, t.head.text) for t in s]) | |
| # print('root:', s.root) | |
| # print('right:', [t.text for t in s.root.rights]) | |
| # print('left:', [t.text for t in s.root.lefts.text]) | |
| # print('right edge:', s.root.right_edge) | |
| # print('left edge:', s.root.left_edge) | |
class TechRecognizer(object):
    """Pipeline component that marks known phrases in a Doc as entities.

    Calling an instance with a Doc runs a PhraseMatcher over it; every match
    is appended to ``doc.ents`` with the configured label, its tokens get the
    ``is_tech`` extension set to True, and the matched spans are merged.
    """

    name = 'tech'

    def __init__(self, nlp, companies=tuple(), label='TECH'):
        """Build the matcher and register the custom extensions.

        nlp       -- object providing ``vocab`` and callable on a string to
                     produce the pattern for that phrase.
        companies -- iterable of phrases to match.
        label     -- entity label name, looked up in ``nlp.vocab.strings``.
        """
        vocab = nlp.vocab
        self.label = vocab.strings[label]
        phrase_docs = [nlp(phrase) for phrase in companies]
        self.matcher = PhraseMatcher(vocab)
        self.matcher.add('TECH', None, *phrase_docs)
        # force=True lets the extensions be re-registered when several
        # recognizers are constructed in the same process.
        Token.set_extension('is_tech', default=False, force=True)
        for container in (Doc, Span):
            container.set_extension('has_tech', getter=self.has_tech, force=True)

    def __call__(self, doc):
        """Annotate *doc* with the matched phrases and return it."""
        found = []
        for _match_id, begin, stop in self.matcher(doc):
            entity = Span(doc, begin, stop, label=self.label)
            found.append(entity)
            for token in entity:
                token._.set('is_tech', True)
            # Extend the existing entities rather than replacing them.
            current = list(doc.ents)
            current.append(entity)
            doc.ents = current
        # Merge only after every entity has been recorded.
        for entity in found:
            entity.merge()
        return doc

    def has_tech(self, tokens):
        """Return True when any token in *tokens* has ``is_tech`` set."""
        flags = [tok._.get('is_tech') for tok in tokens]
        return any(flags)
def bus_message(bus, message):
    """Bus "message" handler: pass deepspeech transcriptions to translate().

    bus     -- the bus that emitted the message (unused).
    message -- object whose ``get_structure()`` returns either a falsy value
               or a structure offering ``get_name()`` and ``get_value()``.

    Only structures named "deepspeech" are handled; their "text" value is
    converted to a text string and passed to translate().  A missing text
    value is ignored.  Always returns True.
    """
    structure = message.get_structure()
    if not structure or structure.get_name() != "deepspeech":
        return True

    text = structure.get_value("text")
    if text is None:
        return True

    text_type = type(u'')
    if isinstance(text, bytes):
        # Python 2 byte strings: decode explicitly as UTF-8 instead of relying
        # on the implicit ASCII codec, which fails on non-ASCII transcriptions.
        text = text.decode('utf-8', 'replace')
    elif not isinstance(text, text_type):
        text = text_type(text)

    translate(text)
    return True
if __name__ == "__main__":
    # Listen on the PulseAudio source, run the audio through the deepspeech
    # element and forward every bus message to bus_message().
    #
    # Example sentences from the original notes.  They were written as calls
    # to main(), which is not defined in this file; presumably translate()
    # was meant:
    #   translate('Sarah, can you please turn on the tv for our meeting')
    #   translate('The lights are a little bright. Can you turn off the lights?')
    #   translate('Mute the call')
    #   translate('Start up the room, turn on the lights, and bake a cake and muffins.')
    GObject.threads_init()
    Gst.init(None)
    loop = GObject.MainLoop()
    pipeline = Gst.parse_launch(
        "pulsesrc ! audioconvert ! audiorate ! audioresample ! "
        "deepspeech silence-threshold=0.3 silence-length=20 ! fakesink")
    bus = pipeline.get_bus()
    bus.add_signal_watch()
    bus.connect("message", bus_message)
    pipeline.set_state(Gst.State.PLAYING)
    try:
        loop.run()
    finally:
        # Release the audio device and the pipeline's resources however the
        # main loop ends (normal quit, Ctrl-C or an error).
        bus.remove_signal_watch()
        pipeline.set_state(Gst.State.NULL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment