Source code for pepper.language.utils.helper_functions

from __future__ import unicode_literals

import json
import os
import sys
import traceback
import urllib
import urllib2

from nltk import pos_tag
from nltk import tree as ntree
from nltk.stem import WordNetLemmatizer

import wordnet_utils as wu
from pepper import logger

LOG = logger.getChild(__name__)

# Shared lemmatizer instance used by lemmatize().
wnl = WordNetLemmatizer()

# Package root (one level above this utils module).
ROOT = os.path.join(os.path.dirname(__file__), '..')

# Load the shared lexicon once at import time.
# fix: use a with-block so the file handle is closed (the original
# called json.load(open(...)) and leaked the descriptor).
with open(os.path.join(ROOT, 'data', 'lexicon.json')) as _lexicon_file:
    lexicon = json.load(_lexicon_file)

def trim_dash(rdf):
    """Remove one leading and one trailing dash from each value of *rdf*.

    Falsy values ('' / None) are left untouched. The dict is modified
    in place and also returned for convenience.

    :param rdf: dict of string (or falsy) values.
    :return: the same dict, with at most one dash stripped from each end
        of every truthy value.
    """
    for key, value in rdf.items():
        if not value:
            continue
        if value.startswith('-'):
            value = value[1:]
        if value.endswith('-'):
            value = value[:-1]
        rdf[key] = value
    return rdf
def get_type(element, forest):
    """Resolve a lexical type for *element* via WordNet, falling back
    to the lexicon.

    Dash-compound elements ('a-b') are split and resolved per part,
    yielding a dict {part: type}; plain elements yield a single type.

    :param element: word or dash-joined compound to classify.
    :param forest: parse forest handed through to get_lexname.
    :return: type value, or a dict of them for compounds.
    """
    # NOTE(review): the fallback triggers on the STRING 'None', not the
    # None object — confirm wordnet_utils really returns 'None' here.
    if '-' not in element:
        result = get_lexname(element, forest)
        if result == 'None':
            result = lexicon_lookup(element, 'category')
        return result
    result = {}
    for part in element.split('-'):
        part_type = get_lexname(part, forest)
        if part_type == 'None':
            part_type = lexicon_lookup(part, 'category')
        result[part] = part_type
    return result
def get_lexname(element, forest):
    """Return the WordNet lexname (e.g. 'noun.person') of *element*.

    :param element: word to classify; '' short-circuits to None.
    :param forest: list of parse trees; forest[0] is searched for a
        POS label for the word.
    :return: lexname string, or None when no label/synset is found.
    """
    if element == '':
        return
    # First try to read the POS label for the word from the parse tree.
    label = get_node_label(forest[0], element)
    if label == '':
        # No label in the tree — fall back to tagging the bare word.
        label = pos_tag([element])
        if label == '':
            # NOTE(review): pos_tag returns a list, never '' — this
            # guard looks unreachable; confirm before relying on it.
            return None
        # pos_tag yields [(word, tag)]; keep only the tag.
        label = label[0][1]
    synset = wu.get_synsets(element, label)
    if synset:
        # Use the first (most common) synset's lexicographer file name.
        type = wu.get_lexname(synset[0])
        return type
    else:
        return None
def fix_pronouns(pronoun, self):
    """Map a personal pronoun to a concrete referent.

    First person resolves to the current chat speaker, second person to
    'leolani'; anything else (including third person) is returned
    unchanged.

    :param pronoun: pronoun token to resolve.
    :param self: analyzer-like object exposing .LEXICON and .chat.speaker.
    :return: resolved referent string, or *pronoun* itself.
    """
    lex = self.LEXICON
    speaker = self.chat.speaker
    # NOTE(review): lexicon_lookup's second parameter is `typ`; passing
    # the whole lexicon object falls through to its catch-all category
    # list — confirm this is intended.
    entry = lexicon_lookup(pronoun, lex)
    if not entry or 'person' not in entry:
        return pronoun
    person = entry['person']
    if person == 'first':
        return speaker
    if person == 'second':
        return 'leolani'
    print('disambiguate third person')
    return pronoun
def lemmatize(word, tag=''):
    """Lemmatize a word, or each token of a whitespace-separated phrase.

    Multi-word input is lemmatized token by token (the *tag* argument is
    ignored in that case) and rejoined with single spaces.

    :param word: word or phrase to lemmatize.
    :param tag: optional WordNet POS tag, used for single-word input only.
    :return: lemmatized string.
    """
    tokens = word.split()
    if len(tokens) > 1:
        return ' '.join(wnl.lemmatize(tok) for tok in tokens)
    if tag != '':
        return wnl.lemmatize(word, tag)
    return wnl.lemmatize(word)
def get_node_label(tree, word):
    """Return the label of a subtree whose leaf equals *word*.

    Dashes are removed from *word* before matching. When several
    subtrees contain the word, the LAST match wins (no early break —
    preserved from the original semantics).

    :param tree: parse tree; iterated two levels deep for nltk Trees.
    :param word: token to look for.
    :return: str label of the last matching subtree, or '' if none.
    """
    label = ''
    if '-' in word:
        word = word.replace('-', '')
    for el in tree:
        for node in el:
            # fix: isinstance instead of exact type() comparison, so
            # nltk Tree subclasses (e.g. ParentedTree) are matched too.
            if isinstance(node, ntree.Tree):
                for subtree in node.subtrees():
                    for leaf in subtree:
                        if leaf == word:
                            label = str(subtree.label())
    return label
def lexicon_lookup(word, typ=None):
    """Look up and return features of a given word in the lexicon.

    :param word: surface form to look up (exact match).
    :param typ: restricts the categories searched ('verb', 'pos',
        'to_be', 'aux', 'modal', 'pronouns', 'lexical', 'kinship',
        'det'); any other value — including the special 'category' and
        the default None — searches ALL categories.
    :return: the feature value for *word*; with typ == 'category' a
        (category, features) tuple instead; None when not found.
    """
    # Define pronoun categories.
    pronouns = lexicon["pronouns"]
    subject_pros = pronouns["subject"]
    object_pros = pronouns["object"]
    possessive_pros = pronouns["possessive"]
    dep_possessives = possessive_pros["dependent"]
    indep_possessives = possessive_pros["independent"]
    reflexive_pros = pronouns["reflexive"]
    indefinite_pros = pronouns["indefinite"]
    indefinite_person = indefinite_pros["person"]
    indefinite_place = indefinite_pros["place"]
    indefinite_thing = indefinite_pros["thing"]
    # Define verbal categories.
    verbs = lexicon["verbs"]
    to_be = verbs["to be"]
    aux_verbs = verbs["auxiliaries"]
    have = aux_verbs['have']
    to_do = aux_verbs["to do"]
    modals = aux_verbs["modals"]
    lexicals = verbs["lexical verbs"]
    # Define determiner categories.
    determiners = lexicon["determiners"]
    articles = determiners["articles"]
    demonstratives = determiners["demonstratives"]
    possessive_dets = determiners["possessives"]
    quantifiers = determiners["quantifiers"]
    wh_dets = determiners["wh-determiners"]
    numerals = determiners["numerals"]
    cardinals = numerals["cardinals"]
    ordinals = numerals["ordinals"]
    s_genitive = determiners["s-genitive"]
    # Define conjunction categories.
    conjunctions = lexicon["conjunctions"]
    coordinators = conjunctions["coordinating"]
    subordinators = conjunctions["subordinating"]
    # Define a question word category.
    question_words = lexicon["question words"]
    # Define a kinship category.
    kinship = lexicon["kinship"]
    # Select which categories to search, based on typ.
    if typ == 'verb':
        categories = [to_be, to_do, have, modals, lexicals]
    elif typ == 'pos':
        categories = [dep_possessives]
    elif typ == 'to_be':
        categories = [to_be]
    elif typ == 'aux':
        categories = [to_do, to_be, have]
    elif typ == 'modal':
        categories = [modals]
    elif typ == 'pronouns':
        categories = [subject_pros, object_pros, dep_possessives, indep_possessives, reflexive_pros, indefinite_person, indefinite_place, indefinite_thing]
    elif typ == 'lexical':
        categories = [lexicals]
    elif typ == 'kinship':
        categories = [kinship]
    elif typ == 'det':
        categories = [articles, demonstratives, possessive_dets, possessive_pros, cardinals, ordinals]
    else:
        # NOTE(review): typ == 'category' also lands here (search
        # everything); it only changes the RETURN SHAPE below.
        categories = [subject_pros, object_pros, dep_possessives, indep_possessives, reflexive_pros, indefinite_person, indefinite_place, indefinite_thing, to_be, to_do, have, modals, lexicals, articles, demonstratives, possessive_dets, quantifiers, wh_dets, cardinals, ordinals, s_genitive, coordinators, subordinators, question_words, kinship]
    # Linear scan: first category containing the word wins.
    for category in categories:
        for item in category:
            if word == item:
                if typ == 'category':
                    # Caller wants the containing category as well.
                    return category, category[item]
                return category[item]
    return None
def dbp_query(q, epr, f='application/json'):
    """Send SPARQL query *q* to endpoint *epr* via HTTP GET.

    :param q: SPARQL query string (url-encoded here).
    :param epr: endpoint URL, e.g. http://dbpedia.org/sparql.
    :param f: value for the Accept header (response format).
    :return: raw response body as returned by urllib2.
    :raises: re-raises any failure after printing the traceback.
    """
    try:
        params = urllib.urlencode({'query': q})
        opener = urllib2.build_opener(urllib2.HTTPHandler)
        request = urllib2.Request(epr + '?' + params)
        request.add_header('Accept', f)
        request.get_method = lambda: 'GET'
        response = opener.open(request)
        try:
            return response.read()
        finally:
            # fix: the original never closed the HTTP response (leak).
            response.close()
    except Exception:
        traceback.print_exc(file=sys.stdout)
        # fix: bare raise preserves the original traceback (Python 2
        # `raise e` would truncate it); callers see the same exception.
        raise
def get_uri(string):
    """Return the first DBpedia URI whose rdfs:label equals *string*.

    :param string: exact label text to match.
    :return: URI string of the first binding, or None when there is none.
    :raises KeyError: if the endpoint response lacks results/bindings.
    """
    # NOTE(review): *string* is interpolated into the SPARQL query
    # unescaped — a quote in the input breaks (or injects into) the
    # query; sanitize upstream or escape here.
    query = ('PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> '
             'SELECT ?x '
             'WHERE { ?x rdfs:label ?string . '
             'FILTER ( ?string = "' + string + '" ) } '
             'LIMIT 200')
    results = json.loads(dbp_query(query, "http://dbpedia.org/sparql"))
    uris = [x['x']['value'] for x in results['results']['bindings']]
    if uris:
        return uris[0]
    else:
        return None
# NOTE(review): removed an inert module-level string literal that held a
# commented-out earlier implementation of dbp_query/get_uri (dead code,
# never referenced); the live versions are defined above.