Errol/src/nlp.py

import copy


class Noun_phrase():
    def __init__(self, doc, noun):
        self.core_noun = noun
        self.doc = doc

    def __str__(self):
        #if(self.det):
        #    return '{0} {1} {2}'.format(str(self.det), str(self.core_noun), ' et '.join(str(x) for x in self.adjs))
        #else:
        return self.doc.text


def root_of_doc(doc):
    for token in doc:
        if token.dep_ == 'ROOT':
            return token


#
# ==== Format ====
# Branches alternate between "word type" (POS tag) and "relation between words":
# word type -rel-> ancestor type -rel-> 2nd ancestor type ...
# A path should end with a word/ancestor type.
#
# "*" matches "any other relation" or "any other type".
#
# Leaves are rule names.
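#
# Example: the path ["NOUN", "nmod", "NOUN"] (a noun attached to a noun head
# by an "nmod" relation) has no match under the dedicated "NOUN" branch, so it
# falls through to "*" -> "nmod" -> "*" and resolves to rule "r1".
#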
RULES_TREE = {
    "*": {
        "nmod": {
            "*": "r1"
        },
        "cc": {
            "*": {
                "conj": {
                    "*": "r2"
                }
            }
        },
        "amod": {
            "*": "r3",
        },
        "nsubj": {
            "NOUN": "r4"
        },
        "*": {
            "*": "r0"
        }
    },
    "NOUN": {
        "nsubj": {
            "ADJ": "r4_prime",
        },
    }
}


class Rules():
    # Each rule receives the tokens selected along the dependency path
    # (outermost ancestor first) and builds a piece of the final triple;
    # '?' marks the unknown slot of the triple.
    @staticmethod
    def identity(a):
        return a

    @staticmethod
    def r0(a, b):
        return a

    @staticmethod
    def r1(a, b):
        return (b, a, '?')

    @staticmethod
    def r2(a, b, c):
        return {'els': [a, b], 'conj': c}

    @staticmethod
    def r3(a, b):
        return a.text + " " + b.text

    @staticmethod
    def r4(a, b):
        return ('?', a, b)

    @staticmethod
    def r4_prime(b, a):
        return ('?', a, b)


def get_from_tree_rules(cur_rules_tree, rules):
    if isinstance(cur_rules_tree, str):
        return cur_rules_tree
    if not rules:
        return cur_rules_tree
    copy_rules = copy.deepcopy(rules)
    rule = copy_rules.pop(0)
    if rule in cur_rules_tree:
        res = get_from_tree_rules(cur_rules_tree[rule], copy_rules)
        if res is not None:
            return res
    if "*" in cur_rules_tree:
        return get_from_tree_rules(cur_rules_tree["*"], copy_rules)
    else:
        return None
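
# Note: at each level of RULES_TREE the concrete branch is tried before the
# "*" wildcard: ["NOUN", "nsubj", "ADJ"] resolves to "r4_prime" through the
# dedicated "NOUN" branch, while ["NOUN", "nsubj", "NOUN"] falls back to the
# "*" branch and yields "r4".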


def find_rule_for_token(token):
    # Walk up the token's ancestors, building the path
    # [token POS, token dep, parent POS, parent dep, ...] until a rule matches.
    previous_ancestor = token
    selected_ancestors = [token]
    loop_increment = 0
    path_rules = [token.pos_]
    for ancestor in token.ancestors:
        path_rules.append(previous_ancestor.dep_)
        path_rules.append(ancestor.pos_)
        selected_ancestors = [ancestor] + selected_ancestors
        rule = get_from_tree_rules(RULES_TREE, path_rules)
        if rule and isinstance(rule, str):
            return (len(selected_ancestors) - 1, rule, selected_ancestors)
        previous_ancestor = ancestor
        loop_increment = loop_increment + 1
    if loop_increment == 0:
        return (0, "identity", [token])
    raise Exception('No rules found')


def apply_rules(number_of_ancestors, cur_rules, selected_ancestors):
    return getattr(Rules, cur_rules)(*selected_ancestors)
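
# For instance, apply_rules(1, 'r1', [président, allemagne]) dispatches to
# Rules.r1(président, allemagne) and yields (allemagne, président, '?');
# see the assertions in __main__ below.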


def merge(cur_rule, rule_to_merge):
    if cur_rule[0] < rule_to_merge[0]:
        big_rule = rule_to_merge
        small_rule = cur_rule
    else:
        big_rule = cur_rule
        small_rule = rule_to_merge
    (big_n_of_ancestor, big_rule_method, big_ancestors) = big_rule
    (small_n_of_ancestor, small_rule_method, small_ancestors) = small_rule
    # fix: if the rule with more ancestors is the catch-all r0, keep only the
    # more specific rule
    if big_rule_method == 'r0':
        return small_rule
    small_ancestors[small_n_of_ancestor] = big_ancestors[big_n_of_ancestor]
    big_ancestors[big_n_of_ancestor] = apply_rules(*small_rule)
    return (big_n_of_ancestor, big_rule_method, big_ancestors)


def doc_to_triple(token, root=True):
    # Fold the rules of all children into this token's rule; only the root
    # call applies the resulting rule.
    cur_rule = find_rule_for_token(token)
    for child in token.children:
        cur_rule = merge(cur_rule, doc_to_triple(child, False))
    if root:
        return apply_rules(*cur_rule)
    (number_of_ancestors_before, res, ancestors) = cur_rule
    return (number_of_ancestors_before - 1, res, ancestors)


def merge_token_entity(doc):
    from spacy.tokens import Token
    # "noun" is a custom attribute: register it so the retokenizer can set it via the "_" key.
    if not Token.has_extension("noun"):
        Token.set_extension("noun", default=None)
    with doc.retokenize() as retokenizer:
        for ent in doc.noun_chunks:
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text, "_": {"noun": Noun_phrase(ent.as_doc(), ent.root.text)}})
    return doc
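
# Presumably meant to be run on a parsed doc before doc_to_triple so that
# multi-word noun chunks (e.g. "états unis") become single tokens; it is not
# exercised by the checks in __main__ below.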


if __name__ == "__main__":
    import spacy
    from spacy.tokens import Doc, Token
    nlp = spacy.load('fr_core_news_lg')
    print(nlp._path)
    doc = nlp("Qui est le président de l'allemagne et de la france?")
    # rule for "et" => {'els': [allemagne, france], 'conj': et}
    assert apply_rules(*find_rule_for_token(doc[7])) == {'els': [doc[6], doc[10]], 'conj': doc[7]}
    # rule for "allemagne" => (allemagne, président, '?')
    assert apply_rules(*find_rule_for_token(doc[6])) == (doc[6], doc[3], '?')
    # rule for "président" => président
    assert apply_rules(*find_rule_for_token(doc[3])) == doc[3]
    doc = nlp("Qui est le président de la france?")
    # doc_to_triple "président" => (france, président, '?')
    assert doc_to_triple(doc[3]) == (doc[6], doc[3], '?')
    doc = nlp("Qui est le président des états unis?")
    assert doc_to_triple(doc[3]) == ("états unis", doc[3], '?')
    doc = nlp("Qui est le président des états unis et de la france?")
    # doc_to_triple "président" => ({'els': ['états unis', france], 'conj': et}, président, '?')
    assert doc_to_triple(doc[3]) == ({'els': ['états unis', doc[10]], 'conj': doc[7]}, doc[3], '?')
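    # A couple of direct lookups against RULES_TREE with hand-written paths,
    # to illustrate the path format independently of the parser:
    assert get_from_tree_rules(RULES_TREE, ["NOUN", "nmod", "NOUN"]) == "r1"
    assert get_from_tree_rules(RULES_TREE, ["NOUN", "nsubj", "ADJ"]) == "r4_prime"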