"""Dependency-tree rule matching: turn spaCy parses into triple structures."""
import copy
class Noun_phrase():
    """A noun phrase lifted out of a spaCy parse.

    Holds the spaCy doc covering the whole phrase together with the text of
    its head noun.
    """

    def __init__(self, doc, noun):
        # Text of the chunk's root token (the head noun).
        self.core_noun = noun
        # spaCy Doc spanning the entire phrase.
        self.doc = doc

    def __str__(self):
        # The phrase renders as the underlying doc's surface text.
        return self.doc.text
def root_of_doc(doc):
    """Return the first token whose dependency label is 'ROOT' (None if absent)."""
    return next((token for token in doc if token.dep_ == 'ROOT'), None)
#
# ==== Format ====
# Tree levels alternate between a word type (POS tag) and a relation
# (dependency label) linking a word to its ancestor:
#   word type -rel-> ancestor type -rel-> 2nd ancestor type ...
# A path always ends on a word/ancestor type.
#
# "*" matches "any other relation" or "any other type".
#
# Leaves are rule names (methods of the Rules class below).
#
# Lookup tree: keys alternate POS tag / dependency label along a token's
# ancestor path; leaves name methods of the Rules class. "*" is a wildcard.
RULES_TREE = {
    "*": {
        # token --nmod--> any ancestor
        "nmod": {
            "*": "r1"
        },
        # token --cc--> ancestor --conj--> any grandparent
        "cc": {
            "*": {
                "conj": {
                    "*": "r2"
                }
            }
        },
        # token --amod--> any ancestor
        "amod": {
            "*": "r3",
        },
        # token --nsubj--> NOUN ancestor
        "nsubj": {
            "NOUN": "r4"
        },
        # anything else: fall through to the keep-the-token rule
        "*": {
            "*" : "r0"
        }
    },
    # NOUN token --nsubj--> ADJ ancestor
    "NOUN": {
        "nsubj": {
            "ADJ": "r4_prime",
        },
    }
}
class Rules():
    """Rule callbacks named by the leaves of RULES_TREE.

    Each rule receives the selected ancestor chain (furthest ancestor first,
    the matched token last) and folds it into a triple / merge structure.
    """

    @staticmethod
    def identity(a):
        """A token with no ancestors stands for itself."""
        return a

    @staticmethod
    def r0(a, b):
        """Fallback: discard the ancestor, keep the first token."""
        return a

    @staticmethod
    def r1(a, b):
        """Nominal modifier: (modifier b, head a, unknown object)."""
        return (b, a, '?')

    @staticmethod
    def r2(a, b, c):
        """Coordination: the two conjuncts plus the coordinating word."""
        return { 'els': [a,b], 'conj': c}

    @staticmethod
    def r3(a, b):
        """Adjectival modifier: join the two surface forms with a space."""
        return a.text + " " + b.text

    @staticmethod
    def r4(a, b):
        """nsubj under a NOUN: ('?', a, b) — subject slot unknown."""
        return ('?', a, b)

    @staticmethod
    def r4_prime(b, a):
        """nsubj under an ADJ: same triple as r4 with the slots swapped."""
        return ('?', a, b)
def get_from_tree_rules(cur_rules_tree, rules):
    """Walk the path `rules` (POS / dep labels) down the rules tree.

    Returns the node reached by following the path — a rule-name string at a
    leaf, or a subtree when the path runs out early. At every level the exact
    label is tried first, then the "*" wildcard branch (also used when the
    exact branch dead-ends). Returns None when neither matches. The caller's
    `rules` list is never modified.
    """
    # Leaf reached: a rule name.
    if isinstance(cur_rules_tree, str):
        return cur_rules_tree

    # Path exhausted: hand back whatever node we stopped on.
    if not rules:
        return cur_rules_tree

    # Fix: the original deep-copied the whole list and popped its head at
    # every recursion level (O(n^2) overall). Slicing is cheaper and equally
    # leaves the caller's list untouched — the elements are plain strings.
    label, remaining = rules[0], rules[1:]

    if label in cur_rules_tree:
        res = get_from_tree_rules(cur_rules_tree[label], remaining)
        if res is not None:
            return res

    # Wildcard fallback.
    if "*" in cur_rules_tree:
        return get_from_tree_rules(cur_rules_tree["*"], remaining)
    else:
        return None
def find_rule_for_token(token):
    """Climb the token's ancestor chain until RULES_TREE yields a rule.

    Builds the alternating POS/dep path that RULES_TREE expects and queries
    it after each ancestor. Returns a triple
    (index_of_token_in_chain, rule_name, chain) where `chain` is ordered
    furthest-ancestor-first and ends with `token` itself. A token with no
    ancestors maps to the "identity" rule; raises when ancestors exist but
    no rule matches.
    """
    path = [token.pos_]
    chain = [token]
    previous = token
    seen_ancestor = False

    for ancestor in token.ancestors:
        seen_ancestor = True
        # Extend the path with the relation to this ancestor and its POS.
        path += [previous.dep_, ancestor.pos_]
        chain.insert(0, ancestor)
        rule = get_from_tree_rules(RULES_TREE, path)
        # Only a leaf (a string) counts as a final match.
        if rule and isinstance(rule, str):
            return (len(chain) - 1, rule, chain)
        previous = ancestor

    if not seen_ancestor:
        return (0, "identity", [token])

    raise Exception('No rules found')
def apply_rules(number_of_ancestors, cur_rules, selected_ancestors):
    """Run the Rules method named `cur_rules` on the selected ancestors.

    `number_of_ancestors` is carried in the rule triple but unused here.
    """
    rule_method = getattr(Rules, cur_rules)
    return rule_method(*selected_ancestors)
def merge(cur_rule, rule_to_merge):
    # Merge two pending rule triples produced by find_rule_for_token.
    # Each triple is (index_into_ancestors, rule_name, ancestors_list);
    # the "big" rule is the one whose anchor index is larger (deeper chain).
    if cur_rule[0] < rule_to_merge[0]:
        big_rule = rule_to_merge
        small_rule = cur_rule
    else:
        big_rule = cur_rule
        small_rule = rule_to_merge

    (big_n_of_ancestor, big_rule_method, big_ancestors) = big_rule
    (small_n_of_ancestor, small_rule_method, small_ancestors) = small_rule

    # fix
    # r0 is the fallback rule: the big rule contributes nothing, so the
    # small rule survives unchanged.
    if big_rule_method == 'r0':
        return small_rule

    # Substitute the big rule's anchor token into the small rule's slot,
    # then evaluate the small rule and fold its result back into the big
    # rule's ancestor list.
    # NOTE(review): the two statements below are order-dependent —
    # apply_rules(*small_rule) must run AFTER small_ancestors was patched.
    small_ancestors[small_n_of_ancestor] = big_ancestors[big_n_of_ancestor]
    big_ancestors[big_n_of_ancestor] = apply_rules(*small_rule)

    return (big_n_of_ancestor, big_rule_method, big_ancestors)
def doc_to_triple(token, root = True):
    """Recursively fold a dependency subtree into a triple structure.

    Looks up the rule for `token`, merges in the pending rules of every
    child subtree and, at the recursion root, evaluates the accumulated
    rule. Non-root calls return the pending triple with its ancestor index
    shifted down one level for the caller.
    """
    pending = find_rule_for_token(token)

    for child in token.children:
        pending = merge(pending, doc_to_triple(child, False))

    if root:
        return apply_rules(*pending)

    depth, method, ancestors = pending
    return (depth - 1, method, ancestors)
def merge_token_entity(doc):
    # Collapse every noun chunk of `doc` into a single token so the rules
    # above see whole noun phrases instead of individual words.
    # NOTE(review): despite the name, this iterates doc.noun_chunks, not
    # doc.ents — confirm which was intended.
    with doc.retokenize() as retokenizer:
        for ent in doc.noun_chunks:
            # Attach the phrase as a custom "noun" attribute on the merged
            # token. NOTE(review): custom attributes normally require
            # Token.set_extension("noun") to be registered first — verify.
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text, "noun": Noun_phrase(ent.as_doc(), ent.root.text)})
    return doc
if __name__ == "__main__":
    # Smoke tests for the rule pipeline. Requires the large French spaCy
    # model; the token indices in the asserts below are pinned to the exact
    # tokenization produced by fr_core_news_lg.
    import spacy
    from spacy.tokens import Doc,Token
    nlp = spacy.load('fr_core_news_lg')
    print(nlp._path)

    doc = nlp("Qui est le président de l'allemagne et de la france?")

    # rules "et" => {'els': [allemagne, france], 'conj': et}
    assert apply_rules(*find_rule_for_token(doc[7])) == {'els': [doc[6], doc[10]], 'conj': doc[7]}
    # rules "allemagne" => (allemagne, président,'?')
    assert apply_rules(*find_rule_for_token(doc[6])) == (doc[6],doc[3],'?')
    # rules "président" => président
    assert apply_rules(*find_rule_for_token(doc[3])) == doc[3]

    doc = nlp("Qui est le président de la france?")
    # doc_to_triple "président" => (france, président, '?')
    assert doc_to_triple(doc[3]) == (doc[6], doc[3], '?')

    # "états unis" is merged by the amod rule (r3) into a plain string.
    doc = nlp("Qui est le président des états unis?")
    assert doc_to_triple(doc[3]) == ("états unis", doc[3], '?')

    # doc_to_triple "président" => ({'els': ['états unis', france], 'conj': et}, président, '?')
    doc = nlp("Qui est le président des états unis et de la france?")
    assert doc_to_triple(doc[3]) == ({'els': ['états unis', doc[10]], 'conj': doc[7]}, doc[3], '?')