"""Dependency-tree rule matching: turn spaCy parses into triple structures."""
import copy
class Noun_phrase():
    """A noun phrase lifted out of a spaCy parse.

    Holds the spaCy doc covering the whole phrase together with the text of
    its head noun.
    """

    def __init__(self, doc, noun):
        # Text of the chunk's root token (the head noun).
        self.core_noun = noun
        # spaCy Doc spanning the entire phrase.
        self.doc = doc

    def __str__(self):
        # The phrase renders as the underlying doc's surface text.
        return self.doc.text
def root_of_doc(doc):
    """Return the first token whose dependency label is 'ROOT' (None if absent)."""
    return next((token for token in doc if token.dep_ == 'ROOT'), None)
#
# ==== Format ====
# Tree levels alternate between a word type (POS tag) and a relation
# (dependency label) linking a word to its ancestor:
#   word type -rel-> ancestor type -rel-> 2nd ancestor type ...
# A path always ends on a word/ancestor type.
#
# "*" matches "any other relation" or "any other type".
#
# Leaves are rule names (methods of the Rules class below).
#
# Lookup tree: keys alternate POS tag / dependency label along a token's
# ancestor path; leaves name methods of the Rules class. "*" is a wildcard.
RULES_TREE = {
    "*": {
        # token --nmod--> any ancestor
        "nmod": {
            "*": "r1"
        },
        # token --cc--> ancestor --conj--> any grandparent
        "cc": {
            "*": {
                "conj": {
                    "*": "r2"
                }
            }
        },
        # token --amod--> any ancestor
        "amod": {
            "*": "r3",
        },
        # token --nsubj--> NOUN ancestor
        "nsubj": {
            "NOUN": "r4"
        },
        # anything else: fall through to the keep-the-token rule
        "*": {
            "*" : "r0"
        }
    },
    # NOUN token --nsubj--> ADJ ancestor
    "NOUN": {
        "nsubj": {
            "ADJ": "r4_prime",
        },
    }
}
class Rules():
    """Rule callbacks named by the leaves of RULES_TREE.

    Each rule receives the selected ancestor chain (furthest ancestor first,
    the matched token last) and folds it into a triple / merge structure.
    """

    @staticmethod
    def identity(a):
        """A token with no ancestors stands for itself."""
        return a

    @staticmethod
    def r0(a, b):
        """Fallback: discard the ancestor, keep the first token."""
        return a

    @staticmethod
    def r1(a, b):
        """Nominal modifier: (modifier b, head a, unknown object)."""
        return (b, a, '?')

    @staticmethod
    def r2(a, b, c):
        """Coordination: the two conjuncts plus the coordinating word."""
        return { 'els': [a,b], 'conj': c}

    @staticmethod
    def r3(a, b):
        """Adjectival modifier: join the two surface forms with a space."""
        return a.text + " " + b.text

    @staticmethod
    def r4(a, b):
        """nsubj under a NOUN: ('?', a, b) — subject slot unknown."""
        return ('?', a, b)

    @staticmethod
    def r4_prime(b, a):
        """nsubj under an ADJ: same triple as r4 with the slots swapped."""
        return ('?', a, b)
def get_from_tree_rules(cur_rules_tree, rules):
    """Walk the path `rules` (POS / dep labels) down the rules tree.

    Returns the node reached by following the path — a rule-name string at a
    leaf, or a subtree when the path runs out early. At every level the exact
    label is tried first, then the "*" wildcard branch (also used when the
    exact branch dead-ends). Returns None when neither matches. The caller's
    `rules` list is never modified.
    """
    # Leaf reached: a rule name.
    if isinstance(cur_rules_tree, str):
        return cur_rules_tree

    # Path exhausted: hand back whatever node we stopped on.
    if not rules:
        return cur_rules_tree

    # Fix: the original deep-copied the whole list and popped its head at
    # every recursion level (O(n^2) overall). Slicing is cheaper and equally
    # leaves the caller's list untouched — the elements are plain strings.
    label, remaining = rules[0], rules[1:]

    if label in cur_rules_tree:
        res = get_from_tree_rules(cur_rules_tree[label], remaining)
        if res is not None:
            return res

    # Wildcard fallback.
    if "*" in cur_rules_tree:
        return get_from_tree_rules(cur_rules_tree["*"], remaining)
    else:
        return None
def find_rule_for_token(token):
    """Climb the token's ancestor chain until RULES_TREE yields a rule.

    Builds the alternating POS/dep path that RULES_TREE expects and queries
    it after each ancestor. Returns a triple
    (index_of_token_in_chain, rule_name, chain) where `chain` is ordered
    furthest-ancestor-first and ends with `token` itself. A token with no
    ancestors maps to the "identity" rule; raises when ancestors exist but
    no rule matches.
    """
    path = [token.pos_]
    chain = [token]
    previous = token
    seen_ancestor = False

    for ancestor in token.ancestors:
        seen_ancestor = True
        # Extend the path with the relation to this ancestor and its POS.
        path += [previous.dep_, ancestor.pos_]
        chain.insert(0, ancestor)
        rule = get_from_tree_rules(RULES_TREE, path)
        # Only a leaf (a string) counts as a final match.
        if rule and isinstance(rule, str):
            return (len(chain) - 1, rule, chain)
        previous = ancestor

    if not seen_ancestor:
        return (0, "identity", [token])

    raise Exception('No rules found')
def apply_rules(number_of_ancestors, cur_rules, selected_ancestors):
    """Run the Rules method named `cur_rules` on the selected ancestors.

    `number_of_ancestors` is carried in the rule triple but unused here.
    """
    rule_method = getattr(Rules, cur_rules)
    return rule_method(*selected_ancestors)
def merge(cur_rule, rule_to_merge):
    # Merge two pending rule triples produced by find_rule_for_token.
    # Each triple is (index_into_ancestors, rule_name, ancestors_list);
    # the "big" rule is the one whose anchor index is larger (deeper chain).
    if cur_rule[0] < rule_to_merge[0]:
        big_rule = rule_to_merge
        small_rule = cur_rule
    else:
        big_rule = cur_rule
        small_rule = rule_to_merge

    (big_n_of_ancestor, big_rule_method, big_ancestors) = big_rule
    (small_n_of_ancestor, small_rule_method, small_ancestors) = small_rule

    # fix
    # r0 is the fallback rule: the big rule contributes nothing, so the
    # small rule survives unchanged.
    if big_rule_method == 'r0':
        return small_rule

    # Substitute the big rule's anchor token into the small rule's slot,
    # then evaluate the small rule and fold its result back into the big
    # rule's ancestor list.
    # NOTE(review): the two statements below are order-dependent —
    # apply_rules(*small_rule) must run AFTER small_ancestors was patched.
    small_ancestors[small_n_of_ancestor] = big_ancestors[big_n_of_ancestor]
    big_ancestors[big_n_of_ancestor] = apply_rules(*small_rule)

    return (big_n_of_ancestor, big_rule_method, big_ancestors)
def doc_to_triple(token, root = True):
    """Recursively fold a dependency subtree into a triple structure.

    Looks up the rule for `token`, merges in the pending rules of every
    child subtree and, at the recursion root, evaluates the accumulated
    rule. Non-root calls return the pending triple with its ancestor index
    shifted down one level for the caller.
    """
    pending = find_rule_for_token(token)

    for child in token.children:
        pending = merge(pending, doc_to_triple(child, False))

    if root:
        return apply_rules(*pending)

    depth, method, ancestors = pending
    return (depth - 1, method, ancestors)
def merge_token_entity(doc):
    # Collapse every noun chunk of `doc` into a single token so the rules
    # above see whole noun phrases instead of individual words.
    # NOTE(review): despite the name, this iterates doc.noun_chunks, not
    # doc.ents — confirm which was intended.
    with doc.retokenize() as retokenizer:
        for ent in doc.noun_chunks:
            # Attach the phrase as a custom "noun" attribute on the merged
            # token. NOTE(review): custom attributes normally require
            # Token.set_extension("noun") to be registered first — verify.
            retokenizer.merge(doc[ent.start:ent.end], attrs={"LEMMA": ent.text, "noun": Noun_phrase(ent.as_doc(), ent.root.text)})
    return doc
if __name__ == "__main__":
    # Smoke tests for the rule pipeline. Requires the large French spaCy
    # model; the token indices in the asserts below are pinned to the exact
    # tokenization produced by fr_core_news_lg.
    import spacy
    from spacy.tokens import Doc,Token
    nlp = spacy.load('fr_core_news_lg')
    print(nlp._path)

    doc = nlp("Qui est le président de l'allemagne et de la france?")

    # rules "et" => {'els': [allemagne, france], 'conj': et}
    assert apply_rules(*find_rule_for_token(doc[7])) == {'els': [doc[6], doc[10]], 'conj': doc[7]}
    # rules "allemagne" => (allemagne, président,'?')
    assert apply_rules(*find_rule_for_token(doc[6])) == (doc[6],doc[3],'?')
    # rules "président" => président
    assert apply_rules(*find_rule_for_token(doc[3])) == doc[3]

    doc = nlp("Qui est le président de la france?")
    # doc_to_triple "président" => (france, président, '?')
    assert doc_to_triple(doc[3]) == (doc[6], doc[3], '?')

    # "états unis" is merged by the amod rule (r3) into a plain string.
    doc = nlp("Qui est le président des états unis?")
    assert doc_to_triple(doc[3]) == ("états unis", doc[3], '?')

    # doc_to_triple "président" => ({'els': ['états unis', france], 'conj': et}, président, '?')
    doc = nlp("Qui est le président des états unis et de la france?")
    assert doc_to_triple(doc[3]) == ({'els': ['états unis', doc[10]], 'conj': doc[7]}, doc[3], '?')