ratatouille/src/tools/parse_entities.py

52 lines
2.4 KiB
Python

from collections import defaultdict
# Marker that prefixes the first sub-word piece of every word in the tokenizer
# output (see the examples below). Written as an escape so the character
# cannot be silently lost: U+2581 LOWER ONE EIGHTH BLOCK.
_WORD_START = "\u2581"


def parse_entities(entities):
    """Group sub-word entity tokens into whole words, keyed by entity label.

    Each item of ``entities`` is a mapping with at least the keys ``'word'``
    (a sub-word piece) and ``'entity'`` (its label). A piece whose text
    starts with the word-start marker ``'▁'`` opens a new word; every
    following piece without the marker is appended to that word. The label
    of a word is the label of its first piece; labels on continuation pieces
    are ignored. Consecutive words are stored as separate list entries even
    when they share a label.

    Args:
        entities: iterable of token mappings, in sentence order.

    Returns:
        A ``defaultdict(list)`` mapping each entity label to the list of
        reconstructed words, in order of appearance. Empty input yields an
        empty mapping.

    >>> parse_entities([{'entity': 'B-weather_descriptor', 'score': 0.76314837, 'index': 4, 'word': '▁température', 'start': 15, 'end': 26}])
    defaultdict(<class 'list'>, {'B-weather_descriptor': ['température']})
    >>> parse_entities([
    ...     {'entity': 'B-weather_descriptor', 'score': 0.98460984, 'index': 8, 'word': '▁ple', 'start': 16, 'end': 19},
    ...     {'entity': 'B-weather_descriptor', 'score': 0.98146856, 'index': 9, 'word': 'u', 'start': 19, 'end': 20},
    ...     {'entity': 'B-weather_descriptor', 'score': 0.97976905, 'index': 10, 'word': 'voir', 'start': 20, 'end': 24},
    ...     {'entity': 'B-date', 'score': 0.9455722, 'index': 12, 'word': '▁de', 'start': 28, 'end': 30},
    ...     {'entity': 'B-date', 'score': 0.9530212, 'index': 13, 'word': 'main', 'start': 30, 'end': 34}])
    defaultdict(<class 'list'>, {'B-weather_descriptor': ['pleuvoir'], 'B-date': ['demain']})
    >>> parse_entities([
    ...     {'entity': 'B-weather_descriptor', 'score': 0.9845413, 'index': 8, 'word': '▁ple', 'start': 16, 'end': 19},
    ...     {'entity': 'B-weather_descriptor', 'score': 0.9849722, 'index': 9, 'word': 'u', 'start': 19, 'end': 20},
    ...     {'entity': 'B-weather_descriptor', 'score': 0.97936, 'index': 10, 'word': 'voir', 'start': 20, 'end': 24},
    ...     {'entity': 'B-date', 'score': 0.9811183, 'index': 11, 'word': '▁de', 'start': 25, 'end': 27},
    ...     {'entity': 'B-date', 'score': 0.9860088, 'index': 12, 'word': 'main', 'start': 27, 'end': 31},
    ...     {'entity': 'B-date', 'score': 0.8544976, 'index': 14, 'word': '▁lundi', 'start': 35, 'end': 40}])
    defaultdict(<class 'list'>, {'B-weather_descriptor': ['pleuvoir'], 'B-date': ['demain', 'lundi']})
    >>> parse_entities([])
    defaultdict(<class 'list'>, {})
    """
    entities_dict = defaultdict(list)
    current_type = None  # label of the word being assembled; None = no open word
    pieces = []          # sub-word pieces of the word being assembled

    for entity in entities:
        word = entity['word']
        # startswith() instead of word[0] so an empty piece cannot raise.
        # A continuation piece seen before any word was opened also opens one,
        # otherwise it would end up filed under the key None.
        if word.startswith(_WORD_START) or current_type is None:
            if current_type is not None:
                # A new word begins: flush the one assembled so far.
                entities_dict[current_type].append("".join(pieces))
            current_type = entity['entity']
            pieces = [word.strip(_WORD_START)]
        else:
            pieces.append(word.strip(_WORD_START))

    # Flush the last word; skipped for empty input so no bogus entry is added.
    if current_type is not None:
        entities_dict[current_type].append("".join(pieces))
    return entities_dict
if __name__ == "__main__":
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()