72 lines
2.4 KiB
Python
72 lines
2.4 KiB
Python
from datasets import load_dataset
|
|
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
|
|
import evaluate
|
|
import numpy as np
|
|
|
|
# Intent classes handled by the classifier. The position of each name in the
# tuple is its class index, so the order must not be changed.
id2label = dict(enumerate((
    "HEURE",
    "DATE",
    "ETEINDRE_CUISINE",
    "ETEINDRE_BUREAU",
    "ETEINDRE_SALON",
    "ETEINDRE_CHAMBRE",
    "METEO",
    "TEMPERATURE_EXTERIEUR",
    "TEMPERATURE_INTERIEUR",
    "ALLUMER_CUISINE",
    "ALLUMER_SALON",
    "ALLUMER_BUREAU",
    "ALLUMER_CHAMBRE",
)))

# Reverse lookup (class name -> class index), obtained by inverting id2label
# so both mappings always describe the same label set.
label2id = {name: index for index, name in id2label.items()}
|
|
|
|
# Tokenizer for the "almanach/camembert-base" checkpoint; the same checkpoint
# name is passed to the model loader further down, so the two stay paired.
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
|
|
|
|
|
|
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch of rows; only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.
    """
    # No padding here on purpose: this script builds a DataCollatorWithPadding
    # and hands it to the Trainer, which pads each batch to its own longest
    # sequence. Padding every example to "max_length" at map time would make
    # every sequence maximum-length and waste memory and compute on short
    # commands.
    return tokenizer(examples["text"], truncation=True)
|
|
|
|
|
|
# Load the semicolon-separated CSV files for training and validation.
# Each load produces its own dataset object; the Trainer setup below reads
# the "train" key of both of them.
dataset_train, dataset_eval = (
    load_dataset("csv", data_files=csv_path, delimiter=";")
    for csv_path in ("v2.1.train.csv", "v2.1.valid.csv")
)

# Tokenize both datasets in batches with preprocess_function.
tokenized_train, tokenized_eval = (
    loaded.map(preprocess_function, batched=True)
    for loaded in (dataset_train, dataset_eval)
)
|
|
|
|
|
|
# Accuracy metric used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")

# Collator that pads the examples of a batch with the tokenizer's padding
# settings when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)
|
|
|
|
|
|
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # The highest-scoring class along axis 1 is the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)
|
|
|
|
|
|
# Sequence-classification model on top of the CamemBERT checkpoint, with one
# output class per entry of the label maps (13 intents).
model = AutoModelForSequenceClassification.from_pretrained(
    "almanach/camembert-base",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

print("\n All is loaded ! \n")
|
|
|
|
|
|
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir="domotique_command_classification",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the best
    # checkpoint when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
|
|
|
|
# Wire the model, data, tokenizer and metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    # Both CSV loads expose their rows under the "train" key, including the
    # validation file.
    train_dataset=tokenized_train["train"],
    eval_dataset=tokenized_eval["train"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)
|
|
|
|
print("\n\n Start training ! \n")

# Fine-tune the model.
trainer.train()

# Save only after training has finished. Calling save_model() before
# trainer.train() would write the model as it was right after loading,
# i.e. before any fine-tuning. With load_best_model_at_end=True set in the
# training arguments, the model held by the trainer at this point is the
# best checkpoint of the run.
trainer.save_model("models/bert/")
|