# intent-recognition-training/bert.py
# Fine-tunes CamemBERT to classify French home-automation voice commands by intent.
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
# Intent labels: time/date queries, weather, indoor/outdoor temperature, and
# turning lights on (ALLUMER) or off (ETEINDRE) per room.
id2label = {
    0: "HEURE", 1: "DATE",
    2: "ETEINDRE_CUISINE", 3: "ETEINDRE_BUREAU", 4: "ETEINDRE_SALON", 5: "ETEINDRE_CHAMBRE",
    6: "METEO", 7: "TEMPERATURE_EXTERIEUR", 8: "TEMPERATURE_INTERIEUR",
    9: "ALLUMER_CUISINE", 10: "ALLUMER_SALON", 11: "ALLUMER_BUREAU", 12: "ALLUMER_CHAMBRE",
}
label2id = {label: idx for idx, label in id2label.items()}
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")

def preprocess_function(examples):
    # Tokenize and truncate only; padding is left to DataCollatorWithPadding,
    # which pads each batch dynamically instead of always padding to max length.
    return tokenizer(examples["text"], truncation=True)
# Each semicolon-delimited CSV is loaded as a DatasetDict with a single
# "train" split containing the "text" column tokenized below.
dataset_train = load_dataset("csv", data_files="v2.1.train.csv", delimiter=";")
dataset_eval = load_dataset("csv", data_files="v2.1.valid.csv", delimiter=";")
tokenized_train = dataset_train.map(preprocess_function, batched=True)
tokenized_eval = dataset_eval.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # Convert logits to predicted class ids before scoring accuracy.
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "almanach/camembert-base", num_labels=13, id2label=id2label, label2id=label2id
)
print("\nAll is loaded!\n")
training_args = TrainingArguments(
    output_dir="domotique_command_classification",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=16,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    # load_dataset("csv", ...) names its only split "train", so the
    # validation file is also accessed through that key.
    eval_dataset=tokenized_eval["train"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
print("\n\nStart training!\n")
trainer.train()
# Save the fine-tuned weights (and tokenizer) only after training has finished.
trainer.save_model("models/bert/")
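
# Optional sanity check (an addition, not part of the original script): reload
# the saved model through a text-classification pipeline and classify one
# command. The sample utterance below is an assumed example input.
from transformers import pipeline

classifier = pipeline("text-classification", model="models/bert/")
print(classifier("allume la lumière du salon"))  # expected intent: ALLUMER_SALON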