| import pandas as pd |
| from datasets import Dataset |
| from transformers import ( |
| AutoTokenizer, |
| AutoModelForSequenceClassification, |
| Trainer, |
| TrainingArguments |
| ) |
|
|
| |
# Load the labelled examples; the code below reads the "intent" column here
# and the "text" column later during tokenization.
df = pd.read_csv("data/intents.csv")

# Sort the distinct intents so the label <-> id assignment does not depend on
# the row order of the CSV.
labels = sorted(df["intent"].unique())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

# Attach the integer class id as a "label" column, then wrap the frame as a
# Hugging Face Dataset.
df["label"] = df["intent"].map(label2id)
dataset = Dataset.from_pandas(df)
|
|
# Tokenizer for the same checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch of examples.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping of column name to a list of values; only
            ``batch["text"]`` is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding here: padding inside a batched map pads every example to the
    # longest one in that map batch and stores the pad tokens in the dataset.
    # The Trainer below is given the tokenizer, so its default collator pads
    # each training batch dynamically instead.
    return tokenizer(batch["text"], truncation=True)
|
|
# Tokenize the whole dataset in batches.
dataset = dataset.map(tokenize, batched=True)

# Hold out 20% of the rows for evaluation. The seed is fixed so the same rows
# land in the hold-out set on every run, keeping eval metrics comparable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
|
|
# Sequence-classification model with one output class per intent; both label
# maps are passed in so they are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    id2label=id2label,
    label2id=label2id,
    num_labels=len(labels),
)
|
|
# Training configuration: 6 epochs, batches of 8 per device, evaluation and a
# checkpoint at the end of every epoch, and a log entry every 10 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the version this project pins.
args = TrainingArguments(
    output_dir="./model",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
)
|
|
# Wire the model, the training configuration and both dataset splits into a
# Trainer. The "test" split produced by train_test_split is used for
# evaluation.
trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
|
|
# Run fine-tuning.
trainer.train()

# Write the final model and the tokenizer to the same directory so both can be
# reloaded from "./model".
trainer.save_model("./model")
tokenizer.save_pretrained("./model")
|
|