import json

import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel


def load_data(file_path):
    """Load the raw JSON dataset, keeping only records that have both a
    non-empty description and at least one category weight."""
    with open(file_path, 'r') as f:
        dataset = json.load(f)
    outdata = [
        {
            "did": e["user_id"],
            "description": e["description"],
            "label_weights": e["user_categories"],
        }
        for e in dataset
        if e["description"] and e["user_categories"]
    ]
    return outdata


def prepare_labels(outdata):
    """Build the label vocabulary and a dense (n_records, n_labels) weight
    matrix from each record's label_weights mapping."""
    all_labels = sorted({label for record in outdata for label in record['label_weights']})
    label2id = {label: i for i, label in enumerate(all_labels)}
    id2label = {i: label for label, i in label2id.items()}

    # Rows are records, columns are labels; labels absent from a record keep weight 0.0.
    y_matrix = np.zeros((len(outdata), len(all_labels)), dtype=float)
    for idx, record in enumerate(outdata):
        for label, weight in record['label_weights'].items():
            y_matrix[idx, label2id[label]] = weight
    return y_matrix, label2id, id2label


class EmbeddingGenerator:
    """Turns description strings into fixed-size DistilBERT [CLS] embeddings."""

    def __init__(self, model_name='distilbert-base-uncased', device=None):
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.embedding_model = DistilBertModel.from_pretrained(model_name)
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embedding_model.to(self.device)
        self.embedding_model.eval()  # inference only: make eval mode explicit

    def generate_embeddings(self, descriptions, batch_size=1000):
        all_embeddings = []
        descriptions = list(descriptions)  # materialize in case an iterator was passed
        for i in range(0, len(descriptions), batch_size):
            batch_descriptions = descriptions[i:i + batch_size]
            inputs = self.tokenizer(
                batch_descriptions,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt",
            ).to(self.device)
            with torch.no_grad():
                outputs = self.embedding_model(**inputs)
            # Hidden state of the first ([CLS]) token serves as the sentence embedding.
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(batch_embeddings)
        return np.vstack(all_embeddings)
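

# --- Example usage: a minimal sketch, not part of the pipeline above. ---
# Assumes a JSON file at "users.json" whose records carry "user_id",
# "description", and "user_categories" fields, as load_data() expects;
# the file path and batch_size here are illustrative placeholders.
if __name__ == "__main__":
    records = load_data("users.json")
    y_matrix, label2id, id2label = prepare_labels(records)

    generator = EmbeddingGenerator()
    embeddings = generator.generate_embeddings(
        [r["description"] for r in records], batch_size=32
    )
    # DistilBERT yields 768-dimensional [CLS] embeddings, so the shapes are
    # (n_records, 768) and (n_records, n_labels).
    print(embeddings.shape, y_matrix.shape)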