import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
class GraphCodeBERT:
    """Wrap a small CodeBERT-family encoder that turns source code into dense vectors.

    NOTE: despite the class name, this loads ``huggingface/CodeBERTa-small-v1``
    (not the full GraphCodeBERT checkpoint) so the service fits in a 512MB
    container, and dynamically quantizes its Linear layers to int8 to shrink
    the footprint further.
    """

    def __init__(self):
        import gc  # local import: only needed once, right after quantization
        # Using "Small" model to fit in Render Free Tier (512MB RAM)
        self.model_name = "huggingface/CodeBERTa-small-v1"
        print(f"Loading Analyzer Model: {self.model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Load the full-precision model first; it is discarded after quantization.
        model_fp32 = AutoModel.from_pretrained(self.model_name)
        # Remember the encoder's hidden size so fallback zero-vectors always
        # match the true embedding dimension (768 for CodeBERTa-small-v1),
        # instead of hard-coding 768 in several places.
        self.hidden_size = model_fp32.config.hidden_size
        # QUANTIZATION MAGIC: Compress model to int8 (4x smaller RAM)
        print("📉 Quantizing model to reduce memory usage for Render...")
        # FIX: Explicitly set engine for ARM64/Mac/Container compatibility
        torch.backends.quantized.engine = 'qnnpack'
        self.model = torch.quantization.quantize_dynamic(
            model_fp32, {torch.nn.Linear}, dtype=torch.qint8
        )
        # Drop the fp32 weights immediately to keep peak RSS low.
        del model_fp32
        gc.collect()
        self.model.eval()  # inference only: disables dropout etc.

    def get_embedding(self, code_snippet):
        """Convert a string of code into a dense vector (embedding).

        Args:
            code_snippet: Source code text. Empty or non-string input yields
                a zero vector rather than raising.

        Returns:
            A 1-D numpy array of length ``self.hidden_size`` (mean-pooled
            token embeddings), or a zero vector of that length on failure.
        """
        # Degrade to a zero vector for invalid input so callers never crash.
        if not code_snippet or not isinstance(code_snippet, str):
            return np.zeros(self.hidden_size)
        try:
            inputs = self.tokenizer(
                code_snippet,
                return_tensors="pt",
                truncation=True,  # model context window is limited
                max_length=512,
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean pooling over the sequence dimension captures the overall
            # semantic meaning of the snippet in a single vector.
            return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        except Exception as e:
            # Best-effort: log and return a zero vector rather than
            # propagating tokenizer/model errors to the caller.
            print(f"Error generating embedding: {e}")
            return np.zeros(self.hidden_size)
# --- EXPORTED FUNCTIONS (To fix the ImportError) ---
# 1. Shared model instance, created lazily.  Building it eagerly here would
#    download + quantize an entire transformer at import time; deferring to
#    first use keeps `import` cheap for processes that never embed anything.
_bert_instance = None


def _get_bert_instance():
    """Return the shared GraphCodeBERT instance, creating it on first call."""
    global _bert_instance
    if _bert_instance is None:
        _bert_instance = GraphCodeBERT()
    return _bert_instance


# 2. Expose the function so scorer.py can import it
def get_embedding(code_snippet):
    """Module-level wrapper around GraphCodeBERT.get_embedding."""
    return _get_bert_instance().get_embedding(code_snippet)
# 3. Expose the similarity function
def compute_similarity(embedding1, embedding2):
    """Calculate the cosine similarity between two embeddings.

    Implemented directly with numpy (a dot product over the norms) instead of
    sklearn's ``cosine_similarity`` — the heavyweight sklearn call is overkill
    for a single vector pair.

    Args:
        embedding1, embedding2: 1-D array-likes of equal length, or None.

    Returns:
        A float in [-1.0, 1.0]; 0.0 when either input is None or has zero
        norm (e.g. the zero-vector fallback produced by ``get_embedding``),
        matching sklearn's zero-norm convention.
    """
    if embedding1 is None or embedding2 is None:
        return 0.0
    # Flatten to 1-D float vectors regardless of input shape.
    e1 = np.asarray(embedding1, dtype=float).ravel()
    e2 = np.asarray(embedding2, dtype=float).ravel()
    n1 = np.linalg.norm(e1)
    n2 = np.linalg.norm(e2)
    # Guard against division by zero: sklearn treats zero-norm vectors as
    # having similarity 0, and we preserve that behavior here.
    if n1 == 0.0 or n2 == 0.0:
        return 0.0
    return float(np.dot(e1, e2) / (n1 * n2))