File size: 4,210 Bytes
cb1109f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
845c5fd
 
 
 
 
 
cb1109f
 
845c5fd
cb1109f
845c5fd
cb1109f
845c5fd
cb1109f
845c5fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cb1109f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
import joblib
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import torch

app = FastAPI(title="Comment Validator API")

# =====================================
# 🔹 Model loading (runs once at import time)
# =====================================

# Pick the best available torch device: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"  # for a local Mac (Apple Silicon)
else:
    device = "cpu"
print(f"🧠 Using device: {device}")

# Multilingual sentence-embedding model used to vectorize the comment text.
print("Loading model embedding")
text_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device=device)
# Validity classifier, persisted with joblib (presumably scikit-learn — confirm against training code).
print("Loading model classifier")
clf = joblib.load("models/classifier.joblib")
# Categorical encoder for (category, country); analyze_comment reads its
# `categories_` attribute, so it is presumably a fitted OneHotEncoder — verify.
print("Loading model encoder")
encoder = joblib.load("models/encoder.joblib")
# 1–5 star multilingual sentiment model; labels look like "1 star" … "5 stars".
print("Loading model sentiment-analysis")
sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=device)
# Multi-label toxicity model; return_all_scores=True yields one score per label.
print("Loading model toxicity")
toxicity_analyzer = pipeline("text-classification", model="unitary/toxic-bert", return_all_scores=True, device=device)

def analyze_comment(comment: str, category: str, country: str) -> dict:
    """Run the full analysis pipeline on a single comment.

    Combines a star-rating sentiment model, a text + categorical validity
    classifier, and a multi-label toxicity model into one flat result dict.

    Args:
        comment: Raw comment text (truncated to 512 chars for the HF pipelines).
        category: Category value fed to the fitted categorical encoder.
        country: Country value fed to the fitted categorical encoder.

    Returns:
        dict with keys ``is_valid``, ``confidence``, ``sentiment``,
        ``sentiment_score``, ``reasons``, plus one ``tag_<label>`` key per
        toxicity label emitted by the toxicity model.
    """
    reasons = []

    # --- Sentiment analysis ---
    # Best-effort: a pipeline failure must not break the request, so we fall
    # back to a neutral "unknown" label instead of propagating the error.
    try:
        sentiment = sentiment_analyzer(comment[:512])[0]
        label = sentiment["label"]
        score = sentiment["score"]
    except Exception:
        label, score = "unknown", 0.0

    # nlptown labels are "1 star" … "5 stars": 1-2 → negative, 4-5 → positive.
    if "1" in label or "2" in label:
        sentiment_score = -1
        reasons.append("Le ton semble négatif ou insatisfait.")
    elif "4" in label or "5" in label:
        sentiment_score = 1
    else:
        sentiment_score = 0

    # --- Text embedding ---
    X_text = text_model.encode([comment])

    # --- Category/country encoding ---
    df_cat = pd.DataFrame([[category, country]], columns=["category", "country"])
    try:
        X_cat = encoder.transform(df_cat)
    except ValueError:
        # Unseen category/country: substitute an all-zeros vector of the same
        # width so the feature concatenation below still lines up.
        reasons.append(f"Catégorie ou pays inconnus : {category}, {country}")
        n_features = sum(len(cats) for cats in encoder.categories_)
        X_cat = np.zeros((1, n_features))

    # --- Feature concatenation ---
    X = np.concatenate([X_text, X_cat], axis=1)

    # --- Validity prediction (probability of the positive class) ---
    proba = clf.predict_proba(X)[0][1]
    prediction = proba >= 0.5

    if len(comment.split()) < 3:
        reasons.append("Le commentaire est trop court.")
    if sentiment_score < 0:
        reasons.append("Le ton global est négatif.")
    if proba < 0.4:
        reasons.append("Le modèle estime une faible probabilité de validité.")

    # --- Toxicity analysis ---
    try:
        tox_scores = toxicity_analyzer(comment[:512])[0]  # truncate for safety
        tags = {f"tag_{item['label']}": round(item['score'], 3) for item in tox_scores}
    except Exception:
        # Fix: the fallback previously used label names ("toxicity",
        # "severe_toxicity", "identity_attack") that do not match the labels
        # unitary/toxic-bert actually emits, so the response schema differed
        # between the success and failure paths. Use the model's real labels.
        tox_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        tags = {f"tag_{tox_label}": 0.0 for tox_label in tox_labels}

    # --- Final result ---
    result = {
        "is_valid": bool(prediction),
        "confidence": round(float(proba), 3),
        "sentiment": label,
        "sentiment_score": round(float(score), 3),
        "reasons": "; ".join(reasons) if reasons else "Aucune anomalie détectée."
    }

    result.update(tags)
    return result


# =====================================
# 🔸 Request/response models
# =====================================

class CommentRequest(BaseModel):
    """Request body for analyzing a single comment."""

    comment: str   # raw comment text
    category: str  # category value; should be one known to the fitted encoder
    country: str   # country value; should be one known to the fitted encoder

class BatchRequest(BaseModel):
    """Request body for batch analysis: a list of single-comment requests."""

    items: List[CommentRequest]

# =====================================
# 🔹 Routes
# =====================================

@app.post("/predict")
def predict(item: CommentRequest):
    """Analyze a single comment and return its full analysis result."""
    return analyze_comment(item.comment, item.category, item.country)


@app.post("/batch_predict")
def batch_predict(request: BatchRequest):
    """Analyze several comments at once; results keep the input order."""
    analyses = [
        analyze_comment(entry.comment, entry.category, entry.country)
        for entry in request.items
    ]
    return {"results": analyses}