Spaces:

Sigdev
/

comment_validator

Paused

App Files Files Community

aurelien committed on Oct 30, 2025

Commit

cb1109f

1 Parent(s): 135616e

1st commit

Browse files

Files changed (6) hide show

.gitignore +3 -0
app.py +56 -0
models/classifier.joblib +3 -0
models/encoder.joblib +3 -0
requirements.txt +40 -0
validate_comment_sentiment_tags.py +145 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__
+data
+venv

app.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List
+import joblib
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import torch
+from validate_comment_sentiment_tags import analyze_comment  # ton code ci-dessus, tu peux aussi le copier ici
+app = FastAPI(title="Comment Validator API")
+# =====================================
+# 🔹 Chargement des modèles
+# =====================================
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"🧠 Using device: {device}")
+text_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device=device)
+clf = joblib.load("models/classifier.joblib")
+encoder = joblib.load("models/encoder.joblib")
+sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=-1)
+toxicity_analyzer = pipeline("text-classification", model="unitary/toxic-bert", return_all_scores=True, device=-1)
+# =====================================
+# 🔸 Request/response models
+# =====================================
+# Request body for /predict: one comment plus the two categorical fields
+# that are forwarded, unchanged, to analyze_comment.
+class CommentRequest(BaseModel):
+    comment: str  # raw comment text to analyze
+    category: str  # category value passed to analyze_comment
+    country: str  # country value passed to analyze_comment
+# Request body for /batch_predict: a list of CommentRequest entries that are
+# analyzed one after another, in the order given.
+class BatchRequest(BaseModel):
+    items: List[CommentRequest]  # comments to analyze
+# =====================================
+# 🔹 Routes
+# =====================================
+@app.post("/predict")
+def predict(item: CommentRequest):
+    """Analyse un seul commentaire"""
+    result = analyze_comment(item.comment, item.category, item.country)
+    return result
+@app.post("/batch_predict")
+def batch_predict(request: BatchRequest):
+    """Analyse plusieurs commentaires à la fois"""
+    results = []
+    for item in request.items:
+        results.append(analyze_comment(item.comment, item.category, item.country))
+    return {"results": results}

models/classifier.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83787fadf944c3d5d149a3b7dc251e4ffa934ead146080c6b7ed62f732f2a8a2
+size 490140969

models/encoder.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0fb07d0e6818f85b30c63b9f33363b36bc77a7ef410e4a4007716dd552d9e283
+size 2002

requirements.txt ADDED Viewed

	@@ -0,0 +1,40 @@

+certifi==2025.10.5
+charset-normalizer==3.4.4
+filelock==3.20.0
+fsspec==2025.9.0
+hf-xet==1.2.0
+huggingface-hub==0.36.0
+idna==3.11
+Jinja2==3.1.6
+joblib==1.5.2
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.5
+numpy==1.26.4
+packaging==25.0
+pandas==2.3.3
+pillow==12.0.0
+protobuf==6.33.0
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.3
+regex==2025.10.23
+requests==2.32.5
+safetensors==0.6.2
+scikit-learn==1.7.2
+scipy==1.16.2
+sentence-transformers==5.1.2
+sentencepiece==0.2.1
+six==1.17.0
+sympy==1.14.0
+threadpoolctl==3.6.0
+tiktoken==0.12.0
+tokenizers==0.22.1
+torch==2.2.2
+torchaudio==2.2.2
+torchvision==0.17.2
+tqdm==4.67.1
+transformers==4.57.1
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0

validate_comment_sentiment_tags.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import pandas as pd
+import joblib
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import torch
+from tqdm import tqdm
+import os
+# =====================================
+# 🔹 Initialization
+# =====================================
+# Everything below runs at import time: importing this module loads all models.
+# Use the "mps" backend when torch reports it as available, otherwise the CPU.
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"🧠 Using device: {device}")
+# Embedding model used to turn the comment text into a feature vector
+text_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device=device)
+# Classification model & categorical encoder (paths are relative to the
+# current working directory)
+clf = joblib.load("models/classifier.joblib")
+encoder = joblib.load("models/encoder.joblib")
+# Sentiment model
+# NOTE(review): device=-1 presumably pins this pipeline to the CPU regardless
+# of `device` above — confirm against the transformers pipeline docs.
+sentiment_analyzer = pipeline(
+    "sentiment-analysis",
+    model="nlptown/bert-base-multilingual-uncased-sentiment",
+    device=-1
+)
+# Toxicity model; return_all_scores=True asks for a score for every label,
+# which analyze_comment turns into one "tag_<label>" entry per label.
+# NOTE(review): return_all_scores looks deprecated in recent transformers
+# releases in favour of top_k=None — confirm before upgrading.
+toxicity_analyzer = pipeline(
+    "text-classification",
+    model="unitary/toxic-bert",
+    return_all_scores=True,
+    device=-1
+)
+# =====================================
+# 🔸 Single-comment analysis function
+# =====================================
+def analyze_comment(comment: str, category: str, country: str) -> dict:
+    reasons = []
+    # --- Analyse du sentiment ---
+    try:
+        sentiment = sentiment_analyzer(comment[:512])[0]
+        label = sentiment["label"]
+        score = sentiment["score"]
+    except Exception:
+        label, score = "unknown", 0.0
+    if "1" in label or "2" in label:
+        sentiment_score = -1
+        reasons.append("Le ton semble négatif ou insatisfait.")
+    elif "4" in label or "5" in label:
+        sentiment_score = 1
+    else:
+        sentiment_score = 0
+    # --- Encodage du texte ---
+    X_text = text_model.encode([comment])
+    # --- Encodage catégorie/pays ---
+    df_cat = pd.DataFrame([[category, country]], columns=["category", "country"])
+    try:
+        X_cat = encoder.transform(df_cat)
+    except ValueError:
+        reasons.append(f"Catégorie ou pays inconnus : {category}, {country}")
+        n_features = sum(len(cats) for cats in encoder.categories_)
+        X_cat = np.zeros((1, n_features))
+    # --- Concaténation ---
+    X = np.concatenate([X_text, X_cat], axis=1)
+    # --- Prédiction validité ---
+    proba = clf.predict_proba(X)[0][1]
+    prediction = proba >= 0.5
+    if len(comment.split()) < 3:
+        reasons.append("Le commentaire est trop court.")
+    if sentiment_score < 0:
+        reasons.append("Le ton global est négatif.")
+    if proba < 0.4:
+        reasons.append("Le modèle estime une faible probabilité de validité.")
+    # --- Analyse toxicité ---
+    try:
+        tox_scores = toxicity_analyzer(comment[:512])[0]  # tronquer pour sécurité
+        tags = {f"tag_{item['label']}": round(item['score'], 3) for item in tox_scores}
+    except Exception:
+        tags = {f"tag_{label}": 0.0 for label in ["toxicity","severe_toxicity","obscene","identity_attack","insult","threat"]}
+    # --- Résultat final ---
+    result = {
+        "is_valid": bool(prediction),
+        "confidence": round(float(proba), 3),
+        "sentiment": label,
+        "sentiment_score": round(float(score), 3),
+        "reasons": "; ".join(reasons) if reasons else "Aucune anomalie détectée."
+    }
+    result.update(tags)
+    return result
+# =====================================
+# 🔹 Batch prediction over a CSV file
+# =====================================
+def batch_predict(input_csv: str, output_csv: str):
+    if not os.path.exists(input_csv):
+        raise FileNotFoundError(f"Le fichier '{input_csv}' est introuvable.")
+    print(f"🔹 Lecture du fichier : {input_csv}")
+    df = pd.read_csv(input_csv)
+    required_cols = {"comment", "category", "country"}
+    if not required_cols.issubset(df.columns):
+        raise ValueError(f"Le fichier CSV doit contenir les colonnes : {required_cols}")
+    print(f"🚀 Analyse de {len(df)} commentaires...")
+    results = []
+    for _, row in tqdm(df.iterrows(), total=len(df)):
+        analysis = analyze_comment(
+            str(row["comment"]),
+            str(row["category"]),
+            str(row["country"])
+        )
+        results.append(analysis)
+    df_results = pd.concat([df, pd.DataFrame(results)], axis=1)
+    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
+    df_results.to_csv(output_csv, index=False)
+    print(f"✅ Résultats sauvegardés dans : {output_csv}")
+# =====================================
+# 🔸 Example run
+# =====================================
+if __name__ == "__main__":
+    INPUT_FILE = "data/new_comments.csv"
+    OUTPUT_FILE = "data/predictions_with_tags.csv"
+    batch_predict(INPUT_FILE, OUTPUT_FILE)