Spaces:

specimba
/

nexus-os-lab

Running

App Files Community

specimba committed about 13 hours ago

Commit

679aa22

verified ·

1 Parent(s): a8cc3e1

Remove guards.py: was importing torch at module-level causing builder side-effects

Browse files

Files changed (1) hide show

guards.py +0 -132

guards.py DELETED Viewed

@@ -1,132 +0,0 @@
-"""
-NEXUS LAB - 4 Specialist Guard System
-======================================
-4 x BERT-base classifiers (< 110MB each, ~440MB total)
-Ensemble: Gemma-4B takes 4 specialist scores + raw input for final verdict
-Domains:
-  1. MCP/Tool Contamination - tool misuse, parameter injection, supply chain
-  2. Multi-Agent Collusion - identity spoofing, coordinated attacks
-  3. Content Safety - hate, sexual, violence, self-harm, CSAM
-  4. Jailbreak/Prompt Engineering - DAN, role-play, ignore-previous
-Training: LoRA on BERT-base (66MB per adapter, 4 = 264MB)
-VRAM fit: 440MB (FP16 BERT) + 264MB (LoRA) + 3.1GB (Gemma Q4) = <4GB total
-"""
-import os, json, torch
-from typing import Dict, List, Optional
-from dataclasses import dataclass
-# Label vocabulary for each guard domain. The dict keys are the domain names
-# GuardEnsemble iterates over to build one SpecialistGuard per domain.
-# SpecialistGuard uses len(list) as num_labels and maps class index i of the
-# model output to entry i of the list, so the order of each list matters.
-# NOTE(review): the order must match whatever order the adapters were trained
-# with; the training setup is not visible here, so confirm before reordering.
-GUARD_LABELS = {
-    "mcp":     ["safe", "tool_param_injection", "supply_chain_attack", "mcp_surface_exploit"],
-    "collusion": ["safe", "identity_spoof", "governance_bypass", "coordinated_attack"],
-    "content":   ["safe", "hate_speech", "sexual_content", "violence", "self_harm", "csam"],
-    "jailbreak": ["safe", "dan", "roleplay_bypass", "ignore_previous", "encoding_attack", "jailbreak_pattern"],
-}
-@dataclass
-class GuardResult:
-    """Outcome of one specialist guard classifying a single text.
-    The field comments describe the values as SpecialistGuard.predict fills them.
-    """
-    domain: str  # guard domain that produced the result (a GUARD_LABELS key)
-    label: str  # label with the highest probability
-    confidence: float  # probability of `label`, rounded to 4 decimals
-    scores: Dict[str, float]  # every label of the domain -> rounded probability
-class SpecialistGuard:
-    """Single BERT-base specialist with an optional LoRA adapter.
-    One instance classifies text against the label list of one GUARD_LABELS
-    domain. No weights are loaded at construction; the first predict() call
-    triggers load() with its default device.
-    Attributes:
-        domain: Key into GUARD_LABELS.
-        labels: Label names for the domain; index i is class i of the model.
-        model: Loaded classifier, or None until load() has run.
-        tokenizer: Matching tokenizer, or None until load() has run.
-        adapter_loaded: True once the LoRA adapter has been applied; False
-            while unloaded or when running on the bare base model.
-    """
-    BASE_MODEL = "google-bert/bert-base-uncased"
-    ADAPTER_PATH = "specimba/nexus-guard-{}"  # e.g. specimba/nexus-guard-mcp
-    def __init__(self, domain: str):
-        """Bind the guard to ``domain``.
-        Raises:
-            KeyError: If ``domain`` is not a key of GUARD_LABELS.
-        """
-        self.domain = domain
-        self.labels = GUARD_LABELS[domain]
-        self.model = None
-        self.tokenizer = None
-        self.adapter_loaded = False
-        from transformers import AutoModelForSequenceClassification, AutoTokenizer
-        self._cls = AutoModelForSequenceClassification
-        self._tok = AutoTokenizer
-    def load(self, device: str = "cpu"):
-        """Load the tokenizer, the base model and, if available, the LoRA adapter.
-        Args:
-            device: Device the model is moved to.
-        Returns:
-            self, so the call can be chained.
-        """
-        import logging
-        adapter_id = self.ADAPTER_PATH.format(self.domain)
-        self.tokenizer = self._tok.from_pretrained(self.BASE_MODEL)
-        base = self._cls.from_pretrained(self.BASE_MODEL, num_labels=len(self.labels))
-        try:
-            from peft import PeftModel
-            model = PeftModel.from_pretrained(base, adapter_id, adapter_name=self.domain)
-            self.adapter_loaded = True
-        except Exception as exc:
-            # Best-effort fallback is kept, but it is no longer silent: without
-            # the adapter the classification head of the base checkpoint is
-            # untrained, so the scores this guard returns carry no signal.
-            logging.getLogger(__name__).warning(
-                "LoRA adapter %s unavailable for guard %r (%s); falling back to "
-                "the base model, whose predictions are not meaningful",
-                adapter_id, self.domain, exc,
-            )
-            model = base
-            self.adapter_loaded = False
-        self.model = model.to(device)
-        return self
-    def predict(self, text: str) -> GuardResult:
-        """Classify ``text`` and return the top label plus per-label scores.
-        Loads the model with load()'s defaults if it has not been loaded yet.
-        Input is truncated to 512 tokens by the tokenizer call.
-        Args:
-            text: Text to classify.
-        Returns:
-            GuardResult with the most probable label, its probability and the
-            probability of every label, all rounded to 4 decimals.
-        """
-        # Function-scope import so the class does not rely on torch being
-        # imported at module level.
-        import torch
-        if self.model is None:
-            self.load()
-        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.model.device)
-        with torch.no_grad():
-            logits = self.model(**inputs).logits
-            probs = torch.softmax(logits, dim=-1)[0]
-        top_idx = int(torch.argmax(probs).item())
-        values = probs.tolist()
-        scores = {label: round(value, 4) for label, value in zip(self.labels, values)}
-        return GuardResult(
-            domain=self.domain,
-            label=self.labels[top_idx],
-            confidence=round(values[top_idx], 4),
-            scores=scores,
-        )
-class GuardEnsemble:
-    """4 Specialist Guards + Gemma-4B ensemble for final verdict.
-    Every GUARD_LABELS domain gets one SpecialistGuard. evaluate() runs all of
-    them and either short-circuits to "SAFE" or asks the Gemma judge model.
-    The judge model and tokenizer are loaded on first use and then kept on the
-    instance, so they stay in memory for the lifetime of the ensemble.
-    """
-    GEMMA_MODEL = "google/gemma-3-4b-it"
-    # Lazily populated by _gemma_judge; class-level defaults keep the lookup
-    # safe on instances that were created without running __init__.
-    _judge_tok = None
-    _judge_model = None
-    def __init__(self, device_map: str = "auto"):
-        """Create one guard per domain.
-        Args:
-            device_map: Forwarded as ``device_map`` when the judge model is loaded.
-        """
-        self.device_map = device_map
-        self.guards = {d: SpecialistGuard(d) for d in GUARD_LABELS}
-    def evaluate(self, text: str, fast_path: bool = True) -> Dict:
-        """Run every guard on ``text`` and return a verdict.
-        Fast path: if ``fast_path`` is true and every guard says "safe" with
-        confidence above 0.95, the judge model is skipped.
-        Args:
-            text: Text to evaluate.
-            fast_path: Allow the short-circuit described above.
-        Returns:
-            Dict with "verdict" ("SAFE" on the fast path, otherwise the judge's
-            text or an "ERROR: ..." string), "path" ("fast" or "ensemble") and
-            "guards" (domain -> guard result as a plain dict).
-        """
-        results = {d: g.predict(text) for d, g in self.guards.items()}
-        guard_dicts = {d: asdict(r) for d, r in results.items()}
-        all_safe = all(r.label == "safe" and r.confidence > 0.95 for r in results.values())
-        if fast_path and all_safe:
-            return {"verdict": "SAFE", "path": "fast", "guards": guard_dicts}
-        # Slow path: Ensemble judgment
-        prompt = self._build_ensemble_prompt(text, results)
-        verdict = self._gemma_judge(prompt)
-        return {"verdict": verdict, "path": "ensemble", "guards": guard_dicts}
-    def _build_ensemble_prompt(self, text: str, results: Dict) -> str:
-        """Format the judge prompt from the text and each guard's top label.
-        Only the first 500 characters of ``text`` are included in the prompt.
-        """
-        scores = [f"{d}: {r.label}({r.confidence:.2%})" for d, r in results.items()]
-        return (
-            f"Text to evaluate: {text[:500]}\n"
-            f"Specialist scores: {', '.join(scores)}\n"
-            "Based on these specialist evaluations, provide a final verdict:\n"
-            "VERDICT: [SAFE | UNSAFE - then explain why in one sentence]"
-        )
-    def _gemma_judge(self, prompt: str) -> str:
-        """Generate the judge's verdict text for ``prompt``.
-        Returns:
-            The newly generated text, stripped, or "ERROR: <message>" (message
-            capped at 200 characters) if loading or generation fails.
-        """
-        try:
-            if self._judge_model is None:
-                from transformers import AutoModelForCausalLM, AutoTokenizer
-                tok = AutoTokenizer.from_pretrained(self.GEMMA_MODEL)
-                model = AutoModelForCausalLM.from_pretrained(self.GEMMA_MODEL, device_map=self.device_map)
-                # Cache only after both loads succeeded.
-                self._judge_tok, self._judge_model = tok, model
-            tok, model = self._judge_tok, self._judge_model
-            inputs = tok(prompt, return_tensors="pt").to(model.device)
-            out = model.generate(**inputs, max_new_tokens=100, do_sample=False)
-            # The output sequence starts with the prompt tokens; drop them by
-            # token count rather than by character count of the decoded text,
-            # which need not reproduce the prompt exactly.
-            prompt_len = inputs["input_ids"].shape[-1]
-            return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
-        except Exception as e:
-            return f"ERROR: {str(e)[:200]}"
-def asdict(result: GuardResult):
-    """Return the four GuardResult fields as a plain dict (values are not copied)."""
-    field_names = ("domain", "label", "confidence", "scores")
-    return {name: getattr(result, name) for name in field_names}
-# Placeholder per-domain responses for when the guard models aren't downloaded.
-# NOTE(review): the stub label is "safe"; consumers should treat confidence 0.0
-# and the "note" field as "no verdict" rather than as a pass — confirm at the
-# call sites, which are not visible here.
-NO_MODEL_RESPONSE = {
-    domain: {"label": "safe", "confidence": 0.0, "note": "model not loaded"}
-    for domain in ("mcp", "collusion", "content", "jailbreak")
-}