Upload DFC CrossCoder model

Browse files

Files changed (10) hide show

README.md +120 -0
app.py +231 -0
config.json +7 -0
demo.py +100 -0
dfc_crosscoder.py +201 -0
inference_config.json +13 -0
minimal_demo.py +439 -0
model.pt +3 -0
requirements.txt +4 -0
space_requirements.txt +6 -0

README.md ADDED Viewed

	@@ -0,0 +1,120 @@

+---
+language:
+- en
+license: mit
+library_name: pytorch
+tags:
+- crosscoder
+- sparse-autoencoder
+- interpretability
+- feature-extraction
+- pytorch
+datasets:
+- fineweb
+- toolrl
+metrics:
+- reconstruction_loss
+- sparsity
+base_model:
+- chengq9/ToolRL-Qwen2.5-3B
+- Qwen/Qwen2.5-3B
+pipeline_tag: feature-extraction
+---
+# DFC CrossCoder (antebe1/dfc-crosscoder-qwen-ToolRL)
+A Dedicated Feature CrossCoder (DFC) trained to extract sparse, interpretable features from the activations of two related language models:
+- **Model A (ToolRL)**: chengq9/ToolRL-Qwen2.5-3B
+- **Model B (Base)**: Qwen/Qwen2.5-3B
+The DFC learns to identify features that are:
+- **A-exclusive**: Only active for the ToolRL model
+- **B-exclusive**: Only active for the base model
+- **Shared**: Active for both models
+## Model Details
+### Architecture
+- **Dictionary Size**: 16,384 features
+- **Top-K**: 90 active features per example
+- **Layer**: 13 (of transformer)
+- **Activation Dimension**: 2048
+### Feature Partitions
+- **A-exclusive features**: 819 (5.0%)
+- **B-exclusive features**: 819 (5.0%)
+- **Shared features**: 14746 (90.0%)
+### Training Details
+- **Training Steps**: 20,000
+- **Learning Rate**: 0.0001
+- **Batch Size**: 64
+- **Sparsity Coefficient**: 0
+- **Exclusive Sparsity Coefficient**: 0.001
+## Usage
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from dfc_crosscoder import DFCCrossCoder, extract_activations
+# Load the model
+dfc = DFCCrossCoder.from_pretrained("antebe1/dfc-crosscoder-qwen-ToolRL")
+# Load base models (you need both original models)
+model_a = AutoModelForCausalLM.from_pretrained("chengq9/ToolRL-Qwen2.5-3B")
+model_b = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
+# Extract and encode features
+text = "Your input text here"
+activations = extract_activations(model_a, model_b, tokenizer, [text], layer=13)
+features = dfc.encode(activations)
+# Analyze features
+active_features = (features > 0).nonzero(as_tuple=True)[1]
+print(f"Active features: {active_features.tolist()}")
+# Decode back to activations
+reconstructed = dfc.decode(features)
+```
+## Model Files
+- `model.pt` - PyTorch model weights
+- `config.json` - Model configuration
+- `dfc_crosscoder.py` - Model implementation
+- `demo.py` - Minimal usage demo
+- `requirements.txt` - Dependencies
+## Intended Use
+This model is designed for:
+- **Interpretability research**: Understanding differences between fine-tuned and base models
+- **Feature analysis**: Identifying model-specific vs shared computational patterns
+- **Steering experiments**: Modifying model behavior through feature manipulation
+- **Mechanistic interpretability**: Studying how fine-tuning affects internal representations
+## Limitations
+- Trained on specific model pair (chengq9/ToolRL-Qwen2.5-3B / Qwen/Qwen2.5-3B)
+- Features are extracted from layer 13 only
+- Requires both original models for activation extraction
+- Performance depends on quality of training data and hyperparameters
+## Citation
+```bibtex
+@misc{dfc_crosscoder_antebe1_dfc_crosscoder_qwen_ToolRL,
+  title={DFC CrossCoder: Sparse Feature Extraction for Model Comparison},
+  author={Your Name Here},
+  year={2026},
+  url={https://huggingface.co/antebe1/dfc-crosscoder-qwen-ToolRL}
+}
+```
+## License
+MIT License - see LICENSE file for details.

app.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+app.py — Hugging Face Space demo for DFC CrossCoder.
+This file creates a Gradio demo that can be deployed to Hugging Face Spaces.
+Upload this along with the model files to create a working demo.
+"""
+import gradio as gr
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import json
+# Simplified DFC for Space demo
+class DFCCrossCoder(torch.nn.Module):
+    def __init__(self, activation_dim: int, dict_size: int, k: int, n_a: int, n_b: int):
+        super().__init__()
+        self.activation_dim = activation_dim
+        self.dict_size = dict_size
+        self.k = k
+        self.n_a = n_a
+        self.n_b = n_b
+        self.n_shared = dict_size - n_a - n_b
+        self.a_end = n_a
+        self.b_end = n_a + n_b
+        self.W_enc = torch.nn.Parameter(torch.zeros(2, activation_dim, dict_size))
+        self.b_enc = torch.nn.Parameter(torch.zeros(dict_size))
+        self.W_dec = torch.nn.Parameter(torch.zeros(dict_size, 2, activation_dim))
+        self.b_dec = torch.nn.Parameter(torch.zeros(2, activation_dim))
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        pre = torch.einsum("bmd,mdf->bf", x, self.W_enc) + self.b_enc
+        pre = F.relu(pre)
+        topk_vals, topk_idx = torch.topk(pre, self.k, dim=-1)
+        features = torch.zeros_like(pre)
+        features.scatter_(-1, topk_idx, topk_vals)
+        return features
+    def decode(self, features: torch.Tensor) -> torch.Tensor:
+        return torch.einsum("bf,fmd->bmd", features, self.W_dec) + self.b_dec
+    @classmethod
+    def from_pretrained(cls, model_path: str = ".", device: str = "cpu"):
+        # Load config
+        with open(f"{model_path}/config.json") as f:
+            config = json.load(f)
+        model = cls(
+            activation_dim=config["activation_dim"],
+            dict_size=config["dict_size"],
+            k=config["k"],
+            n_a=config.get("n_a", int(config["dict_size"] * 0.05)),
+            n_b=config.get("n_b", int(config["dict_size"] * 0.05))
+        )
+        state_dict = torch.load(f"{model_path}/model.pt", map_location=device, weights_only=True)
+        model.load_state_dict(state_dict)
+        return model.to(device)
+# Global variables for models (loaded once)
+dfc_model = None
+model_a = None
+model_b = None
+tokenizer = None
+def load_models():
+    """Load all models once at startup."""
+    global dfc_model, model_a, model_b, tokenizer
+    if dfc_model is None:
+        print("Loading models...")
+        # Load DFC
+        dfc_model = DFCCrossCoder.from_pretrained(".", device="cpu")
+        dfc_model.eval()
+        # Load language models with reduced precision for space
+        model_a = AutoModelForCausalLM.from_pretrained(
+            "chengq9/ToolRL-Qwen2.5-3B",
+            torch_dtype=torch.float16,  # Use half precision
+            device_map="auto",
+            low_cpu_mem_usage=True
+        )
+        model_b = AutoModelForCausalLM.from_pretrained(
+            "Qwen/Qwen2.5-3B",
+            torch_dtype=torch.float16,
+            device_map="auto",
+            low_cpu_mem_usage=True
+        )
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        print("Models loaded!")
+def analyze_text(text: str) -> str:
+    """Analyze input text and return formatted results."""
+    if not text.strip():
+        return "⚠️ Please enter some text to analyze."
+    try:
+        load_models()  # Ensure models are loaded
+        # Extract activations
+        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
+        with torch.no_grad():
+            # Get activations from both models
+            out_a = model_a(**inputs, output_hidden_states=True)
+            out_b = model_b(**inputs, output_hidden_states=True)
+            # Extract last token activations (layer 13)
+            layer_idx = 13
+            hidden_a = out_a.hidden_states[layer_idx + 1]
+            hidden_b = out_b.hidden_states[layer_idx + 1]
+            last_idx = inputs["attention_mask"].sum(dim=1) - 1
+            act_a = hidden_a[0, last_idx].cpu().float()
+            act_b = hidden_b[0, last_idx].cpu().float()
+            # Combine activations
+            activations = torch.stack([act_a, act_b], dim=0).unsqueeze(0)  # (1, 2, d)
+            # Encode to features
+            features = dfc_model.encode(activations)
+            feature_vec = features[0]
+            # Find active features
+            active_indices = (feature_vec > 0).nonzero(as_tuple=True)[0]
+            active_values = feature_vec[active_indices]
+            if len(active_indices) == 0:
+                return "🤔 No active features found. Try a different text."
+            # Sort by strength
+            sorted_indices = torch.argsort(active_values, descending=True)
+            top_indices = active_indices[sorted_indices[:10]]
+            top_values = active_values[sorted_indices[:10]]
+            # Partition analysis
+            a_excl = sum(idx < dfc_model.a_end for idx in active_indices)
+            b_excl = sum(dfc_model.a_end <= idx < dfc_model.b_end for idx in active_indices)
+            shared = sum(idx >= dfc_model.b_end for idx in active_indices)
+            # Reconstruction quality
+            reconstructed = dfc_model.decode(features)
+            mse_loss = F.mse_loss(reconstructed, activations).item()
+            # Format results
+            result = f"""## 🔍 Analysis Results
+**Input Text**: "{text}"
+### 📊 Feature Summary
+- **Total Active Features**: {len(active_indices)}
+- **Reconstruction Quality**: {mse_loss:.6f} MSE
+### 🏷️ Feature Distribution
+- 🔴 **ToolRL-specific**: {a_excl} features ({a_excl/len(active_indices)*100:.1f}%)
+- 🔵 **Base model-specific**: {b_excl} features ({b_excl/len(active_indices)*100:.1f}%)
+- 🟢 **Shared features**: {shared} features ({shared/len(active_indices)*100:.1f}%)
+### ⭐ Top Active Features
+"""
+            for i, (idx, val) in enumerate(zip(top_indices, top_values)):
+                if idx < dfc_model.a_end:
+                    partition = "🔴 ToolRL"
+                elif idx < dfc_model.b_end:
+                    partition = "🔵 Base"
+                else:
+                    partition = "🟢 Shared"
+                result += f"{i+1}. Feature {idx.item()} ({partition}) - **{val.item():.4f}**\n"
+            return result
+    except Exception as e:
+        return f"❌ Error during analysis: {str(e)}\n\nPlease try again with different text."
+# Example prompts offered as one-click inputs in the interface
+example_texts = [
+    "To solve this problem, I need to use the calculator tool.",
+    "The weather is beautiful today.",
+    "Let me search for information about machine learning.",
+    "I should call the API to get the current data.",
+    "Python is a great programming language for data science."
+]
+# Gradio interface: one text box in, one Markdown report out (see analyze_text)
+demo = gr.Interface(
+    fn=analyze_text,
+    inputs=gr.Textbox(
+        lines=3,
+        placeholder="Enter text to analyze...",
+        label="📝 Input Text",
+        info="Enter any text to see how features activate differently between ToolRL and base models"
+    ),
+    outputs=gr.Markdown(label="📊 Analysis Results"),
+    title="🧠 DFC CrossCoder Demo",
+    description="""
+    This demo analyzes text using a **DFC CrossCoder** to reveal how features activate differently between:
+    - 🔴 **ToolRL Model**: Fine-tuned for tool usage
+    - 🔵 **Base Model**: Original Qwen2.5-3B
+    - 🟢 **Shared Features**: Common to both models
+    The CrossCoder extracts sparse, interpretable features from the internal representations of both models.
+    """,
+    # Each example is wrapped in a list because the interface has a single input component
+    examples=[[text] for text in example_texts],
+    theme="soft",
+    allow_flagging="never"
+)
+if __name__ == "__main__":
+    # Load the models before serving so the first request does not pay the loading cost
+    print("🚀 Starting DFC CrossCoder demo...")
+    load_models()
+    # Launch the demo
+    demo.launch(
+        share=False,  # Set to True for sharing
+        server_name="0.0.0.0",  # For Spaces
+        server_port=7860  # Default Spaces port
+    )

config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "activation_dim": 2048,
+  "dict_size": 16384,
+  "k": 90,
+  "n_a": 819,
+  "n_b": 819
+}

demo.py ADDED Viewed

	@@ -0,0 +1,100 @@

+"""
+demo.py — Minimal demo for DFC CrossCoder usage.
+"""
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import json
+def extract_last_token_activations(model, tokenizer, texts, layer_idx, device="cuda:0"):
+    """Extract last-token activations from a model."""
+    model.eval()
+    all_acts = []
+    with torch.no_grad():
+        for text in texts:
+            # Tokenize
+            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+            input_ids = inputs["input_ids"].to(device)
+            attention_mask = inputs["attention_mask"].to(device)
+            # Forward pass
+            outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
+            # Get last token activation
+            hidden_states = outputs.hidden_states[layer_idx + 1]  # +1 because [0] is embedding
+            last_idx = attention_mask.sum(dim=1) - 1
+            last_token_act = hidden_states[0, last_idx]
+            all_acts.append(last_token_act.cpu())
+    return torch.stack(all_acts)
+def main():
+    """Demo usage of DFC CrossCoder."""
+    # Load the DFC model (replace with your repo name)
+    from dfc_crosscoder import DFCCrossCoder
+    dfc = DFCCrossCoder.from_pretrained("your-username/dfc-crosscoder")
+    dfc.eval()
+    # Load the original models (you need both)
+    print("Loading models...")
+    model_a = AutoModelForCausalLM.from_pretrained("chengq9/ToolRL-Qwen2.5-3B")
+    model_b = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B")
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    # Example text
+    texts = [
+        "To solve this problem, I need to use the calculator tool.",
+        "The weather is beautiful today.",
+        "Let me search for the latest news about AI research."
+    ]
+    print(f"Analyzing {len(texts)} texts...")
+    for i, text in enumerate(texts):
+        print(f"\n--- Text {i+1}: {text} ---")
+        # Extract activations from both models
+        act_a = extract_last_token_activations(model_a, tokenizer, [text], layer_idx=13)
+        act_b = extract_last_token_activations(model_b, tokenizer, [text], layer_idx=13)
+        # Combine activations
+        combined_acts = torch.stack([act_a[0], act_b[0]], dim=0).unsqueeze(0)  # (1, 2, d)
+        # Encode to features
+        features = dfc.encode(combined_acts)
+        # Analyze
+        active_indices = (features[0] > 0).nonzero(as_tuple=True)[0]
+        active_values = features[0][active_indices]
+        # Sort by strength
+        sorted_indices = torch.argsort(active_values, descending=True)
+        top_features = active_indices[sorted_indices[:10]]
+        top_values = active_values[sorted_indices[:10]]
+        print(f"Active features: {len(active_indices)}")
+        print(f"Top 10 features: {top_features.tolist()}")
+        print(f"Values: {[f'{v:.3f}' for v in top_values.tolist()]}")
+        # Partition analysis
+        a_excl = sum(idx < dfc.a_end for idx in top_features)
+        b_excl = sum(dfc.a_end <= idx < dfc.b_end for idx in top_features)
+        shared = sum(idx >= dfc.b_end for idx in top_features)
+        print(f"Feature distribution: A-exclusive={a_excl}, B-exclusive={b_excl}, Shared={shared}")
+        # Decode features back to activations
+        reconstructed = dfc.decode(features)
+        mse_loss = torch.nn.functional.mse_loss(reconstructed, combined_acts)
+        print(f"Reconstruction MSE: {mse_loss.item():.6f}")
+if __name__ == "__main__":
+    main()

dfc_crosscoder.py ADDED Viewed

	@@ -0,0 +1,201 @@

+"""
+dfc_crosscoder.py — Dedicated Feature CrossCoder (DFC) model.
+Feature layout in dict_size
+────────────────────────────
+  ┌─────────────────────┬─────────────────────┬──────────────────────────┐
+  │  A-exclusive (n_a)  │  B-exclusive (n_b)  │     Shared (n_shared)    │
+  └─────────────────────┴─────────────────────┴──────────────────────────┘
+  idx:  0 ─────── a_end ──────── b_end ───────────────────── dict_size
+Constraints (enforced by gradient masking + _apply_masks every step)
+──────────────────────────────────────────────────────────────────────
+  • Model A cannot encode/decode B-exclusive features
+  • Model B cannot encode/decode A-exclusive features
+  • Shared features are accessible to both
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from bitsandbytes import features
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class DFCCrossCoder(nn.Module):
+    def __init__(
+        self,
+        activation_dim: int,
+        dict_size: int,
+        k: int,
+        model_a_exclusive_pct: float = 0.05,
+        model_b_exclusive_pct: float = 0.05,
+    ):
+        super().__init__()
+        self.activation_dim = activation_dim
+        self.dict_size = dict_size
+        self.k = k
+        self.n_a     = int(dict_size * model_a_exclusive_pct)
+        self.n_b     = int(dict_size * model_b_exclusive_pct)
+        self.n_shared = dict_size - self.n_a - self.n_b
+        self.a_end   = self.n_a
+        self.b_end   = self.n_a + self.n_b
+        print(
+            f"[DFC] dict={dict_size} k={k} | "
+            f"A-excl={self.n_a} B-excl={self.n_b} shared={self.n_shared}"
+        )
+        # Encoder: W_enc[model, d_in, dict_size]
+        self.W_enc = nn.Parameter(
+            torch.randn(2, activation_dim, dict_size) / (activation_dim ** 0.5)
+        )
+        self.b_enc = nn.Parameter(torch.zeros(dict_size))
+        # Decoder: W_dec[dict_size, model, d_in]
+        self.W_dec = nn.Parameter(
+            torch.randn(dict_size, 2, activation_dim) / (dict_size ** 0.5)
+        )
+        self.b_dec = nn.Parameter(torch.zeros(2, activation_dim))
+        # ── Partition masks (move with .to(device)) ───────────────────
+        # enc_mask[model, dict_size]
+        enc_mask = torch.ones(2, dict_size)
+        enc_mask[1, : self.a_end] = 0                   # B cannot encode A-excl
+        enc_mask[0, self.a_end : self.b_end] = 0        # A cannot encode B-excl
+        self.register_buffer("enc_mask", enc_mask)
+        # dec_mask[dict_size, model]
+        dec_mask = torch.ones(dict_size, 2)
+        dec_mask[: self.a_end, 1] = 0                   # A-excl: B decoder = 0
+        dec_mask[self.a_end : self.b_end, 0] = 0        # B-excl: A decoder = 0
+        self.register_buffer("dec_mask", dec_mask)
+        self._apply_masks()
+    # ── Weight enforcement ────────────────────────────────────────────
+    @torch.no_grad()
+    def _apply_masks(self):
+        """Zero forbidden weights. Call after every optimiser step."""
+        for m in range(2):
+            self.W_enc.data[m] *= self.enc_mask[m].unsqueeze(0)
+        for m in range(2):
+            self.W_dec.data[:, m, :] *= self.dec_mask[:, m].unsqueeze(1)
+    # ── Forward ───────────────────────────────────────────────────────
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """x: (B, 2, d) → features: (B, dict_size) sparse top-k."""
+        W = self.W_enc * self.enc_mask.unsqueeze(1)         # (2, d, dict)
+        pre = torch.einsum("bmd,mdf->bf", x, W) + self.b_enc
+        pre = F.relu(pre)
+        topk_vals, topk_idx = torch.topk(pre, self.k, dim=-1)
+        features = torch.zeros_like(pre)
+        features.scatter_(-1, topk_idx, topk_vals)
+        return features
+    def decode(self, features: torch.Tensor) -> torch.Tensor:
+        """features: (B, dict_size) → (B, 2, d)."""
+        W = self.W_dec * self.dec_mask.unsqueeze(-1)        # (dict, 2, d)
+        return torch.einsum("bf,fmd->bmd", features, W) + self.b_dec
+    def forward(self, x: torch.Tensor):
+        """x: (B, 2, d) → (reconstruction, features)."""
+        features = self.encode(x)
+        recon    = self.decode(features)
+        return recon, features
+    def loss(
+        self,
+        x: torch.Tensor,
+        sparsity_coef: float = 1e-3,
+        exclusive_sparsity_coef: float = 1e-3  # Lower penalty for exclusive features
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """MSE + weighted L1 sparsity. Returns (total, mse, l1_shared, l1_exclusive)."""
+        recon, features = self.forward(x)
+        mse = F.mse_loss(recon, x)
+        # Split features by partition
+        # fa = features[:, :self.a_end]          # A-exclusive
+        # fb = features[:, self.a_end:self.b_end] # B-exclusive
+        fs = features[:, self.b_end:]          # Shared
+        # A sees: A-exclusive + shared
+        fa = torch.cat([features[:, :self.a_end], features[:, self.b_end:]], dim=-1)   # A-exclusive + shared
+        fb = torch.cat([features[:, self.a_end:self.b_end], features[:, self.b_end:]], dim=-1)  # B-exclusive + shared
+        # Separate sparsity penalties
+        l1_shared = fs.abs().mean()
+        l1_exclusive = (fa.abs().mean() + fb.abs().mean()) / 2
+        total = mse + exclusive_sparsity_coef * l1_exclusive + sparsity_coef * l1_shared
+        return total, mse, l1_shared, l1_exclusive
+    # ── Diagnostics ───────────────────────────────────────────────────
+    @torch.no_grad()
+    def verify_partition_integrity(self) -> dict[str, float]:
+        """Max absolute value in weights that should be zero."""
+        enc_viol  = (self.W_enc.abs() * (1 - self.enc_mask).unsqueeze(1)).max().item()
+        dec_viol_a = self.W_dec[: self.a_end, 1, :].abs().max().item()
+        dec_viol_b = self.W_dec[self.a_end : self.b_end, 0, :].abs().max().item()
+        return {
+            "enc_max_violation": enc_viol,
+            "dec_max_violation": max(dec_viol_a, dec_viol_b),
+        }
+    @torch.no_grad()
+    def feature_stats(self, features: torch.Tensor) -> dict[str, float]:
+        """Partition-level activation stats for a batch of features."""
+        fa = features[:, : self.a_end]
+        fb = features[:, self.a_end : self.b_end]
+        fs = features[:, self.b_end :]
+        return {
+            "l0_total":    (features > 0).float().sum(dim=-1).mean().item(),
+            "l0_a_excl":   (fa > 0).float().sum(dim=-1).mean().item(),
+            "l0_b_excl":   (fb > 0).float().sum(dim=-1).mean().item(),
+            "l0_shared":   (fs > 0).float().sum(dim=-1).mean().item(),
+            "mean_a_excl": fa.mean().item(),
+            "mean_b_excl": fb.mean().item(),
+            "mean_shared": fs.mean().item(),
+        }
+    # ── Save / Load ───────────────────────────────────────────────────
+    def save(self, path: str) -> None:
+        Path(path).mkdir(parents=True, exist_ok=True)
+        torch.save(self.state_dict(), f"{path}/model.pt")
+        json.dump(
+            dict(
+                activation_dim=self.activation_dim,
+                dict_size=self.dict_size,
+                k=self.k,
+                n_a=self.n_a,
+                n_b=self.n_b,
+            ),
+            open(f"{path}/config.json", "w"),
+            indent=2,
+        )
+        print(f"[DFC] Saved → {path}")
+    @classmethod
+    def load(cls, path: str, device: str = "cpu") -> "DFCCrossCoder":
+        cfg = json.load(open(f"{path}/config.json"))
+        model = cls(
+            activation_dim=cfg["activation_dim"],
+            dict_size=cfg["dict_size"],
+            k=cfg["k"],
+            model_a_exclusive_pct=cfg["n_a"] / cfg["dict_size"],
+            model_b_exclusive_pct=cfg["n_b"] / cfg["dict_size"],
+        )
+        model.load_state_dict(
+            torch.load(f"{path}/model.pt", map_location=device, weights_only=True)
+        )
+        return model.to(device)

inference_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "model_type": "dfc_crosscoder",
+  "model_a_name": "chengq9/ToolRL-Qwen2.5-3B",
+  "model_b_name": "Qwen/Qwen2.5-3B",
+  "tokenizer_name": "Qwen/Qwen2.5-3B",
+  "layer": 13,
+  "activation_dim": 2048,
+  "dict_size": 16384,
+  "k": 90,
+  "n_a": 819,
+  "n_b": 819,
+  "n_shared": 14746
+}

minimal_demo.py ADDED Viewed

	@@ -0,0 +1,439 @@

+"""
+minimal_demo.py — Standalone minimal demo for DFC CrossCoder.
+A lightweight demonstration of the DFC CrossCoder that can run as:
+1. Command-line demo
+2. Gradio web interface
+3. Hugging Face Space
+Usage:
+    python minimal_demo.py --text "Your input text"
+    python minimal_demo.py --gradio  # Start web interface
+    python minimal_demo.py --interface  # Interactive CLI
+"""
+import argparse
+import json
+import sys
+from typing import List, Dict, Tuple, Optional
+import torch
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer
+# Simplified DFC implementation for demo (copy of key parts)
+class SimpleDFCCrossCoder(torch.nn.Module):
+    """Simplified DFC CrossCoder for demo purposes."""
+    def __init__(self, activation_dim: int, dict_size: int, k: int, n_a: int, n_b: int):
+        super().__init__()
+        self.activation_dim = activation_dim
+        self.dict_size = dict_size
+        self.k = k
+        self.n_a = n_a
+        self.n_b = n_b
+        self.n_shared = dict_size - n_a - n_b
+        self.a_end = n_a
+        self.b_end = n_a + n_b
+        # Model weights (will be loaded from checkpoint)
+        self.W_enc = torch.nn.Parameter(torch.zeros(2, activation_dim, dict_size))
+        self.b_enc = torch.nn.Parameter(torch.zeros(dict_size))
+        self.W_dec = torch.nn.Parameter(torch.zeros(dict_size, 2, activation_dim))
+        self.b_dec = torch.nn.Parameter(torch.zeros(2, activation_dim))
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode activations to sparse features."""
+        pre = torch.einsum("bmd,mdf->bf", x, self.W_enc) + self.b_enc
+        pre = F.relu(pre)
+        topk_vals, topk_idx = torch.topk(pre, self.k, dim=-1)
+        features = torch.zeros_like(pre)
+        features.scatter_(-1, topk_idx, topk_vals)
+        return features
+    def decode(self, features: torch.Tensor) -> torch.Tensor:
+        """Decode features back to activations."""
+        return torch.einsum("bf,fmd->bmd", features, self.W_dec) + self.b_dec
+    @classmethod
+    def from_pretrained(cls, model_path: str, device: str = "cpu"):
+        """Load model from checkpoint."""
+        # Load config
+        import json
+        with open(f"{model_path}/config.json") as f:
+            config = json.load(f)
+        # Create model
+        model = cls(
+            activation_dim=config["activation_dim"],
+            dict_size=config["dict_size"],
+            k=config["k"],
+            n_a=config.get("n_a", int(config["dict_size"] * 0.05)),
+            n_b=config.get("n_b", int(config["dict_size"] * 0.05))
+        )
+        # Load weights
+        state_dict = torch.load(f"{model_path}/model.pt", map_location=device, weights_only=True)
+        model.load_state_dict(state_dict)
+        return model.to(device)
+class DFCDemo:
+    """Demo class for DFC CrossCoder functionality."""
+    def __init__(
+        self,
+        dfc_path: str = "./checkpoints/dfc2",
+        model_a_name: str = "chengq9/ToolRL-Qwen2.5-3B",
+        model_b_name: str = "Qwen/Qwen2.5-3B",
+        layer: int = 13,
+        device: str = "auto"
+    ):
+        self.dfc_path = dfc_path
+        self.model_a_name = model_a_name
+        self.model_b_name = model_b_name
+        self.layer = layer
+        # Auto-detect device
+        if device == "auto":
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = device
+        # Models (loaded on first use)
+        self._dfc = None
+        self._model_a = None
+        self._model_b = None
+        self._tokenizer = None
+    @property
+    def dfc(self):
+        """Lazy load DFC model."""
+        if self._dfc is None:
+            print("Loading DFC CrossCoder...")
+            self._dfc = SimpleDFCCrossCoder.from_pretrained(self.dfc_path, device=self.device)
+            self._dfc.eval()
+        return self._dfc
+    @property
+    def models(self):
+        """Lazy load language models."""
+        if self._model_a is None:
+            print("Loading language models...")
+            print(f"  Model A: {self.model_a_name}")
+            self._model_a = AutoModelForCausalLM.from_pretrained(
+                self.model_a_name,
+                torch_dtype=torch.float32,
+                device_map=None
+            ).to(self.device).eval()
+            print(f"  Model B: {self.model_b_name}")
+            self._model_b = AutoModelForCausalLM.from_pretrained(
+                self.model_b_name,
+                torch_dtype=torch.float32,
+                device_map=None
+            ).to(self.device).eval()
+            print("  Tokenizer...")
+            self._tokenizer = AutoTokenizer.from_pretrained(self.model_b_name)
+            if self._tokenizer.pad_token is None:
+                self._tokenizer.pad_token = self._tokenizer.eos_token
+            self._tokenizer.padding_side = "left"
+        return self._model_a, self._model_b, self._tokenizer
+    def extract_activations(self, texts: List[str]) -> torch.Tensor:
+        """Extract last-token activations from both models."""
+        model_a, model_b, tokenizer = self.models
+        # Tokenize
+        inputs = tokenizer(
+            texts,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512
+        )
+        input_ids = inputs["input_ids"].to(self.device)
+        attention_mask = inputs["attention_mask"].to(self.device)
+        activations = []
+        with torch.no_grad():
+            # Model A
+            out_a = model_a(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
+            hidden_a = out_a.hidden_states[self.layer + 1]
+            last_idx = attention_mask.sum(dim=1) - 1
+            act_a = hidden_a[torch.arange(len(texts)), last_idx]
+            # Model B
+            out_b = model_b(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
+            hidden_b = out_b.hidden_states[self.layer + 1]
+            act_b = hidden_b[torch.arange(len(texts)), last_idx]
+            # Stack as (batch, models, hidden_dim)
+            activations = torch.stack([act_a, act_b], dim=1)
+        return activations
+    def analyze_text(self, text: str) -> Dict:
+        """Analyze a single text and return feature breakdown."""
+        # Extract activations
+        activations = self.extract_activations([text])
+        # Encode to features
+        features = self.dfc.encode(activations)
+        feature_vec = features[0]  # Single text
+        # Find active features
+        active_indices = (feature_vec > 0).nonzero(as_tuple=True)[0]
+        active_values = feature_vec[active_indices]
+        # Sort by activation strength
+        sorted_indices = torch.argsort(active_values, descending=True)
+        top_indices = active_indices[sorted_indices[:20]]  # Top 20
+        top_values = active_values[sorted_indices[:20]]
+        # Partition analysis
+        a_excl_count = sum(idx < self.dfc.a_end for idx in active_indices)
+        b_excl_count = sum(self.dfc.a_end <= idx < self.dfc.b_end for idx in active_indices)
+        shared_count = sum(idx >= self.dfc.b_end for idx in active_indices)
+        # Reconstruction quality
+        reconstructed = self.dfc.decode(features)
+        mse_loss = F.mse_loss(reconstructed, activations).item()
+        return {
+            "text": text,
+            "total_active_features": len(active_indices),
+            "top_features": [
+                {"index": idx.item(), "value": val.item(), "partition": self._get_partition_name(idx.item())}
+                for idx, val in zip(top_indices, top_values)
+            ],
+            "partition_counts": {
+                "A_exclusive": a_excl_count,
+                "B_exclusive": b_excl_count,
+                "Shared": shared_count
+            },
+            "reconstruction_mse": mse_loss,
+            "model_info": {
+                "dict_size": self.dfc.dict_size,
+                "k": self.dfc.k,
+                "layer": self.layer,
+                "model_a": self.model_a_name,
+                "model_b": self.model_b_name
+            }
+        }
+    def _get_partition_name(self, feature_idx: int) -> str:
+        """Get partition name for a feature index."""
+        if feature_idx < self.dfc.a_end:
+            return "A-exclusive"
+        elif feature_idx < self.dfc.b_end:
+            return "B-exclusive"
+        else:
+            return "Shared"
+    def compare_texts(self, texts: List[str]) -> List[Dict]:
+        """Compare multiple texts."""
+        return [self.analyze_text(text) for text in texts]
+def print_analysis(analysis: Dict):
+    """Print analysis results in a nice format."""
+    print(f"\n{'='*60}")
+    print(f"TEXT: {analysis['text']}")
+    print(f"{'='*60}")
+    print(f"Active Features: {analysis['total_active_features']}")
+    print(f"Reconstruction MSE: {analysis['reconstruction_mse']:.6f}")
+    print(f"\nPartition Distribution:")
+    for partition, count in analysis['partition_counts'].items():
+        percentage = count / analysis['total_active_features'] * 100 if analysis['total_active_features'] > 0 else 0
+        print(f"  {partition}: {count} ({percentage:.1f}%)")
+    print(f"\nTop Active Features:")
+    for i, feat in enumerate(analysis['top_features'][:10]):
+        print(f"  {i+1:2d}. Feature {feat['index']:5d} | {feat['partition']:12s} | Value: {feat['value']:.4f}")
+def create_gradio_interface(demo: DFCDemo):
+    """Create Gradio web interface."""
+    try:
+        import gradio as gr
+    except ImportError:
+        raise ImportError("Please install gradio: pip install gradio")
+    def analyze_interface(text):
+        """Gradio interface function."""
+        if not text.strip():
+            return "Please enter some text to analyze."
+        try:
+            analysis = demo.analyze_text(text.strip())
+            # Format results
+            result = f"""
+## Analysis Results
+**Text**: {analysis['text']}
+**Active Features**: {analysis['total_active_features']}
+**Reconstruction MSE**: {analysis['reconstruction_mse']:.6f}
+### Partition Distribution
+- **A-exclusive** (ToolRL): {analysis['partition_counts']['A_exclusive']} features
+- **B-exclusive** (Base): {analysis['partition_counts']['B_exclusive']} features
+- **Shared**: {analysis['partition_counts']['Shared']} features
+### Top Active Features
+"""
+            for i, feat in enumerate(analysis['top_features'][:10]):
+                result += f"{i+1}. Feature {feat['index']} ({feat['partition']}) - Value: {feat['value']:.4f}\n"
+            return result
+        except Exception as e:
+            return f"Error: {str(e)}"
+    # Create interface
+    iface = gr.Interface(
+        fn=analyze_interface,
+        inputs=gr.Textbox(
+            lines=3,
+            placeholder="Enter text to analyze (e.g., 'To solve this problem, I need to use the calculator tool.')",
+            label="Input Text"
+        ),
+        outputs=gr.Markdown(label="Analysis Results"),
+        title="DFC CrossCoder Demo",
+        description="Analyze text using the DFC CrossCoder to see which features are active and how they're distributed between ToolRL and Base models.",
+        examples=[
+            ["To solve this problem, I need to use the calculator tool."],
+            ["The weather is beautiful today."],
+            ["Let me search for information about machine learning."],
+            ["I should use the weather API to get current conditions."],
+            ["Python is a great programming language for data science."]
+        ]
+    )
+    return iface
+def interactive_cli(demo: DFCDemo):
+    """Interactive command-line interface."""
+    print("\n" + "="*60)
+    print("DFC CrossCoder Interactive Demo")
+    print("="*60)
+    print("Commands:")
+    print("  analyze <text>  - Analyze single text")
+    print("  compare <text1> | <text2> | <text3>  - Compare multiple texts")
+    print("  help - Show this help")
+    print("  quit - Exit")
+    print("="*60)
+    while True:
+        try:
+            user_input = input("\n> ").strip()
+            if not user_input:
+                continue
+            if user_input.lower() in ["quit", "q", "exit"]:
+                print("Goodbye!")
+                break
+            elif user_input.lower() in ["help", "h"]:
+                print("\nCommands:")
+                print("  analyze <text>  - Analyze single text")
+                print("  compare <text1> | <text2> | <text3>  - Compare multiple texts")
+                print("  help - Show this help")
+                print("  quit - Exit")
+            elif user_input.startswith("analyze "):
+                text = user_input[8:].strip()
+                if text:
+                    analysis = demo.analyze_text(text)
+                    print_analysis(analysis)
+                else:
+                    print("Please provide text to analyze.")
+            elif user_input.startswith("compare "):
+                texts_str = user_input[8:].strip()
+                texts = [t.strip() for t in texts_str.split("|") if t.strip()]
+                if len(texts) < 2:
+                    print("Please provide at least 2 texts separated by |")
+                else:
+                    analyses = demo.compare_texts(texts)
+                    for analysis in analyses:
+                        print_analysis(analysis)
+            else:
+                print("Unknown command. Type 'help' for available commands.")
+        except KeyboardInterrupt:
+            print("\nGoodbye!")
+            break
+        except Exception as e:
+            print(f"Error: {e}")
+def main():
+    parser = argparse.ArgumentParser(description="DFC CrossCoder Demo")
+    parser.add_argument("--text", type=str, help="Text to analyze")
+    parser.add_argument("--checkpoint", default="./checkpoints/dfc2", help="Path to DFC checkpoint")
+    parser.add_argument("--gradio", action="store_true", help="Launch Gradio web interface")
+    parser.add_argument("--interface", action="store_true", help="Interactive CLI mode")
+    parser.add_argument("--device", default="auto", help="Device (cuda/cpu/auto)")
+    parser.add_argument("--compare", nargs="+", help="Compare multiple texts")
+    args = parser.parse_args()
+    # Create demo
+    demo = DFCDemo(
+        dfc_path=args.checkpoint,
+        device=args.device
+    )
+    try:
+        if args.gradio:
+            # Launch Gradio interface
+            iface = create_gradio_interface(demo)
+            iface.launch(share=True)
+        elif args.interface:
+            # Interactive CLI
+            interactive_cli(demo)
+        elif args.text:
+            # Single text analysis
+            analysis = demo.analyze_text(args.text)
+            print_analysis(analysis)
+        elif args.compare:
+            # Compare multiple texts
+            analyses = demo.compare_texts(args.compare)
+            for analysis in analyses:
+                print_analysis(analysis)
+        else:
+            # Default examples
+            print("DFC CrossCoder Demo - Running example analyses...")
+            example_texts = [
+                "To solve this problem, I need to use the calculator tool.",
+                "The weather is beautiful today.",
+                "Let me search for the latest research papers.",
+                "I should call the weather API to get current conditions."
+            ]
+            analyses = demo.compare_texts(example_texts)
+            for analysis in analyses:
+                print_analysis(analysis)
+            print(f"\n{'='*60}")
+            print("Demo completed! Try:")
+            print("  python minimal_demo.py --gradio        # Web interface")
+            print("  python minimal_demo.py --interface     # Interactive CLI")
+            print("  python minimal_demo.py --text 'Your text here'")
+            print("="*60)
+    except Exception as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+if __name__ == "__main__":  # true only when this file is executed as a script
+    main()  # parse the command-line arguments and run the selected mode

model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a343c4f59a9937d4cb01c1870729aa94e733ddc0f59555c77d52944fddb7d93
+size 537217597

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+torch>=1.9.0
+transformers>=4.20.0
+numpy
+tqdm

space_requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch>=1.9.0
+transformers>=4.20.0
+gradio>=3.0.0
+numpy
+tqdm
+spaces