Spaces:
Running on Zero
Running on Zero
| """ | |
| CommitLens β gradio.Server mode | |
| ================================ | |
| - Serves custom index.html at GET / | |
| - Exposes process_repo via @app.api() for the JS frontend to call | |
| - Mellum 2 (6-bit, CPU-resident) handles per-file summaries via batched GPU inference | |
| - Groq llama-70b handles the final report (fast, no GPU cost) | |
| - <think>...</think> blocks stripped from all Mellum outputs | |
| - Per-file output is tightly constrained to 3-5 bullet points max | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| import spaces | |
| import torch | |
| from fastapi.responses import HTMLResponse | |
| from gradio import Server | |
| from groq import Groq | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from commitlens import run_pipeline | |
| logging.basicConfig( | |
| level=logging.INFO, | |
| format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", | |
| stream=sys.stdout, | |
| ) | |
| log = logging.getLogger("commitlens") | |
| # --------------------------------------------------------------------------- | |
| # Config | |
| # --------------------------------------------------------------------------- | |
| MODEL_REPO_ID = "JetBrains/Mellum2-12B-A2.5B-Instruct" | |
| GROQ_MODEL = "llama-3.3-70b-versatile" # fast Groq-hosted 70B | |
| # BATCH_TOKEN_BUDGET = 7000 # estimated input tokens; above this β sequential | |
| # --------------------------------------------------------------------------- | |
| # Prompts | |
| # --------------------------------------------------------------------------- | |
| # Tight, bullet-constrained prompt β short output β fewer tokens generated | |
| SUMMARY_SYSTEM_PROMPT = """ | |
| You are a senior software engineer reviewing a git diff for ONE file. | |
| Analyze the actual code changes and produce a concise technical review. | |
| Output EXACTLY in this format: | |
| Summary: | |
| <2-4 sentences describing the code changes> | |
| Reason: | |
| <1 sentence explaining the reason if clearly evident from the diff, otherwise "Reason not evident from the diff."> | |
| Observations: | |
| * <observation> | |
| * <observation> | |
| * <observation> | |
| Rules: | |
| * Use ONLY information visible in the diff and provided code context. | |
| * Refer to functions, classes, methods, imports, decorators, constants, configuration values, API calls, and control flow when relevant. | |
| * Focus on what was actually modified, added, removed, or refactored. | |
| * Mention risks, assumptions, limitations, edge cases, or behavioral changes when visible. | |
| * Mention architectural or design changes when directly supported by the diff. | |
| * Do NOT invent requirements, business goals, performance improvements, bug fixes, security improvements, or developer intent. | |
| * If something cannot be proven from the diff, do not claim it. | |
| * Avoid generic statements such as: | |
| "improves reliability" | |
| "improves scalability" | |
| "improves performance" | |
| unless explicitly supported by the code changes. | |
| * Do not repeat the filename. | |
| * No markdown headers beyond the required section names. | |
| * No code fences. | |
| * No chain-of-thought. | |
| * No speculative reasoning. | |
| * Target 80-180 words. | |
| """ | |
| FINAL_SYSTEM_PROMPT = """\ | |
| You are a technical writer producing a commit review report. | |
| Given per-file summaries, write a structured markdown report with these exact sections: | |
| ## Commit Overview | |
| One paragraph (3-5 sentences) summarising the overall intent of the commit. | |
| ## Changes Per File | |
| A sub-section per file (### `filename`) with 2-4 bullet points. | |
| ## Key Takeaways | |
| 3-5 bullets: cross-cutting concerns, risks, follow-up actions. | |
| Rules: | |
| - Total report MUST be under 400 words | |
| - No filler phrases ("In conclusion", "It is worth noting") | |
| - Output markdown only β no preamble, no explanation | |
| """ | |
| # --------------------------------------------------------------------------- | |
| # Global model state β CPU-resident between requests | |
| # --------------------------------------------------------------------------- | |
| _model: AutoModelForCausalLM | None = None | |
| _tokenizer: AutoTokenizer | None = None | |
| def _strip_thinking(text: str) -> str: | |
| """Remove <think>...</think> blocks (multiline) produced by thinking models.""" | |
| return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip() | |
| def _extract_filename(prompt: str) -> str: | |
| for line in prompt.splitlines(): | |
| if line.startswith("Filename :"): | |
| return line.split(":", 1)[1].strip() | |
| return "unknown" | |
| # --------------------------------------------------------------------------- | |
| # Startup: load Mellum 2 in 6-bit NF4 into CPU RAM | |
| # Runs ONCE before app.launch(), outside any @spaces.GPU context. | |
| # --------------------------------------------------------------------------- | |
| def load_model_on_startup() -> None: | |
| """ | |
| Load Mellum 2 into CPU RAM with 6-bit NF4 double quantization. | |
| device_map='cpu' keeps weights off-GPU until a @spaces.GPU call fires, | |
| satisfying ZeroGPU's requirement that GPU allocation only happens inside | |
| decorated functions. | |
| """ | |
| global _model, _tokenizer | |
| log.info("=== STARTUP: loading tokenizer (%s) ===", MODEL_REPO_ID) | |
| _tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID) | |
| if _tokenizer.pad_token_id is None: | |
| _tokenizer.pad_token_id = _tokenizer.eos_token_id | |
| log.info("Tokenizer ready. pad_token_id=%s", _tokenizer.pad_token_id) | |
| log.info("=== STARTUP: loading model in 6-bit NF4 on CPU ===") | |
| quant_cfg = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_use_double_quant=True, # NF4 + double quant β effective 6-bit | |
| bnb_4bit_quant_type="nf4", | |
| bnb_4bit_compute_dtype=torch.bfloat16, | |
| ) | |
| _model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_REPO_ID, | |
| quantization_config=quant_cfg, | |
| device_map="cpu", | |
| torch_dtype=torch.bfloat16, | |
| ) | |
| _model.eval() | |
| log.info("=== STARTUP: model ready on CPU ===") | |
| # --------------------------------------------------------------------------- | |
| # Mellum inference (called inside @spaces.GPU) | |
| # --------------------------------------------------------------------------- | |
| def _build_mellum_prompt(user_content: str) -> str: | |
| """Apply Mellum's chat template to a single user turn.""" | |
| return _tokenizer.apply_chat_template( | |
| [ | |
| {"role": "system", "content": SUMMARY_SYSTEM_PROMPT}, | |
| {"role": "user", "content": user_content}, | |
| ], | |
| tokenize=False, | |
| add_generation_prompt=True, | |
| ) | |
| def _generate_sequential(prompts: list[str]) -> list[str]: | |
| """Fallback single-prompt inference when batch would OOM.""" | |
| log.info("Sequential inference: %d prompts", len(prompts)) | |
| _tokenizer.padding_side = "right" | |
| results = [] | |
| for i, prompt in enumerate(prompts): | |
| log.info(" [%d/%d]", i + 1, len(prompts)) | |
| enc = _tokenizer(prompt, return_tensors="pt").to("cuda") | |
| with torch.no_grad(): | |
| out = _model.generate( | |
| **enc, | |
| max_new_tokens=200, | |
| use_cache=True, | |
| do_sample=True, | |
| temperature=0.4, | |
| top_p=0.95, | |
| pad_token_id=_tokenizer.pad_token_id, | |
| ) | |
| text = _tokenizer.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True) | |
| results.append(_strip_thinking(text)) | |
| return results | |
| # --------------------------------------------------------------------------- | |
| # Groq final report (pure API call β no GPU needed) | |
| # --------------------------------------------------------------------------- | |
| def _generate_final_report_groq(per_file_summaries: list[dict]) -> str: | |
| """ | |
| Send all per-file summaries to Groq llama-3.3-70b and get back | |
| a structured markdown commit report. Fast (~2-4 s) and free of GPU cost. | |
| Reads GROQ_API_KEY from environment (set as a HF Space secret). | |
| """ | |
| groq_client = Groq(api_key=os.environ["GROQ_API_KEY"]) | |
| # Format per-file summaries as a clean user message | |
| user_content = "\n\n".join( | |
| f"### `{f['name']}`\n{f['summary']}" | |
| for f in per_file_summaries | |
| ) | |
| log.info("Calling Groq %s for final report (%d files) ...", GROQ_MODEL, len(per_file_summaries)) | |
| response = groq_client.chat.completions.create( | |
| model=GROQ_MODEL, | |
| messages=[ | |
| {"role": "system", "content": FINAL_SYSTEM_PROMPT}, | |
| {"role": "user", "content": user_content}, | |
| ], | |
| max_tokens=600, # 400-word cap + small buffer | |
| temperature=0.2, # low temp for consistent, factual output | |
| ) | |
| report = response.choices[0].message.content.strip() | |
| log.info("Groq report received (%d chars)", len(report)) | |
| return report | |
| # --------------------------------------------------------------------------- | |
| # gradio.Server app | |
| # --------------------------------------------------------------------------- | |
| app = Server() | |
| async def homepage(): | |
| html_path = Path(__file__).parent / "index.html" | |
| return HTMLResponse(content=html_path.read_text(encoding="utf-8")) | |
| def process_repo(repo_url: str, token: str) -> dict: | |
| """ | |
| Full pipeline: | |
| 1. run_pipeline() β Top 2 most changed file prompts (CPU, fast) | |
| 2. Mellum 2 sequential β per-file summaries (.md format) (GPU, sequential) | |
| 3. Groq 70B β final markdown summary report (API, ~3 s) | |
| Returns: { "files": [{"name": str, "summary": str}], "report": str } | |
| """ | |
| log.info("=== process_repo: %s ===", repo_url) | |
| _model.to("cuda") # move model to GPU for Mellum inference | |
| # Step 1 β fetch diff and build prompts (Now limited to top 2 files from commitlens.py) | |
| prompts = run_pipeline(repo_url, token.strip() or None) | |
| log.info("Got %d file prompts from pipeline (capped at top 2)", len(prompts)) | |
| if not prompts: | |
| raise ValueError("No matching source-code files changed in the latest commit.") | |
| fnames = [_extract_filename(p) for p in prompts] | |
| # Step 2 β Force sequential execution through Mellum 2 on GPU | |
| mellum_prompts = [_build_mellum_prompt(p) for p in prompts] | |
| summaries = _generate_sequential(mellum_prompts) | |
| file_results = [ | |
| {"name": n, "summary": s} | |
| for n, s in zip(fnames, summaries) | |
| ] | |
| log.info("Sequential per-file summaries done") | |
| # Step 3 β Send the 2 .md summaries to Groq for final summary generation | |
| final_report = _generate_final_report_groq(file_results) | |
| log.info("Pipeline complete β processed top %d files", len(file_results)) | |
| return {"files": file_results, "report": final_report} | |
| # --------------------------------------------------------------------------- | |
| # Boot | |
| # --------------------------------------------------------------------------- | |
| load_model_on_startup() # weights land in CPU RAM; GPU untouched until first request | |
| if __name__ == "__main__": | |
| log.info("Starting CommitLens ...") | |
| app.launch() | |