"""
Critic Agent - Evaluates all other agents' outputs for quality, accuracy, and completeness.

Checks logical clarity and conceptual accuracy, identifies redundancy between
perspectives, finds missing perspectives, and suggests improvements.
Returns a structured critique with scores.
"""
|
|
import json
import logging
import re
from itertools import combinations

from reasoning_forge.agents.base_agent import ReasoningAgent
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
class CriticAgent(ReasoningAgent):
    """Meta-evaluator that critiques the analyses produced by other agents.

    Two evaluation paths are offered:

    * ``evaluate_ensemble`` - deterministic keyword/statistics heuristics.
    * ``evaluate_ensemble_with_llm`` - asks ``self.orchestrator`` for a JSON
      critique and falls back to the heuristic path when that is not possible.

    ``self.orchestrator`` is not defined in this class; it is expected to be
    supplied by ``ReasoningAgent`` (not visible here - confirm against the
    base class).
    """

    name = "Critic"
    perspective = "meta_evaluative"
    # Passed as ``adapter_name`` to ``orchestrator.generate``.
    adapter_name = "multi_perspective"

    # ------------------------------------------------------------------
    # Heuristic vocabularies (built once instead of on every call).
    # ------------------------------------------------------------------

    # Logical connectives rewarded by the clarity score (whole-word match).
    _CONNECTIVES = (
        "because", "therefore", "thus", "however", "although",
        "consequently", "since", "given that", "implies",
        "it follows", "this means", "as a result", "in contrast",
        "specifically", "for example", "in particular",
    )

    # Vague filler penalised by the clarity score (whole-word match).
    _VAGUE_TERMS = (
        "things", "stuff", "a lot", "very", "really",
        "kind of", "sort of", "basically", "obviously",
    )

    # Generic template phrases penalised by the accuracy score.
    _PLACEHOLDERS = (
        "this concept can be approached",
        "from this perspective we see",
        "looking at this through",
        "applying this lens",
        "in conclusion",
        "to summarize",
    )

    # Analytical vocabulary rewarded by the accuracy score (prefix match, so
    # plurals and inflections such as "mechanisms" or "caused" still count).
    _DOMAIN_TERMS = (
        "mechanism", "cause", "effect", "evidence", "principle",
        "constraint", "trade-off", "interaction", "dynamic",
        "structure", "function", "process", "system", "pattern",
        "relationship", "variable", "outcome", "hypothesis",
        "implication", "assumption", "framework", "model",
    )

    # Words ignored when computing overlap between two analyses.  Only words
    # of four or more letters are ever compared, so the shorter entries are
    # kept purely for completeness.
    _STOP_WORDS = frozenset({
        "the", "a", "an", "is", "are", "was", "were", "be", "been",
        "being", "have", "has", "had", "do", "does", "did", "will",
        "would", "could", "should", "may", "might", "can", "shall",
        "of", "in", "to", "for", "with", "on", "at", "from", "by",
        "about", "as", "into", "through", "during", "before", "after",
        "and", "but", "or", "nor", "not", "so", "yet", "both",
        "this", "that", "these", "those", "it", "its", "they", "them",
        "their", "we", "our", "you", "your", "he", "she", "his", "her",
    })

    # (perspective label, indicator stems).  Stems are matched as word
    # prefixes, e.g. "quantif" matches "quantify" and "quantification".
    _PERSPECTIVE_CHECKS = (
        ("temporal/historical", (
            "history", "historical", "evolution", "over time", "timeline",
            "past", "trajectory", "precedent", "legacy",
        )),
        ("quantitative/statistical", (
            "statistic", "data", "quantif", "measur", "metric",
            "number", "percentage", "rate", "frequency",
        )),
        ("ecological/environmental", (
            "environment", "ecolog", "sustainab", "ecosystem",
            "resource", "footprint", "biodiversity", "pollution",
        )),
        ("economic/financial", (
            "economic", "financial", "cost", "benefit", "market",
            "incentive", "investment", "capital", "trade",
        )),
        ("legal/regulatory", (
            "legal", "law", "regulat", "compliance", "policy",
            "legislation", "governance", "jurisdiction",
        )),
        ("educational/pedagogical", (
            "learn", "teach", "education", "pedagog", "curriculum",
            "training", "skill", "literacy",
        )),
    )

    _EXAMPLE_INDICATORS = ("for example", "for instance", "such as", "e.g.", "consider")

    _ACTION_INDICATORS = (
        "should", "must", "recommend", "suggest", "action",
        "implement", "strategy", "step", "practice",
    )

    # Jaccard similarity above which two analyses are reported as redundant.
    _REDUNDANCY_THRESHOLD = 0.35

    # Maximum number of characters of each analysis quoted in the LLM prompt.
    _EXCERPT_CHARS = 300

    # ------------------------------------------------------------------
    # ReasoningAgent interface
    # ------------------------------------------------------------------

    def get_analysis_templates(self) -> list[str]:
        """Return the template strings for this agent.

        Each template carries a ``{concept}`` placeholder.
        """
        return [
            "Evaluating the ensemble analysis of '{concept}'.",
        ]

    def analyze(self, concept: str) -> str:
        """Analyze ``concept`` by delegating unchanged to the parent class.

        According to the original author's note, the parent uses the LLM when
        an orchestrator is available and templates otherwise; that logic lives
        in ``ReasoningAgent`` and is not visible here.
        """
        return super().analyze(concept)

    # ------------------------------------------------------------------
    # Ensemble evaluation
    # ------------------------------------------------------------------

    def evaluate_ensemble_with_llm(
        self,
        concept: str,
        analyses: dict[str, str],
    ) -> dict:
        """Ask the LLM for a JSON critique of the ensemble.

        Falls back to :meth:`evaluate_ensemble` when no orchestrator or
        adapter name is set, when generation raises, or when no JSON object
        can be parsed from the response.

        Args:
            concept: Original concept being analyzed.
            analyses: Dict of agent_name -> analysis_text.

        Returns:
            The dict parsed from the LLM response with ``"concept"`` added, or
            the heuristic critique on fallback.  The keys of an LLM critique
            are whatever the model produced; they are not validated here.
        """
        if not self.orchestrator or not self.adapter_name:
            return self.evaluate_ensemble(concept, analyses)

        eval_prompt = self._build_eval_prompt(concept, analyses)

        try:
            response, _tokens, _ = self.orchestrator.generate(
                query=eval_prompt,
                adapter_name=self.adapter_name,
                system_prompt="You are a meta-evaluator of reasoning quality. Reply in valid JSON.",
                enable_tools=False,
            )
            critique = self._parse_llm_critique(response)
        except Exception as exc:
            # Deliberate best-effort boundary: any orchestrator or parsing
            # failure must degrade to the heuristic critique, never crash.
            logger.warning("LLM evaluation failed: %s", exc)
            critique = None

        if critique is None:
            return self.evaluate_ensemble(concept, analyses)

        critique["concept"] = concept
        return critique

    def _build_eval_prompt(self, concept: str, analyses: dict[str, str]) -> str:
        """Build the evaluation prompt quoting an excerpt of every analysis."""
        limit = self._EXCERPT_CHARS
        excerpts = "\n\n".join(
            # Only mark an excerpt with an ellipsis when text was actually cut.
            f"**{agent}**:\n{text[:limit]}{'...' if len(text) > limit else ''}"
            for agent, text in analyses.items()
        )
        return (
            f'Evaluate this ensemble analysis of "{concept}":\n\n'
            f"{excerpts}\n\n"
            "Provide a JSON assessment with:\n"
            "- agent_scores: subjective quality scores per agent (0-1)\n"
            "- strengths: key insights across perspectives\n"
            "- weaknesses: gaps and redundancies\n"
            "- overall_quality: aggregate score (0-1)"
        )

    @staticmethod
    def _parse_llm_critique(response: str) -> "dict | None":
        """Extract the outermost ``{...}`` span of ``response`` as a dict.

        Returns ``None`` when no brace-delimited span exists or when the span
        is not valid JSON.
        """
        start = response.find("{")
        end = response.rfind("}") + 1
        if start < 0 or end <= start:
            logger.debug("No JSON object found in LLM evaluation")
            return None
        try:
            parsed = json.loads(response[start:end])
        except json.JSONDecodeError:
            logger.debug("Could not parse JSON from LLM evaluation")
            return None
        return parsed if isinstance(parsed, dict) else None

    def evaluate_ensemble(
        self,
        concept: str,
        analyses: dict[str, str],
    ) -> dict:
        """Evaluate all agent analyses heuristically and build a critique.

        Args:
            concept: The original concept being analyzed.
            analyses: Dict mapping agent_name -> analysis_text.

        Returns:
            Dictionary with keys ``concept``, ``agent_scores``,
            ``redundancies``, ``missing_perspectives``,
            ``improvement_suggestions`` and ``overall_quality``.  The overall
            quality is the mean of average clarity and average accuracy,
            minus 0.03 per redundancy and 0.05 per missing perspective,
            clamped to [0, 1]; it stays 0.0 when ``analyses`` is empty.
        """
        agent_scores: dict[str, dict] = {}
        clarity_total = 0.0
        accuracy_total = 0.0

        for agent_name, text in analyses.items():
            clarity = self._score_logical_clarity(text)
            accuracy = self._score_conceptual_accuracy(text, concept)
            agent_scores[agent_name] = {
                "logical_clarity": round(clarity, 2),
                "conceptual_accuracy": round(accuracy, 2),
                "combined": round((clarity + accuracy) / 2, 2),
            }
            clarity_total += clarity
            accuracy_total += accuracy

        redundancies = self._detect_redundancy(analyses)
        missing = self._find_missing_perspectives(concept, analyses)
        suggestions = self._suggest_improvements(concept, analyses, agent_scores)

        overall = 0.0
        agent_count = len(analyses)
        if agent_count > 0:
            mean_score = (clarity_total + accuracy_total) / (2 * agent_count)
            raw_score = mean_score - len(redundancies) * 0.03 - len(missing) * 0.05
            overall = round(max(0.0, min(1.0, raw_score)), 2)

        return {
            "concept": concept,
            "agent_scores": agent_scores,
            "redundancies": redundancies,
            "missing_perspectives": missing,
            "improvement_suggestions": suggestions,
            "overall_quality": overall,
        }

    # ------------------------------------------------------------------
    # Scoring heuristics
    # ------------------------------------------------------------------

    @staticmethod
    def _count_terms(text_lower: str, terms, *, allow_suffix: bool = False) -> int:
        """Count how many distinct ``terms`` occur in ``text_lower``.

        A term must start at a word boundary, so "very" no longer matches
        inside "every" and "cause" no longer matches inside "because".

        Args:
            text_lower: Already lower-cased text to search.
            terms: Iterable of lower-case words, stems or phrases.
            allow_suffix: When True the term may be followed by more word
                characters (prefix/stem match, e.g. "measur" -> "measured").
                When False the term must also end at a word boundary.
        """
        tail = "" if allow_suffix else r"(?!\w)"
        return sum(
            1
            for term in terms
            if re.search(r"(?<!\w)" + re.escape(term) + tail, text_lower)
        )

    def _score_logical_clarity(self, text: str) -> float:
        """Score the logical clarity of an analysis on a 0-1 scale.

        Starting from 0.5, the score is adjusted by:
        - logical connectives present (+0.025 each, capped at +0.2)
        - sentence-length variety, when there are at least three sentences
        - vague filler terms present (-0.03 each)
        - overall word count (bonus for 50-500 words, penalty below 30)
        """
        text_lower = text.lower()
        score = 0.5

        connective_hits = self._count_terms(text_lower, self._CONNECTIVES)
        score += min(0.2, connective_hits * 0.025)

        # Sentence variety: population standard deviation of words/sentence.
        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        if len(sentences) >= 3:
            lengths = [len(s.split()) for s in sentences]
            mean_len = sum(lengths) / len(lengths)
            variance = sum((n - mean_len) ** 2 for n in lengths) / len(lengths)
            std_dev = variance ** 0.5
            if 3 < std_dev < 15:
                score += 0.1
            elif std_dev >= 1:
                score += 0.05

        score -= self._count_terms(text_lower, self._VAGUE_TERMS) * 0.03

        word_count = len(text.split())
        if 80 <= word_count <= 300:
            score += 0.1
        elif 50 <= word_count < 80 or 300 < word_count <= 500:
            score += 0.05
        elif word_count < 30:
            score -= 0.15

        return max(0.0, min(1.0, score))

    def _score_conceptual_accuracy(self, text: str, concept: str) -> float:
        """Score how well the analysis engages with the concept (0-1).

        Starting from 0.5, the score is adjusted by:
        - share of the concept's 4+ letter words found in the text (up to +0.15)
        - generic placeholder phrases present (-0.05 each)
        - analytical vocabulary present (+0.02 each, capped at +0.2)
        - +0.1 when the text has at least three times as many words as the
          concept
        """
        text_lower = text.lower()
        score = 0.5

        concept_terms = set(re.findall(r"\b[a-z]{4,}\b", concept.lower()))
        if concept_terms:
            found = self._count_terms(text_lower, concept_terms, allow_suffix=True)
            score += (found / len(concept_terms)) * 0.15

        score -= self._count_terms(text_lower, self._PLACEHOLDERS) * 0.05

        domain_hits = self._count_terms(
            text_lower, self._DOMAIN_TERMS, allow_suffix=True
        )
        score += min(0.2, domain_hits * 0.02)

        if len(text.split()) >= len(concept.split()) * 3:
            score += 0.1

        return max(0.0, min(1.0, score))

    # ------------------------------------------------------------------
    # Redundancy detection
    # ------------------------------------------------------------------

    def _detect_redundancy(self, analyses: dict[str, str]) -> list[str]:
        """Report every pair of analyses whose word overlap exceeds 35%."""
        redundancies = []
        for (name_a, text_a), (name_b, text_b) in combinations(analyses.items(), 2):
            overlap = self._compute_content_overlap(text_a, text_b)
            if overlap > self._REDUNDANCY_THRESHOLD:
                redundancies.append(
                    f"{name_a} and {name_b} share significant thematic overlap "
                    f"({overlap:.0%}). Consider diversifying their angles of analysis."
                )
        return redundancies

    def _significant_words(self, text: str) -> set[str]:
        """Return the lower-cased ASCII words of 4+ letters, minus stop words."""
        return set(re.findall(r"\b[a-z]{4,}\b", text.lower())) - self._STOP_WORDS

    def _compute_content_overlap(self, text_a: str, text_b: str) -> float:
        """Compute Jaccard similarity of the two texts' significant word sets.

        Returns 0.0 when either text has no significant words.
        """
        words_a = self._significant_words(text_a)
        words_b = self._significant_words(text_b)
        if not words_a or not words_b:
            return 0.0
        return len(words_a & words_b) / len(words_a | words_b)

    # ------------------------------------------------------------------
    # Gap analysis and suggestions
    # ------------------------------------------------------------------

    def _find_missing_perspectives(
        self, concept: str, analyses: dict[str, str]
    ) -> list[str]:
        """Identify perspectives that are absent from the ensemble.

        A perspective counts as missing when fewer than two of its indicator
        stems occur anywhere in the combined analyses.  At most three messages
        are returned, in the order the perspectives are declared.
        """
        all_text = " ".join(analyses.values()).lower()

        missing = []
        for perspective_name, indicators in self._PERSPECTIVE_CHECKS:
            found = self._count_terms(all_text, indicators, allow_suffix=True)
            if found < 2:
                missing.append(
                    f"The ensemble lacks a {perspective_name} perspective. "
                    f"Consider how '{concept}' relates to {perspective_name} dimensions."
                )
        return missing[:3]

    def _suggest_improvements(
        self,
        concept: str,
        analyses: dict[str, str],
        scores: dict[str, dict],
    ) -> list[str]:
        """Generate up to four actionable improvement suggestions.

        Checks, in order: the weakest agent (if its combined score is below
        0.6), lack of concrete examples, lack of cross-references between
        agents, and lack of prescriptive language.

        Args:
            concept: The concept being analyzed (currently unused).
            analyses: Dict mapping agent_name -> analysis_text.
            scores: Per-agent score dicts, each with a ``"combined"`` entry.
        """
        suggestions = []

        if scores:
            weakest_name, weakest = min(
                scores.items(), key=lambda item: item[1]["combined"]
            )
            if weakest["combined"] < 0.6:
                suggestions.append(
                    f"The {weakest_name} analysis scored lowest ({weakest['combined']:.2f}). "
                    f"It would benefit from more specific engagement with the concept's "
                    f"concrete details rather than abstract framing."
                )

        # Lower-case each analysis once; reused by every check below.
        lowered = {agent: text.lower() for agent, text in analyses.items()}
        all_text = " ".join(lowered.values())

        example_count = self._count_terms(
            all_text, self._EXAMPLE_INDICATORS, allow_suffix=True
        )
        if example_count < 2:
            suggestions.append(
                "The ensemble would benefit from more concrete examples and "
                "illustrations. Abstract reasoning without grounding in specifics "
                "is less persuasive and harder to verify."
            )

        # An agent is cross-referenced only when ANOTHER agent's analysis
        # mentions it; a text naming its own author is not a cross-reference.
        cross_references = sum(
            1
            for agent in lowered
            if any(
                self._count_terms(other_text, (agent.lower(),), allow_suffix=True)
                for other, other_text in lowered.items()
                if other != agent
            )
        )
        if cross_references < 2:
            suggestions.append(
                "The analyses operate largely in isolation. The synthesis would benefit "
                "from explicit cross-referencing between perspectives -- showing where "
                "they agree, disagree, or complement each other."
            )

        action_count = self._count_terms(
            all_text, self._ACTION_INDICATORS, allow_suffix=True
        )
        if action_count < 3:
            suggestions.append(
                "The ensemble is more diagnostic than prescriptive. Adding concrete, "
                "actionable recommendations would increase practical value."
            )

        return suggestions[:4]
|
|