xiaoqi-wang
/

miqa

miqa

Model card Files Files and versions

xet

Community

xiaoqi-wang committed 25 days ago

Commit

a7301d6

verified ·

1 Parent(s): 26ee8b6

Upload video_analytics_inference.py with huggingface_hub

Browse files

Files changed (1) hide show

video_analytics_inference.py +883 -0

video_analytics_inference.py ADDED Viewed

	@@ -0,0 +1,883 @@

+import os
+import sys
+import torch
+import argparse
+import logging
+from pathlib import Path
+from typing import List, Dict, Optional, Tuple
+from collections import OrderedDict, defaultdict
+import json
+from datetime import datetime
+from tqdm import tqdm
+import numpy as np
+import cv2
+# Image processing imports
+from PIL import Image, ImageDraw, ImageFont
+import torchvision.transforms as transforms
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.use('Agg')  # Use non-interactive backend
+# Import your existing model components
+from models.MIQA_base import get_torch_model, get_timm_model
+from models.RA_MIQA import RegionVisionTransformer
+from models.hf_model_registry import HF_REPO_ID, HF_REVISION, MODEL_FILENAMES
+from utils.hf_download_utils import ensure_checkpoint_from_hf
+# Supported file extensions
+SUPPORTED_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.JPEG', '.png', '.bmp', '.tiff', '.tif'}
+SUPPORTED_VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm'}
+class VideoFrameExtractor:
+    """
+    Extracts and samples frames from video files intelligently.
+    This class handles different sampling strategies to balance between
+    thoroughness and computational efficiency.
+    """
+    def __init__(self, sampling_strategy: str = 'uniform',
+                 target_frames: int = 30,
+                 fps_sample: Optional[float] = None):
+        """
+        Initialize the frame extractor.
+        Args:
+            sampling_strategy: How to sample frames - 'uniform', 'fps', or 'keyframe'
+            target_frames: Target number of frames to extract (for uniform sampling)
+            fps_sample: Sample rate in frames per second (for fps sampling)
+        """
+        self.sampling_strategy = sampling_strategy
+        self.target_frames = target_frames
+        self.fps_sample = fps_sample
+    def extract_frames(self, video_path: str) -> Tuple[List[np.ndarray], List[float], Dict]:
+        """
+        Extract frames from video based on sampling strategy.
+        Returns:
+            Tuple of (frames_list, timestamps_list, video_metadata)
+        """
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            raise ValueError(f"Cannot open video file: {video_path}")
+        # Get video properties
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        duration = total_frames / fps if fps > 0 else 0
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        metadata = {
+            'total_frames': total_frames,
+            'fps': fps,
+            'duration': duration,
+            'width': width,
+            'height': height
+        }
+        # Determine which frames to sample
+        frame_indices = self._get_sample_indices(total_frames, fps)
+        frames = []
+        timestamps = []
+        for idx in frame_indices:
+            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+            ret, frame = cap.read()
+            if ret:
+                # Convert BGR to RGB (OpenCV uses BGR)
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frames.append(frame_rgb)
+                # Calculate timestamp in seconds
+                timestamp = idx / fps if fps > 0 else idx
+                timestamps.append(timestamp)
+        cap.release()
+        return frames, timestamps, metadata
+    def _get_sample_indices(self, total_frames: int, fps: float) -> List[int]:
+        """
+        Determine which frame indices to sample based on strategy.
+        """
+        if self.sampling_strategy == 'uniform':
+            # Sample frames uniformly across the video
+            if total_frames <= self.target_frames:
+                return list(range(total_frames))
+            else:
+                # Calculate step size to get approximately target_frames
+                step = total_frames / self.target_frames
+                indices = [int(i * step) for i in range(self.target_frames)]
+                return indices
+        elif self.sampling_strategy == 'fps':
+            # Sample at a specific frame rate
+            if self.fps_sample is None:
+                raise ValueError("fps_sample must be specified for fps sampling strategy")
+            frame_interval = max(1, int(fps / self.fps_sample))
+            indices = list(range(0, total_frames, frame_interval))
+            return indices
+        else:
+            raise ValueError(f"Unknown sampling strategy: {self.sampling_strategy}")
+def aggregate_scores_by_second(frame_results: List[Dict]) -> List[Dict]:
+    """
+    Aggregate frame-level quality scores to per-second averages.
+    This function groups all frames that fall within the same second
+    and computes their average quality score. This provides a smoothed
+    view of quality over time, reducing noise from frame-to-frame variations.
+    Args:
+        frame_results: List of dictionaries with 'timestamp' and 'quality_score'
+    Returns:
+        List of dictionaries with per-second aggregated scores
+    """
+    # Group frames by their second (floor of timestamp)
+    seconds_data = defaultdict(list)
+    for frame in frame_results:
+        second = int(frame['timestamp'])  # Floor to nearest second
+        seconds_data[second].append(frame['quality_score'])
+    # Calculate average for each second
+    per_second_results = []
+    for second in sorted(seconds_data.keys()):
+        scores = seconds_data[second]
+        per_second_results.append({
+            'second': second,
+            'timestamp': float(second),  # Use second as timestamp for plotting
+            'quality_score': np.mean(scores),
+            'min_score': np.min(scores),
+            'max_score': np.max(scores),
+            'num_frames': len(scores),
+            'std_score': np.std(scores) if len(scores) > 1 else 0.0
+        })
+    return per_second_results
+class MIQAInference:
+    """
+    Inference wrapper for MIQA models supporting both images and videos.
+    """
+    def __init__(self, task: str, model_name: str = 'ra_miqa',
+                 metric_type: str = 'composite', device: Optional[str] = None,
+                 video_sampling: str = 'uniform', video_target_frames: int = 30):
+        """
+        Initialize the MIQA inference system.
+        Args:
+            task: Task type - 'cls', 'det', or 'ins'
+            model_name: Model architecture to use
+            metric_type: Training objective - 'composite', 'consistency', or 'accuracy'
+            device: Device to run inference on
+            video_sampling: Frame sampling strategy for videos
+            video_target_frames: Target number of frames to extract from videos
+        """
+        self.task = task.lower()
+        self.model_name = model_name
+        self.metric_type = metric_type
+        self.video_target_frames = video_target_frames
+        # Setup logging
+        self.logger = self._setup_logger()
+        # Determine device
+        if device is None:
+            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        else:
+            self.device = torch.device(device)
+        self.logger.info(f"🚀 Initializing MIQA Inference System")
+        self.logger.info(f"   Task: {self.task.upper()}")
+        self.logger.info(f"   Model: {self.model_name}")
+        self.logger.info(f"   Device: {self.device}")
+        # Validate configuration
+        self._validate_config()
+        # Initialize model
+        self.model = self._load_model()
+        # Setup image preprocessing
+        self.transforms1, self.transforms2 = self._get_transforms()
+        # Initialize video frame extractor
+        self.frame_extractor = VideoFrameExtractor(
+            sampling_strategy=video_sampling,
+            target_frames=video_target_frames
+        )
+        self.logger.info("✅ System ready for inference\n")
+    def _setup_logger(self) -> logging.Logger:
+        """Configure logging with both file and console output."""
+        logger = logging.getLogger('MIQA_Inference')
+        logger.setLevel(logging.INFO)
+        if logger.hasHandlers():
+            return logger
+        logger.propagate = False
+        # Console handler with clean formatting
+        console_handler = logging.StreamHandler(sys.stdout)
+        console_handler.setLevel(logging.INFO)
+        console_formatter = logging.Formatter('%(message)s')
+        console_handler.setFormatter(console_formatter)
+        logger.addHandler(console_handler)
+        return logger
+    def _validate_config(self) -> None:
+        """Validate that the requested configuration is supported."""
+        if self.metric_type not in ['composite', 'consistency', 'accuracy']:
+            raise ValueError(
+                f"Invalid metric_type '{self.metric_type}'. "
+                f"Supported: ['composite', 'consistency', 'accuracy']"
+            )
+        if self.task not in MODEL_FILENAMES[self.metric_type]:
+            raise ValueError(
+                f"Invalid task '{self.task}'. "
+                f"Supported tasks: {list(MODEL_FILENAMES[self.metric_type].keys())}"
+            )
+        if self.model_name not in MODEL_FILENAMES[self.metric_type][self.task]:
+            available = list(MODEL_FILENAMES[self.metric_type][self.task].keys())
+            raise ValueError(
+                f"Model '{self.model_name}' not available for task '{self.task}'. "
+                f"Available models: {available}"
+            )
+    def _get_checkpoint_path(self) -> str:
+        """Generate the path where model checkpoint should be stored."""
+        base_dir = Path('models') / 'checkpoints' / f'{self.metric_type}_metric'
+        base_dir.mkdir(parents=True, exist_ok=True)
+        filename = MODEL_FILENAMES[self.metric_type][self.task][self.model_name]
+        return str(base_dir / filename)
+    def _download_weights(self, checkpoint_path: str) -> bool:
+        """
+        Download model weights if not present locally.
+        Returns:
+            True if weights are available (already existed or successfully downloaded)
+        """
+        if os.path.exists(checkpoint_path):
+            self.logger.info(f"✓ Found cached model weights")
+            return True
+        self.logger.info(
+            f"⏬ Downloading from Hugging Face: repo={HF_REPO_ID}, "
+            f"file={Path(checkpoint_path).name}, rev={HF_REVISION}"
+        )
+        try:
+            ensure_checkpoint_from_hf(
+                repo_id=HF_REPO_ID,
+                filename=Path(checkpoint_path).name,
+                local_dir=str(Path(checkpoint_path).parent),
+                revision=HF_REVISION,
+            )
+            self.logger.info("✓ Successfully downloaded model weights")
+            return True
+        except Exception as e:
+            self.logger.error(f"❌ Failed to download model weights from Hugging Face: {e}")
+            return False
+    def _create_model(self) -> torch.nn.Module:
+        """Create the model architecture."""
+        if self.model_name == 'ra_miqa':
+            self.logger.info("Building Region-Aware Vision Transformer...")
+            model = RegionVisionTransformer(
+                base_model_name='vit_small_patch16_224',
+                pretrained=False,  # We'll load our trained weights
+                mmseg_config_path='models/model_configs/fcn_sere-small_finetuned_fp16_8x32_224x224_3600_imagenets919.py',
+                checkpoint_path='models/checkpoints/sere_finetuned_vit_small_ep100.pth'
+            )
+        else:
+            try:
+                self.logger.info(f"Building {self.model_name} from PyTorch...")
+                model = get_torch_model(model_name=self.model_name, pretrained=False, num_classes=1)
+            except Exception:
+                self.logger.info(f"Building {self.model_name} from timm library...")
+                model = get_timm_model(model_name=self.model_name, pretrained=False, num_classes=1)
+        return model
+    def _load_model(self) -> torch.nn.Module:
+        """Load model with weights."""
+        checkpoint_path = self._get_checkpoint_path()
+        if not self._download_weights(checkpoint_path):
+            raise RuntimeError("Cannot proceed without model weights")
+        self.logger.info("🔧 Loading model...")
+        model = self._create_model()
+        checkpoint = torch.load(checkpoint_path, map_location='cpu')
+        state_dict = checkpoint.get('state_dict', checkpoint)
+        new_state_dict = OrderedDict()
+        for k, v in state_dict.items():
+            name = k.replace('module.', '') if k.startswith('module.') else k
+            new_state_dict[name] = v
+        model.load_state_dict(new_state_dict, strict=True)
+        model = model.to(self.device)
+        model.eval()
+        self.logger.info("✓ Model loaded successfully")
+        return model
+    def _get_transforms(self) -> [transforms.Compose, transforms.Compose]:
+        """
+        Get image preprocessing transforms.
+        These transforms normalize images to match the training distribution.
+        """
+        IMAGENET_MEAN = (0.485, 0.456, 0.406)
+        IMAGENET_STD = (0.229, 0.224, 0.225)
+        SIMPLE_MEAN = (0.5, 0.5, 0.5)
+        SIMPLE_STD = (0.5, 0.5, 0.5)
+        transforms_list1 = [
+            transforms.Resize(288),
+            transforms.CenterCrop(size=224),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=SIMPLE_MEAN,
+                                 std=SIMPLE_STD)
+        ]
+        transform_list_2 = [
+            transforms.Resize(288),
+            transforms.CenterCrop((288, 288)),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=IMAGENET_MEAN,
+                                 std=IMAGENET_STD)
+        ]
+        return transforms.Compose(transforms_list1), transforms.Compose(transform_list_2)
+    def _prepare_frame(self, frame: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Preprocess a video frame for model input.
+        Args:
+            frame: Numpy array in RGB format
+        Returns:
+            Tuple of (cropped_tensor, resized_tensor)
+        """
+        # Convert numpy array to PIL Image
+        img = Image.fromarray(frame)
+        # Apply transforms
+        img1 = self.transforms1(img).unsqueeze(0)
+        img2 = self.transforms2(img).unsqueeze(0)
+        return img1, img2
+    def _prepare_image(self, image_path: str) -> Tuple[torch.Tensor, torch.Tensor, Image.Image]:
+        """Load and preprocess an image file."""
+        img = Image.open(image_path).convert('RGB')
+        img1 = self.transforms1(img).unsqueeze(0)
+        img2 = self.transforms2(img).unsqueeze(0)
+        return img1, img2, img
+    @torch.no_grad()
+    def predict_single_image(self, image_path: str) -> Dict:
+        """Run inference on a single image."""
+        img_cropped, img_resized, original_img = self._prepare_image(image_path)
+        img_cropped = img_cropped.to(self.device)
+        img_resized = img_resized.to(self.device)
+        output = self.model(img_cropped, img_resized)
+        score = output.item()
+        return {
+            'image_path': image_path,
+            'image_name': Path(image_path).name,
+            'quality_score': score,
+            'original_image': original_img,
+            'type': 'image'
+        }
+    @torch.no_grad()
+    def predict_video(self, video_path: str, show_progress: bool = True) -> Dict:
+        """
+        Run inference on a video file.
+        This extracts frames, predicts quality for each, aggregates to per-second
+        averages, and returns comprehensive time-series data suitable for visualization.
+        """
+        self.logger.info(f"🎬 Processing video: {Path(video_path).name}")
+        # Extract frames
+        frames, timestamps, metadata = self.frame_extractor.extract_frames(video_path)
+        self.logger.info(f"   Extracted {len(frames)} frames from {metadata['duration']:.1f}s video")
+        # Process each frame
+        frame_results = []
+        iterator = tqdm(frames, desc="Analyzing frames", disable=not show_progress, ncols=80)
+        for frame, timestamp in zip(iterator, timestamps):
+            img_cropped, img_resized = self._prepare_frame(frame)
+            img_cropped = img_cropped.to(self.device)
+            img_resized = img_resized.to(self.device)
+            output = self.model(img_cropped, img_resized)
+            score = output.item()
+            frame_results.append({
+                'timestamp': timestamp,
+                'quality_score': score,
+                'frame': frame  # Store for visualization if needed
+            })
+        # Aggregate frame results by second
+        per_second_results = aggregate_scores_by_second(frame_results)
+        # Calculate statistics from per-second data for better representation
+        second_scores = [r['quality_score'] for r in per_second_results]
+        return {
+            'video_path': video_path,
+            'video_name': Path(video_path).name,
+            'type': 'video',
+            'metadata': metadata,
+            'frame_results': frame_results,
+            'per_second_results': per_second_results,  # NEW: Added per-second aggregation
+            'num_frames_analyzed': len(frame_results),
+            'num_seconds': len(per_second_results),  # NEW: Number of unique seconds
+            'average_quality': np.mean(second_scores),
+            'min_quality': np.min(second_scores),
+            'max_quality': np.max(second_scores),
+            'std_quality': np.std(second_scores)
+        }
+    def predict(self, input_path: str, show_progress: bool = True) -> List[Dict]:
+        """
+        Main prediction interface - handles images, videos, and directories.
+        """
+        input_path = Path(input_path)
+        # Handle single file
+        if input_path.is_file():
+            ext = input_path.suffix.lower()
+            if ext in SUPPORTED_IMAGE_EXTENSIONS:
+                return [self.predict_single_image(str(input_path))]
+            elif ext in SUPPORTED_VIDEO_EXTENSIONS:
+                return [self.predict_video(str(input_path), show_progress)]
+            else:
+                raise ValueError(f"Unsupported file extension: {ext}")
+        # Handle directory
+        elif input_path.is_dir():
+            results = []
+            # Find all supported files
+            image_paths = []
+            video_paths = []
+            for ext in SUPPORTED_IMAGE_EXTENSIONS:
+                image_paths.extend(input_path.glob(f"*{ext}"))
+            for ext in SUPPORTED_VIDEO_EXTENSIONS:
+                video_paths.extend(input_path.glob(f"*{ext}"))
+            image_paths = sorted([str(p) for p in image_paths])
+            video_paths = sorted([str(p) for p in video_paths])
+            if not image_paths and not video_paths:
+                raise ValueError(f"No supported files found in {input_path}")
+            self.logger.info(f"📁 Found {len(image_paths)} images and {len(video_paths)} videos")
+            # Process images
+            if image_paths:
+                for img_path in tqdm(image_paths, desc="Processing images", ncols=80):
+                    try:
+                        result = self.predict_single_image(img_path)
+                        results.append(result)
+                    except Exception as e:
+                        self.logger.warning(f"⚠️  Failed: {img_path}")
+            # Process videos
+            if video_paths:
+                for vid_path in video_paths:
+                    try:
+                        result = self.predict_video(vid_path, show_progress)
+                        results.append(result)
+                    except Exception as e:
+                        self.logger.warning(f"⚠️  Failed: {vid_path}")
+            return results
+        else:
+            raise ValueError(f"Input path does not exist: {input_path}")
+    def visualize_video_results(self, video_result: Dict, output_dir: str = 'inference_results',
+                                granularity: str = 'second') -> None:
+        """
+        Plot a video's quality scores over time and save the figure as a PNG.
+        Args:
+            video_result: Result dictionary for one video. This method reads its
+                'frame_results', 'per_second_results', 'average_quality',
+                'min_quality', 'max_quality', 'std_quality',
+                'num_frames_analyzed', 'num_seconds', 'video_name' and
+                'metadata' entries.
+            output_dir: Directory to save visualizations (created if missing)
+            granularity: 'both' draws frame-level and per-second plots side by
+                side; 'frame' draws a single frame-level plot; any other value
+                is handled by the per-second branch.
+        """
+        output_path = Path(output_dir)
+        output_path.mkdir(parents=True, exist_ok=True)
+        if granularity == 'both':
+            # Create side-by-side comparison
+            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))
+            # Frame-level plot (left)
+            frame_results = video_result['frame_results']
+            frame_timestamps = [r['timestamp'] for r in frame_results]
+            frame_scores = [r['quality_score'] for r in frame_results]
+            ax1.plot(frame_timestamps, frame_scores, linewidth=1.5, color='#2E86AB',
+                     marker='o', markersize=3, markerfacecolor='white', markeredgewidth=1,
+                     alpha=0.7, label='Frame-level')
+            ax1.set_xlabel('Time (seconds)', fontsize=11, fontweight='bold')
+            ax1.set_ylabel('Quality Score', fontsize=11, fontweight='bold')
+            ax1.set_title('Frame-Level Quality', fontsize=12, fontweight='bold')
+            ax1.grid(True, alpha=0.3, linestyle='--')
+            # NOTE(review): the y-axis is fixed to [0, 1] only in 'both' mode - confirm intended
+            ax1.set_ylim(0, 1)
+            # Second-level plot (right)
+            second_results = video_result['per_second_results']
+            second_timestamps = [r['timestamp'] for r in second_results]
+            second_scores = [r['quality_score'] for r in second_results]
+            ax2.plot(second_timestamps, second_scores, linewidth=2.5, color='#A23B72',
+                     marker='s', markersize=6, markerfacecolor='white', markeredgewidth=1.5,
+                     label='Per-second average')
+            # Shade a +/- one standard deviation band showing variability within each second
+            if second_results and 'std_score' in second_results[0]:
+                stds = [r['std_score'] for r in second_results]
+                ax2.fill_between(second_timestamps,
+                                 np.array(second_scores) - np.array(stds),
+                                 np.array(second_scores) + np.array(stds),
+                                 alpha=0.2, color='#A23B72')
+            ax2.set_xlabel('Time (seconds)', fontsize=11, fontweight='bold')
+            ax2.set_ylabel('Quality Score', fontsize=11, fontweight='bold')
+            ax2.set_title('Per-Second Averaged Quality', fontsize=12, fontweight='bold')
+            ax2.grid(True, alpha=0.3, linestyle='--')
+            ax2.set_ylim(0, 1)
+            # Add overall average line to both
+            avg_score = video_result['average_quality']
+            ax1.axhline(y=avg_score, color='#F18F01', linestyle='--',
+                        linewidth=1.5, alpha=0.7, label=f'Overall avg: {avg_score:.2f}')
+            ax2.axhline(y=avg_score, color='#F18F01', linestyle='--',
+                        linewidth=1.5, alpha=0.7, label=f'Overall avg: {avg_score:.2f}')
+            ax1.legend(loc='best', framealpha=0.9)
+            ax2.legend(loc='best', framealpha=0.9)
+            plt.suptitle(f"Video Quality Analysis: {video_result['video_name']}, {self.task}-oriented MIQA",
+                         fontsize=14, fontweight='bold', y=1.02)
+            # suffix becomes the last component of the output file name
+            suffix = 'comparison'
+        else:
+            # Single plot based on selected granularity
+            plt.figure(figsize=(14, 6))
+            if granularity == 'frame':
+                frame_results = video_result['frame_results']
+                timestamps = [r['timestamp'] for r in frame_results]
+                scores = [r['quality_score'] for r in frame_results]
+                plot_color = '#2E86AB'
+                plot_label = 'Frame-level quality'
+                title_suffix = '(Frame-Level)'
+                suffix = 'frame'
+                marker_size = 4
+            else:  # second (also reached for any unrecognised granularity value)
+                second_results = video_result['per_second_results']
+                timestamps = [r['timestamp'] for r in second_results]
+                scores = [r['quality_score'] for r in second_results]
+                plot_color = '#A23B72'
+                plot_label = 'Per-second average'
+                title_suffix = '(Per-Second Average)'
+                suffix = 'second'
+                marker_size = 6
+            # Main quality plot
+            plt.plot(timestamps, scores, linewidth=2, color=plot_color, marker='o',
+                     markersize=marker_size, markerfacecolor='white', markeredgewidth=1.5,
+                     label=plot_label)
+            # Add shaded region for second-level showing variability; the
+            # granularity check comes first so second_results is only read
+            # when the per-second branch above defined it
+            if granularity == 'second' and second_results and 'std_score' in second_results[0]:
+                stds = [r['std_score'] for r in second_results]
+                plt.fill_between(timestamps,
+                             np.array(scores) - np.array(stds),
+                             np.array(scores) + np.array(stds),
+                             alpha=0.2, color=plot_color)
+            # Add average line
+            avg_score = video_result['average_quality']
+            plt.axhline(y=avg_score, color='#F18F01', linestyle='--',
+                        linewidth=1.5, label=f'Average: {avg_score:.2f}')
+            # Styling
+            plt.xlabel('Time (seconds)', fontsize=12, fontweight='bold')
+            plt.ylabel('Quality Score', fontsize=12, fontweight='bold')
+            plt.title(f"Video Quality Analysis: {video_result['video_name']} {title_suffix}",
+                      fontsize=14, fontweight='bold', pad=20)
+            plt.grid(True, alpha=0.3, linestyle='--')
+            plt.legend(loc='best', framealpha=0.9)
+        # Add statistics box (text anchored at the top-left of the axes)
+        if granularity == 'both':
+            stats_text = (
+                f"Duration: {video_result['metadata']['duration']:.1f}s\n"
+                f"Frames: {video_result['num_frames_analyzed']} | "
+                f"Seconds: {video_result['num_seconds']}\n"
+                f"Score Range: [{video_result['min_quality']:.2f}, {video_result['max_quality']:.2f}]\n"
+                f"Std Dev: {video_result['std_quality']:.2f}"
+            )
+            # Add to the right subplot
+            ax2.text(0.02, 0.98, stats_text, transform=ax2.transAxes,
+                 fontsize=9, verticalalignment='top',
+                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
+        else:
+            stats_text = (
+                f"Duration: {video_result['metadata']['duration']:.1f}s\n"
+                f"Frames Analyzed: {video_result['num_frames_analyzed']}\n"
+                f"Unique Seconds: {video_result['num_seconds']}\n"
+                f"Score Range: [{video_result['min_quality']:.2f}, {video_result['max_quality']:.2f}]\n"
+                f"Std Dev: {video_result['std_quality']:.2f}"
+            )
+            plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes,
+                     fontsize=12, verticalalignment='top',
+                     bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.6))
+        plt.tight_layout()
+        # Save figure as <video stem>_<metric_type>_quality_<suffix>.png
+        output_file = output_path / f"{Path(video_result['video_name']).stem}_{self.metric_type}_quality_{suffix}.png"
+        plt.savefig(output_file, dpi=300, bbox_inches='tight')
+        # Close the current figure now that it has been written
+        plt.close()
+        self.logger.info(f"   Saved visualization: {output_file.name}")
+    def visualize_results(self, results: List[Dict], output_dir: str = 'inference_results',
+                          video_granularity: str = 'second') -> None:
+        """
+        Create visualizations for all results (images and videos).
+        Args:
+            results: List of prediction results
+            output_dir: Directory to save visualizations
+            video_granularity: For videos - 'frame', 'second', or 'both'
+        """
+        output_path = Path(output_dir)
+        output_path.mkdir(parents=True, exist_ok=True)
+        self.logger.info(f"\n🎨 Creating visualizations (granularity: {video_granularity})...")
+        for result in results:
+            if result['type'] == 'video':
+                self.visualize_video_results(result, output_dir, video_granularity)
+            elif result['type'] == 'image' and result.get('quality_score') is not None:
+                # Use original image visualization logic
+                img = result['original_image'].copy()
+                draw = ImageDraw.Draw(img)
+                score = result['quality_score']
+                score_text = f"Quality: {score:.3f}"
+                # Simple color coding (adjust range as needed)
+                # norm_score = score / 100.0
+                norm_score = max(0, score)
+                if norm_score < 0.5:
+                    r, g, b = 255, int(255 * norm_score * 2), 0
+                else:
+                    r, g, b = int(255 * (2 - norm_score * 2)), 255, 0
+                color = (r, g, b)
+                box_coords = [10, 10, 260, 60]
+                draw.rectangle(box_coords, fill=color)
+                try:
+                    font = ImageFont.truetype("arial.ttf", 24)
+                except:
+                    font = ImageFont.load_default()
+                draw.text((15, 20), score_text, fill='black', font=font)
+                output_file = output_path / f"annotated_{result['image_name']}"
+                img.save(output_file)
+        self.logger.info(f"✓ Visualizations saved to: {output_dir}/")
+    def save_results(self, results: List[Dict], output_path: str = 'predictions.json') -> None:
+        """Save prediction results to JSON file."""
+        # Clean results for JSON serialization
+        clean_results = []
+        for r in results:
+            clean_r = {k: v for k, v in r.items()
+                       if k not in ['original_image', 'frame']}
+            # For video results, remove frame data but keep scores
+            if clean_r.get('type') == 'video' and 'frame_results' in clean_r:
+                clean_r['frame_results'] = [
+                    {k: v for k, v in fr.items() if k != 'frame'}
+                    for fr in clean_r['frame_results']
+                ]
+            clean_results.append(clean_r)
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, 'w') as f:
+            json.dump({
+                'metadata': {
+                    'task': self.task,
+                    'model': self.model_name,
+                    'metric_type': self.metric_type,
+                    'timestamp': datetime.now().isoformat(),
+                    'total_files': len(clean_results)
+                },
+                'predictions': clean_results
+            }, f, indent=2)
+        self.logger.info(f"💾 Results saved to: {output_path}")
+    def print_summary(self, results: List[Dict]) -> None:
+        """Print formatted summary of prediction results."""
+        self.logger.info("\n" + "=" * 80)
+        self.logger.info("PREDICTION SUMMARY")
+        self.logger.info("=" * 80)
+        image_results = [r for r in results if r.get('type') == 'image']
+        video_results = [r for r in results if r.get('type') == 'video']
+        if image_results:
+            valid_images = [r for r in image_results if r.get('quality_score') is not None]
+            if valid_images:
+                scores = [r['quality_score'] for r in valid_images]
+                self.logger.info(f"\n📸 Image Analysis ({len(valid_images)} images)")
+                self.logger.info(f"   Average quality: {np.mean(scores):.2f}")
+                self.logger.info(f"   Score range: [{np.min(scores):.2f}, {np.max(scores):.2f}]")
+        if video_results:
+            self.logger.info(f"\n🎬 Video Analysis ({len(video_results)} videos)")
+            for vr in video_results:
+                self.logger.info(f"\n   {vr['video_name']}:")
+                self.logger.info(f"      Duration: {vr['metadata']['duration']:.1f}s")
+                self.logger.info(f"      Frames analyzed: {vr['num_frames_analyzed']}")
+                self.logger.info(f"      Unique seconds: {vr['num_seconds']}")
+                self.logger.info(f"      Average quality (per-second): {vr['average_quality']:.2f}")
+                self.logger.info(f"      Quality range: [{vr['min_quality']:.2f}, {vr['max_quality']:.2f}]")
+                self.logger.info(f"      Variability (std): {vr['std_quality']:.2f}")
+        self.logger.info("\n" + "=" * 80 + "\n")
+def main():
+    """Command-line interface for MIQA inference."""
+    parser = argparse.ArgumentParser(
+        description='MIQA: Machine-centric Image and Video Quality Assessment',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Analyze video with per-second visualization
+  python video_analytics_inference.py --input video.mp4 --task cls --visualize --viz-granularity second
+  # Analyze video with both frame and second visualizations
+  python video_analytics_inference.py --input video.mp4 --task cls --visualize --viz-granularity both
+  # Process directory with frame-level visualization
+  python video_analytics_inference.py --input ./assets/demo_video --task det --video-frames 120 --visualize --viz-granularity second --metric-type consistency
+        """
+    )
+    parser.add_argument('--input', type=str, required=True,
+                        help='Path to input image/video or directory')
+    parser.add_argument('--task', type=str, required=True,
+                        choices=['cls', 'det', 'ins'],
+                        help='Task type')
+    parser.add_argument('--model', type=str, default='ra_miqa',
+                        choices=['ra_miqa'],
+                        help='Model architecture (RA-MIQA only; matches Hub registry)')
+    parser.add_argument('--metric-type', type=str, default='composite',
+                        choices=['composite', 'consistency', 'accuracy'],
+                        help='Training metric type')
+    parser.add_argument('--device', type=str, default=None,
+                        choices=['cuda', 'cpu'],
+                        help='Device to run on')
+    parser.add_argument('--video-frames', type=int, default=50,
+                        help='Target number of frames to sample from videos')
+    parser.add_argument('--save-results', action='store_true',
+                        help='Save prediction results to file')
+    parser.add_argument('--output-file', type=str, default='predictions.json',
+                        help='Output file path')
+    parser.add_argument('--visualize', action='store_true',
+                        help='Create visualizations')
+    parser.add_argument('--viz-dir', type=str, default='inference_results',
+                        help='Directory for visualizations')
+    parser.add_argument('--viz-granularity', type=str, default='second',
+                        choices=['frame', 'second', 'both'],
+                        help='Visualization granularity for videos: frame-level, per-second, or both')
+    parser.add_argument('--no-progress', action='store_true',
+                        help='Disable progress bar')
+    args = parser.parse_args()
+    try:
+        miqa = MIQAInference(
+            task=args.task,
+            model_name=args.model,
+            metric_type=args.metric_type,
+            device=args.device,
+            video_target_frames=args.video_frames
+        )
+        results = miqa.predict(args.input, show_progress=not args.no_progress)
+        miqa.print_summary(results)
+        if args.save_results:
+            miqa.save_results(results, args.output_file)
+        if args.visualize:
+            miqa.visualize_results(results, args.viz_dir, video_granularity=args.viz_granularity)
+    except Exception as e:
+        print(f"\n❌ Error: {str(e)}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()