| """ |
| Zero-Shot Fashion Segmentation Experiment |
| |
| This experiment demonstrates zero-shot learning for fashion segmentation |
| using SAM 2 with advanced text prompting and attention mechanisms. |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import numpy as np |
| import matplotlib.pyplot as plt |
| from PIL import Image |
| import os |
| import json |
| from typing import List, Dict, Tuple |
| import argparse |
| from tqdm import tqdm |
|
|
| |
| import sys |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
| from models.sam2_zeroshot import SAM2ZeroShot, ZeroShotEvaluator |
| from utils.data_loader import FashionDataLoader |
| from utils.metrics import SegmentationMetrics |
| from utils.visualization import visualize_segmentation |
|
|
|
|
| class FashionZeroShotExperiment: |
| """Zero-shot learning experiment for fashion segmentation.""" |
| |
| def __init__( |
| self, |
| sam2_checkpoint: str, |
| data_dir: str, |
| output_dir: str, |
| device: str = "cuda", |
| use_attention_maps: bool = True, |
| temperature: float = 0.1 |
| ): |
| self.device = device |
| self.output_dir = output_dir |
| |
| |
| os.makedirs(output_dir, exist_ok=True) |
| |
| |
| self.model = SAM2ZeroShot( |
| sam2_checkpoint=sam2_checkpoint, |
| device=device, |
| use_attention_maps=use_attention_maps, |
| temperature=temperature |
| ) |
| |
| |
| self.evaluator = ZeroShotEvaluator() |
| |
| |
| self.data_loader = FashionDataLoader(data_dir) |
| |
| |
| self.metrics = SegmentationMetrics() |
| |
| |
| self.classes = ["shirt", "pants", "dress", "shoes"] |
| |
| |
| self.prompt_strategies = [ |
| "basic", |
| "descriptive", |
| "contextual", |
| "detailed" |
| ] |
| |
| def run_single_experiment( |
| self, |
| image: torch.Tensor, |
| ground_truth: Dict[str, torch.Tensor], |
| strategy: str = "descriptive" |
| ) -> Dict: |
| """Run a single zero-shot experiment.""" |
| |
| predictions = self.model.segment(image, "fashion", self.classes) |
| |
| |
| evaluation = self.evaluator.evaluate(predictions, ground_truth) |
| |
| return { |
| 'predictions': predictions, |
| 'evaluation': evaluation, |
| 'strategy': strategy |
| } |
| |
| def run_comparative_experiment( |
| self, |
| num_images: int = 50 |
| ) -> Dict: |
| """Run comparative experiment with different prompt strategies.""" |
| results = { |
| 'strategies': {strategy: [] for strategy in self.prompt_strategies}, |
| 'overall_comparison': {}, |
| 'class_analysis': {cls: {strategy: [] for strategy in self.prompt_strategies} |
| for cls in self.classes} |
| } |
| |
| print(f"Running comparative zero-shot experiment on {num_images} images...") |
| |
| for i in tqdm(range(num_images)): |
| |
| image, ground_truth = self.data_loader.get_test_sample() |
| |
| |
| for strategy in self.prompt_strategies: |
| |
| if strategy == "basic": |
| |
| self.model.advanced_prompts["fashion"] = { |
| "shirt": ["shirt"], |
| "pants": ["pants"], |
| "dress": ["dress"], |
| "shoes": ["shoes"] |
| } |
| elif strategy == "descriptive": |
| |
| self.model.advanced_prompts["fashion"] = { |
| "shirt": ["fashion photography of shirts", "clothing item top"], |
| "pants": ["fashion photography of pants", "lower body clothing"], |
| "dress": ["fashion photography of dresses", "full body garment"], |
| "shoes": ["fashion photography of shoes", "footwear item"] |
| } |
| elif strategy == "contextual": |
| |
| self.model.advanced_prompts["fashion"] = { |
| "shirt": ["in a fashion setting, shirt", "worn by a person, shirt"], |
| "pants": ["in a fashion setting, pants", "worn by a person, pants"], |
| "dress": ["in a fashion setting, dress", "worn by a person, dress"], |
| "shoes": ["in a fashion setting, shoes", "worn by a person, shoes"] |
| } |
| elif strategy == "detailed": |
| |
| self.model.advanced_prompts["fashion"] = { |
| "shirt": ["high quality fashion photograph of a shirt with clear details", |
| "professional clothing photography showing shirt"], |
| "pants": ["high quality fashion photograph of pants with clear details", |
| "professional clothing photography showing pants"], |
| "dress": ["high quality fashion photograph of a dress with clear details", |
| "professional clothing photography showing dress"], |
| "shoes": ["high quality fashion photograph of shoes with clear details", |
| "professional clothing photography showing shoes"] |
| } |
| |
| |
| experiment_result = self.run_single_experiment(image, ground_truth, strategy) |
| |
| |
| results['strategies'][strategy].append(experiment_result['evaluation']) |
| |
| |
| for class_name in self.classes: |
| iou_key = f"{class_name}_iou" |
| dice_key = f"{class_name}_dice" |
| |
| if iou_key in experiment_result['evaluation']: |
| results['class_analysis'][class_name][strategy].append({ |
| 'iou': experiment_result['evaluation'][iou_key], |
| 'dice': experiment_result['evaluation'][dice_key] |
| }) |
| |
| |
| if i % 10 == 0: |
| self.visualize_comparison( |
| i, image, ground_truth, |
| {s: results['strategies'][s][-1] for s in self.prompt_strategies}, |
| strategy |
| ) |
| |
| |
| for strategy in self.prompt_strategies: |
| strategy_results = results['strategies'][strategy] |
| if strategy_results: |
| results['overall_comparison'][strategy] = { |
| 'mean_iou': np.mean([r.get('mean_iou', 0) for r in strategy_results]), |
| 'mean_dice': np.mean([r.get('mean_dice', 0) for r in strategy_results]), |
| 'std_iou': np.std([r.get('mean_iou', 0) for r in strategy_results]), |
| 'std_dice': np.std([r.get('mean_dice', 0) for r in strategy_results]) |
| } |
| |
| return results |
| |
| def run_attention_analysis(self, num_images: int = 20) -> Dict: |
| """Run analysis of attention-based prompt generation.""" |
| results = { |
| 'with_attention': [], |
| 'without_attention': [], |
| 'attention_points': [] |
| } |
| |
| print(f"Running attention analysis on {num_images} images...") |
| |
| for i in tqdm(range(num_images)): |
| |
| image, ground_truth = self.data_loader.get_test_sample() |
| |
| |
| self.model.use_attention_maps = True |
| with_attention = self.run_single_experiment(image, ground_truth, "attention") |
| |
| |
| self.model.use_attention_maps = False |
| without_attention = self.run_single_experiment(image, ground_truth, "no_attention") |
| |
| |
| results['with_attention'].append(with_attention['evaluation']) |
| results['without_attention'].append(without_attention['evaluation']) |
| |
| |
| if with_attention['predictions']: |
| |
| attention_points = self.extract_attention_points(image, self.classes) |
| results['attention_points'].append(attention_points) |
| |
| return results |
| |
| def extract_attention_points(self, image: torch.Tensor, classes: List[str]) -> List[Tuple[int, int]]: |
| """Extract attention points for visualization.""" |
| |
| h, w = image.shape[-2:] |
| points = [] |
| |
| for class_name in classes: |
| |
| center_x, center_y = w // 2, h // 2 |
| points.append((center_x, center_y)) |
| |
| |
| points.append((center_x + w // 4, center_y)) |
| points.append((center_x, center_y + h // 4)) |
| |
| return points |
| |
| def visualize_comparison( |
| self, |
| image_idx: int, |
| image: torch.Tensor, |
| ground_truth: Dict[str, torch.Tensor], |
| strategy_results: Dict, |
| best_strategy: str |
| ): |
| """Visualize comparison between different strategies.""" |
| fig, axes = plt.subplots(3, 4, figsize=(20, 15)) |
| |
| |
| axes[0, 0].imshow(image.permute(1, 2, 0).cpu().numpy()) |
| axes[0, 0].set_title("Original Image") |
| axes[0, 0].axis('off') |
| |
| |
| for i, class_name in enumerate(self.classes): |
| if class_name in ground_truth: |
| axes[0, i+1].imshow(ground_truth[class_name].cpu().numpy(), cmap='gray') |
| axes[0, i+1].set_title(f"GT: {class_name}") |
| axes[0, i+1].axis('off') |
| |
| |
| best_result = strategy_results[best_strategy] |
| for i, class_name in enumerate(self.classes): |
| if class_name in best_result: |
| axes[1, i].imshow(best_result[class_name].cpu().numpy(), cmap='gray') |
| axes[1, i].set_title(f"Best: {class_name}") |
| axes[1, i].axis('off') |
| |
| |
| strategies = list(strategy_results.keys()) |
| metrics = ['mean_iou', 'mean_dice'] |
| |
| for i, metric in enumerate(metrics): |
| values = [strategy_results[s].get(metric, 0) for s in strategies] |
| axes[2, i].bar(strategies, values) |
| axes[2, i].set_title(f"{metric.replace('_', ' ').title()}") |
| axes[2, i].tick_params(axis='x', rotation=45) |
| |
| |
| summary_text = f"Best Strategy: {best_strategy}\n" |
| for strategy, result in strategy_results.items(): |
| summary_text += f"{strategy}: IoU={result.get('mean_iou', 0):.3f}, Dice={result.get('mean_dice', 0):.3f}\n" |
| |
| axes[2, 2].text(0.1, 0.5, summary_text, transform=axes[2, 2].transAxes, |
| verticalalignment='center', fontsize=10) |
| axes[2, 2].axis('off') |
| axes[2, 3].axis('off') |
| |
| plt.tight_layout() |
| plt.savefig(os.path.join(self.output_dir, f"comparison_{image_idx}.png")) |
| plt.close() |
| |
| def save_results(self, results: Dict, experiment_type: str = "comparative"): |
| """Save experiment results.""" |
| |
| with open(os.path.join(self.output_dir, f'{experiment_type}_results.json'), 'w') as f: |
| json.dump(results, f, indent=2) |
| |
| |
| if experiment_type == "comparative": |
| summary = { |
| 'experiment_type': experiment_type, |
| 'num_images': len(results['strategies'][list(results['strategies'].keys())[0]]), |
| 'overall_comparison': results['overall_comparison'], |
| 'best_strategy': max(results['overall_comparison'].items(), |
| key=lambda x: x[1]['mean_iou'])[0] |
| } |
| else: |
| summary = { |
| 'experiment_type': experiment_type, |
| 'attention_analysis': { |
| 'with_attention_mean_iou': np.mean([r.get('mean_iou', 0) for r in results['with_attention']]), |
| 'without_attention_mean_iou': np.mean([r.get('mean_iou', 0) for r in results['without_attention']]), |
| 'attention_improvement': np.mean([r.get('mean_iou', 0) for r in results['with_attention']]) - |
| np.mean([r.get('mean_iou', 0) for r in results['without_attention']]) |
| } |
| } |
| |
| with open(os.path.join(self.output_dir, f'{experiment_type}_summary.json'), 'w') as f: |
| json.dump(summary, f, indent=2) |
| |
| print(f"Results saved to {self.output_dir}") |
| if experiment_type == "comparative": |
| print(f"Best strategy: {summary['best_strategy']}") |
| print(f"Best mean IoU: {summary['overall_comparison'][summary['best_strategy']]['mean_iou']:.3f}") |
|
|
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="Zero-shot fashion segmentation experiment") |
| parser.add_argument("--sam2_checkpoint", type=str, required=True, help="Path to SAM 2 checkpoint") |
| parser.add_argument("--data_dir", type=str, required=True, help="Path to fashion dataset") |
| parser.add_argument("--output_dir", type=str, default="results/zero_shot_fashion", help="Output directory") |
| parser.add_argument("--num_images", type=int, default=50, help="Number of test images") |
| parser.add_argument("--device", type=str, default="cuda", help="Device to use") |
| parser.add_argument("--experiment_type", type=str, default="comparative", |
| choices=["comparative", "attention"], help="Type of experiment") |
| parser.add_argument("--temperature", type=float, default=0.1, help="CLIP temperature") |
| |
| args = parser.parse_args() |
| |
| |
| experiment = FashionZeroShotExperiment( |
| sam2_checkpoint=args.sam2_checkpoint, |
| data_dir=args.data_dir, |
| output_dir=args.output_dir, |
| device=args.device, |
| temperature=args.temperature |
| ) |
| |
| if args.experiment_type == "comparative": |
| results = experiment.run_comparative_experiment(num_images=args.num_images) |
| else: |
| results = experiment.run_attention_analysis(num_images=args.num_images) |
| |
| experiment.save_results(results, args.experiment_type) |
|
|
|
|
| if __name__ == "__main__": |
| main() |