xiaoqi-wang
/

miqa

miqa

Model card Files Files and versions

xet

Community

xiaoqi-wang commited on 26 days ago

Commit

4e62d30

verified ·

1 Parent(s): e29b006

Upload train.py with huggingface_hub

Browse files

Files changed (1) hide show

train.py +613 -0

train.py ADDED Viewed

	@@ -0,0 +1,613 @@

+import numpy as np
+import os
+import random
+import shutil
+import time
+import warnings
+from enum import Enum
+import pandas as pd
+import torch
+import logging
+import torch.nn.parallel
+import torch.optim
+import torch.utils.data
+import torch.utils.data.distributed
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torchmetrics
+from torch.utils.tensorboard import SummaryWriter
+from configs.args_base import get_args
+from utils.losses import build_loss_function
+from utils.optimizer import build_optimizer
+from utils.lr_scheduler import build_scheduler
+from utils.logger import build_logger
+from utils.misc import setup_seed, reduce_tensor, save_checkpoint
+from data import build_dataloader
+from models.MIQA_base import get_torch_model, get_timm_model
+from models.RA_MIQA import RegionVisionTransformer
+best_srcc = best_plcc = best_klcc = 0.
+def main(args):
+    if args.seed is not None:
+        setup_seed(args.seed)
+        warnings.warn('You have chosen to seed training. '
+                      'This will turn on the CUDNN deterministic setting, '
+                      'which can slow down your training considerably! '
+                      'You may see unexpected behavior when restarting '
+                      'from checkpoints.')
+    if args.gpu is not None:
+        warnings.warn('You have chosen a specific GPU. This will completely '
+                      'disable data parallelism.')
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
+    if torch.cuda.is_available():
+        ngpus_per_node = torch.cuda.device_count()
+        if ngpus_per_node == 1 and args.dist_backend == "nccl":
+            warnings.warn(
+                "nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'")
+    else:
+        ngpus_per_node = 1
+    if args.multiprocessing_distributed:
+        # Since we have ngpus_per_node processes per node, the total world_size
+        # needs to be adjusted accordingly
+        args.world_size = ngpus_per_node * args.world_size
+        # Use torch.multiprocessing.spawn to launch distributed processes: the
+        # main_worker process function
+        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+    else:
+        # Simply call main_worker function
+        main_worker(args.gpu, ngpus_per_node, args)
+def main_worker(gpu, ngpus_per_node, args):
+    global best_srcc, best_plcc, best_klcc
+    args.gpu = gpu
+    args.ngpus_per_node = ngpus_per_node
+    if args.distributed:
+        if args.dist_url == "env://" and args.rank == -1:
+            args.rank = int(os.environ["RANK"])
+        if args.multiprocessing_distributed:
+            # For multiprocessing distributed training, rank needs to be the
+            # global rank among all the processes
+            args.rank = args.rank * ngpus_per_node + gpu
+        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
+                                world_size=args.world_size, rank=args.rank)
+    logger = build_logger(
+        output_dir=args.output_dir,
+        log_file='{}_train.log'.format(args.run_name),
+        rank=args.rank if args.distributed else None,
+        level=logging.INFO,
+        console_level=logging.INFO if args.rank in [0, None] else logging.WARNING,
+        file_level=logging.INFO
+    )
+    if args.gpu is not None:
+        logger.info("Use GPU: {} for training".format(args.gpu))
+    # create model
+    if args.arch.startswith('RA_'):
+        model = RegionVisionTransformer(
+            base_model_name = 'vit_small_patch16_224',
+            pretrained = True,
+            mmseg_config_path = 'models/model_configs/fcn_sere-small_finetuned_fp16_8x32_224x224_3600_imagenets919.py',
+            checkpoint_path = 'models/checkpoints/sere_finetuned_vit_small_ep100.pth',
+            auto_download = True,
+            force_download = False
+            )
+    else:
+        try:
+            logger.info(f"Loading model form torchvision {args.arch}")
+            model = get_torch_model(model_name=args.arch, pretrained=args.pretrained, num_classes=1)
+        except:
+            logger.info(f"Loading model form timm {args.arch}")
+            model = get_timm_model(model_name=args.arch, pretrained=args.pretrained, num_classes=1)
+    if not torch.cuda.is_available() and not torch.backends.mps.is_available():
+        logger.info('using CPU, this will be slow')
+    elif args.distributed:
+        # For multiprocessing distributed, DistributedDataParallel constructor
+        # should always set the single device scope, otherwise,
+        # DistributedDataParallel will use all available devices.
+        if torch.cuda.is_available():
+            if args.gpu is not None:
+                torch.cuda.set_device(args.gpu)
+                model.cuda(args.gpu)
+                # When using a single GPU per process and per
+                # DistributedDataParallel, we need to divide the batch size
+                # ourselves based on the total number of GPUs of the current node.
+                args.batch_size = int(args.batch_size / ngpus_per_node)
+                args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
+                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
+            else:
+                model.cuda()
+                # DistributedDataParallel will divide and allocate batch_size to all
+                # available GPUs if device_ids are not set
+                model = torch.nn.parallel.DistributedDataParallel(model)
+    elif args.gpu is not None and torch.cuda.is_available():
+        torch.cuda.set_device(args.gpu)
+        model = model.cuda(args.gpu)
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+        model = model.to(device)
+    else:
+        # DataParallel will divide and allocate batch_size to all available GPUs
+        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
+            model.features = torch.nn.DataParallel(model.features)
+            model.cuda()
+        else:
+            model = torch.nn.DataParallel(model).cuda()
+    if torch.cuda.is_available():
+        if args.gpu:
+            device = torch.device('cuda:{}'.format(args.gpu))
+        else:
+            device = torch.device("cuda")
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")
+    else:
+        device = torch.device("cpu")
+    # Data loading
+    train_dataset, val_dataset = build_dataloader.build_dataset(args)
+    if args.distributed:
+        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
+        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False)
+    else:
+        train_sampler = None
+        val_sampler = None
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
+        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
+    val_loader = torch.utils.data.DataLoader(
+        val_dataset, batch_size=args.batch_size, shuffle=False,
+        num_workers=args.workers, pin_memory=True, sampler=val_sampler)
+    # define loss function (criterion), optimizer, and learning rate scheduler
+    criterion = build_loss_function(loss_name=args.loss_name)
+    optimizer = build_optimizer(args, model)
+    scheduler = build_scheduler(args, optimizer, len(train_loader))
+    # optionally resume from a checkpoint
+    if args.resume:
+        if os.path.isfile(args.resume):
+            logger.info("=> loading checkpoint '{}'".format(args.resume))
+            if args.gpu is None:
+                checkpoint = torch.load(args.resume)
+            elif torch.cuda.is_available():
+                # Map model to be loaded to specified single gpu.
+                loc = 'cuda:{}'.format(args.gpu)
+                checkpoint = torch.load(args.resume, map_location=loc)
+            args.start_epoch = checkpoint['epoch']
+            best_srcc = checkpoint['best_srcc']
+            if args.gpu is not None:
+                # best_acc1 may be from a checkpoint from a different GPU
+                best_srcc = best_srcc.to(args.gpu)
+            model.load_state_dict(checkpoint['state_dict'])
+            optimizer.load_state_dict(checkpoint['optimizer'])
+            scheduler.load_state_dict(checkpoint['scheduler'])
+            logger.info("=> loaded checkpoint '{}' (epoch {})"
+                  .format(args.resume, checkpoint['epoch']))
+        else:
+            logger.info("=> no checkpoint found at '{}'".format(args.resume))
+    # evaluate on validation set
+    if args.eval_only:
+        validate(val_loader, model, criterion, args)
+        return
+    writer = SummaryWriter(log_dir=os.path.join(args.output_dir, 'tensorboard_logs', args.run_name))
+    for epoch in range(args.start_epoch, args.epochs+1):
+        if args.distributed:
+            train_sampler.set_epoch(epoch)
+        # train for one epoch
+        best_srcc, best_plcc, best_klcc = train_one_epoch(train_loader, model, criterion, optimizer, scheduler, epoch, device, args, val_loader, writer, logger)
+    writer.close()
+    logger.info('################# Training Finished ##################')
+    logger.info(f"Best SRCC: {best_srcc}, Best PLCC: {best_plcc}, Best KLCC: {best_klcc}")
+    logger.info('######################################################')
+def train_one_epoch(train_loader, model, criterion, optimizer, scheduler, epoch, device, args, val_loader, writer, logger):
+    global best_srcc, best_plcc, best_klcc
+    model.train()
+    batch_time = AverageMeter('Time', ':6.3f')
+    data_time = AverageMeter('Data', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    srcc = AverageMeter('SRCC', ':6.4f')
+    plcc = AverageMeter('PLCC', ':6.4f')
+    klcc = AverageMeter('KLCC', ':6.4f')
+    # mse = AverageMeter('MSE', ':6.4f')
+    progress = ProgressMeter(
+        len(train_loader),
+        [batch_time, data_time, losses, srcc, plcc, klcc],
+        prefix="Epoch: [{}]".format(epoch))
+    validate_freq = len(train_loader) // args.validate_num
+    end = time.time()
+    global_step = epoch * len(train_loader)
+    for i, batch in enumerate(train_loader):
+        data_time.update(time.time() - end)
+        # images = batch['image'].cuda(args.gpu, non_blocking=True)
+        # target = batch['label'].cuda(args.gpu, non_blocking=True).view(-1)
+        image_cropped = batch['image_cropped'].cuda(args.gpu, non_blocking=True)
+        target = batch['label'].cuda(args.gpu, non_blocking=True).view(-1)
+        if 'image_resized' in batch:
+            image_resized = batch['image_resized'].cuda(args.gpu, non_blocking=True)
+            output = model(image_cropped, image_resized).view(-1)
+        else:
+            output = model(image_cropped).view(-1)
+        target_len = target.size(0)
+        train_loss = criterion(output, target)
+        # Calculate metrics during training sessions
+        srcc_train = torchmetrics.functional.spearman_corrcoef(output, target).item()
+        plcc_train = torchmetrics.functional.pearson_corrcoef(output, target).item()
+        klcc_train = torchmetrics.functional.kendall_rank_corrcoef(output, target).item()
+        # Update Metrics
+        losses.update(train_loss.item(), target_len)
+        srcc.update(srcc_train, target_len)
+        plcc.update(plcc_train, target_len)
+        klcc.update(klcc_train, target_len)
+        # Add training loss to the writer
+        writer.add_scalars('Loss', {
+            'train': train_loss.item()
+        }, global_step + i)
+        optimizer.zero_grad()
+        train_loss.backward()
+        optimizer.step()
+        # if scheduler is not None:
+        scheduler.step_update(global_step + i)
+        # Record the current learning rate
+        if args.rank == 0:
+            current_lr = optimizer.param_groups[0]['lr']
+            writer.add_scalar('Learning_Rate', current_lr, global_step + i)
+        batch_time.update(time.time() - end)
+        end = time.time()
+        if i % args.print_freq == 0:
+            progress.display(i + 1)
+        # Perform multiple verifications within an epoch
+        if (i + 1) % validate_freq == 0:
+            model.eval()
+            results = validate(val_loader=val_loader, model=model, criterion=criterion, args=args, logger=logger)
+            val_srcc = results['metrics']['srcc']
+            val_plcc = results['metrics']['plcc']
+            val_klcc = results['metrics']['klcc']
+            val_loss = results['metrics']['loss']
+            logger.info(f'Validation results: SRCC: {val_srcc:.4f}, PLCC: {val_plcc:.4f}, KLCC: {val_klcc:.4f}, Loss: {val_loss:.4f}')
+            if args.rank == 0:
+                # Add the validation loss to the same loss chart
+                writer.add_scalars('Loss', {
+                    'val': val_loss
+                }, global_step + i)
+                # Add all performance metrics to the same Metrics chart.
+                writer.add_scalars('Metrics', {
+                    'SRCC': val_srcc,
+                    'PLCC': val_plcc,
+                    'KLCC': val_klcc
+                }, global_step + i)
+            is_best = val_srcc > best_srcc
+            best_srcc = max(val_srcc, best_srcc)
+            best_plcc = max(val_plcc, best_plcc)
+            best_klcc = max(val_klcc, best_klcc)
+            # Save the best model and results
+            if not args.multiprocessing_distributed or (args.multiprocessing_distributed
+                                                        and args.rank % args.ngpus_per_node == 0):
+                save_checkpoint(args, {
+                    'epoch': epoch + 1,
+                    'arch': args.arch,
+                    'state_dict': model.state_dict(),
+                    'best_srcc': best_srcc,
+                    'optimizer': optimizer.state_dict(),
+                    'scheduler': scheduler.state_dict()
+                }, is_best)
+                if is_best:
+                    logger.info(
+                        f'**BEST** Validation results: SRCC: {best_srcc:.4f}, PLCC: {best_plcc:.4f}, KLCC: {best_klcc:.4f}')
+                    df = pd.DataFrame({
+                        'image_name': results['image_names'],
+                        'prediction': results['predictions'],
+                        'ground_truth': results['ground_truth']
+                    })
+                    csv_filename = os.path.join(args.output_dir,
+                                                f'{args.run_name}_best_val_results.csv')
+                    df.to_csv(csv_filename, index=False)
+            model.train()
+    logger.info(
+        f'**BEST** Validation results: SRCC: {best_srcc:.4f}, PLCC: {best_plcc:.4f}, KLCC: {best_klcc:.4f}')
+    return best_srcc, best_plcc, best_klcc
+@torch.no_grad()
+def validate(val_loader, model, args, criterion, logger):
+    model.eval()
+    val_dataset_len = len(val_loader.dataset)
+    val_loader_len = len(val_loader)
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    with torch.no_grad():
+        temp_pred_scores = []
+        temp_gt_scores = []
+        temp_img_names = []
+        time_tmp = time.time()
+        for i, batch in enumerate(val_loader):
+            if args.gpu is not None and torch.cuda.is_available():
+                device = torch.device(f'cuda:{args.gpu}')
+            elif torch.backends.mps.is_available():
+                device = torch.device('mps')
+            else:
+                device = torch.device('cpu')
+            images = batch['image_cropped'].to(device, non_blocking=True if device.type == 'cuda' else False)
+            target = batch['label'].to(device, non_blocking=True if device.type == 'cuda' else False)
+            if 'image_resized' in batch:
+                image_resized = batch['image_resized'].to(device, non_blocking=True if device.type == 'cuda' else False)
+                output = model(images, image_resized).view(-1)
+            else:
+                output = model(images).view(-1)
+            # if args.gpu is not None and torch.cuda.is_available():
+            #     images = batch['image'].cuda(args.gpu, non_blocking=True)
+            #     target = batch['label'].cuda(args.gpu, non_blocking=True)
+            # if torch.backends.mps.is_available():
+            #     images = images.to('mps')
+            #     target = target.to('mps')
+            # output = model(images).view(-1)
+            loss = criterion(output, target.view(-1))
+            loss = reduce_tensor(loss)
+            losses.update(loss.item(), target.size(0))
+            batch_time.update(time.time() - time_tmp)
+            time_tmp = time.time()
+            # Save predicted values, gt values, and image names
+            temp_pred_scores.append(output.view(-1))
+            temp_gt_scores.append(target.view(-1))
+            temp_img_names.extend(batch['image_name'])
+            if i % args.print_freq == 0:
+                logger.info(
+                    f"Test: [{i}/{val_loader_len}]\t"
+                    f"Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
+                    f"Loss {losses.val:.4f} ({losses.avg:.4f})\t"
+                )
+    # Combine the results of all batches
+    pred_scores = torch.cat(temp_pred_scores)
+    gt_scores = torch.cat(temp_gt_scores)
+    # Distributed processing
+    if torch.distributed.is_initialized():
+        # Collect the results of all processes
+        img_names_gather = [None for _ in range(dist.get_world_size())]
+        torch.distributed.all_gather_object(img_names_gather, temp_img_names)
+        all_img_names = []
+        for names in img_names_gather:
+            all_img_names.extend(names)
+        all_img_names = all_img_names[:val_dataset_len]  # 截取到实际数据集大小
+        preds_gather_list = [
+            torch.zeros_like(pred_scores) for _ in range(dist.get_world_size())
+        ]
+        torch.distributed.all_gather(preds_gather_list, pred_scores)
+        gather_preds = torch.cat(preds_gather_list, dim=0)[:val_dataset_len]
+        grotruth_gather_list = [
+            torch.zeros_like(gt_scores) for _ in range(dist.get_world_size())
+        ]
+        torch.distributed.all_gather(grotruth_gather_list, gt_scores)
+        gather_grotruth = torch.cat(grotruth_gather_list, dim=0)[:val_dataset_len]
+        if args.patch_num > 1:
+            gather_preds_matrix = gather_preds.view(-1, args.patch_num)
+            gather_preds = (gather_preds_matrix.mean(dim=-1)).squeeze()
+            gather_grotruth = (
+                (gather_grotruth.view(-1, args.patch_num)).mean(dim=-1)
+            ).squeeze()
+        final_preds = gather_preds.float().detach()
+        final_grotruth = gather_grotruth.float().detach()
+    else:
+        final_preds = pred_scores.float().detach()
+        final_grotruth = gt_scores.float().detach()
+        all_img_names = temp_img_names
+    # Calculate the correlation coefficient
+    try:
+        logger.info(f"len of dataset: {val_dataset_len}, final_preds shape: {final_preds.shape}, final_grotruth shape: {final_grotruth.shape}")
+        # Check for the presence of NaN or inf
+        if torch.isnan(final_preds).any() or torch.isinf(final_preds).any() or \
+                torch.isnan(final_grotruth).any() or torch.isinf(final_grotruth).any():
+            raise ValueError("Found NaN or inf values in predictions or ground truth")
+        test_srcc = torchmetrics.functional.spearman_corrcoef(final_preds, final_grotruth).item()
+        test_plcc = torchmetrics.functional.pearson_corrcoef(final_preds, final_grotruth).item()
+        test_klcc = torchmetrics.functional.kendall_rank_corrcoef(final_preds, final_grotruth).item()
+    except Exception as e:
+        logger.warning(f"Error in calculating correlations: {str(e)}. Resetting cc relation to zero...")
+        test_plcc = 0.0
+        test_srcc = 0.0
+        test_klcc = 0.0
+    # Create a result dictionary containing the correspondence between image names, predicted values, and actual values.
+    results = {
+        'image_names': all_img_names,
+        'predictions': final_preds.cpu().numpy().tolist(),
+        'ground_truth': final_grotruth.cpu().numpy().tolist(),
+        'metrics': {
+            'srcc': test_srcc,
+            'plcc': test_plcc,
+            'klcc': test_klcc,
+            'loss': losses.avg
+        }
+    }
+    return results
+class Summary(Enum):
+    NONE = 0
+    AVERAGE = 1
+    SUM = 2
+    COUNT = 3
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
+        self.name = name
+        self.fmt = fmt
+        self.summary_type = summary_type
+        self.reset()
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+    def all_reduce(self):
+        if torch.cuda.is_available():
+            device = torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            device = torch.device("mps")
+        else:
+            device = torch.device("cpu")
+        total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
+        dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
+        self.sum, self.count = total.tolist()
+        self.avg = self.sum / self.count
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+    def summary(self):
+        fmtstr = ''
+        if self.summary_type is Summary.NONE:
+            fmtstr = ''
+        elif self.summary_type is Summary.AVERAGE:
+            fmtstr = '{name} {avg:.3f}'
+        elif self.summary_type is Summary.SUM:
+            fmtstr = '{name} {sum:.3f}'
+        elif self.summary_type is Summary.COUNT:
+            fmtstr = '{name} {count:.3f}'
+        else:
+            raise ValueError('invalid summary type %r' % self.summary_type)
+        return fmtstr.format(**self.__dict__)
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+    def display_summary(self):
+        entries = [" *"]
+        entries += [meter.summary() for meter in self.meters]
+        print(' '.join(entries))
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+        res = []
+        for k in topk:
+            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+if __name__ == '__main__':
+    args = get_args().parse_args()
+    args.run_name = args.arch + '_' + args.dataset + '_' + args.metric_type
+    os.makedirs(os.path.join(args.output_dir), exist_ok=True)
+    os.makedirs(os.path.join(args.output_dir, 'tensorboard_logs', args.run_name), exist_ok=True)
+    # save config file
+    with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as f:
+        f.write(args.__dict__.__str__())
+    main(args)