| | import os |
| | import cv2 |
| | import numpy as np |
| | import torch |
| | import matplotlib as mpl |
| |
|
| | from .video_utils import ( |
| | read_video_frames, |
| | resize_frames_to_long_side, |
| | save_to_video, |
| | add_overlay_text |
| | ) |
| | from typing import Optional, List, Tuple |
| | from libs.models.mano_wrapper import MANO |
| | from .render_utils import Renderer |
| |
|
class Config:
    """
    Bundle of I/O locations, rendering parameters and colour settings.

    Every path falls back to a built-in default; an ``args`` object (for
    example an ``argparse.Namespace``) overrides a path when it carries the
    matching attribute.
    """

    def __init__(self, args=None):

        def _from_args(attr_name, fallback):
            # getattr on None (no args supplied) simply yields the fallback.
            return getattr(args, attr_name, fallback)

        # Input / output locations.
        self.VIDEO_ROOT = _from_args('video_root', 'data/examples/videos')
        self.LABEL_ROOT = _from_args('label_root', 'data/examples/annotations')
        self.SAVE_PATH = _from_args('save_path', 'data/examples/visualize')
        self.MANO_MODEL_PATH = _from_args('mano_model_path', './weights/mano')

        # Rendering parameters: target long-side resolution and output frame rate.
        self.RENDER_SIZE_LONG_SIDE = 480
        self.FPS = 15

        # Colormap names used when each trajectory step gets its own colour.
        self.LEFT_CMAP = "inferno"
        self.RIGHT_CMAP = "inferno"

        # Fixed per-hand colours (three components in the 0-1 range).
        self.LEFT_COLOR = np.array([0.6594, 0.6259, 0.7451])
        self.RIGHT_COLOR = np.array([0.4078, 0.4980, 0.7451])
| |
|
| |
|
class HandVisualizer:
    """
    Loads one annotated hand episode, renders the MANO hand meshes over the
    video frames, and writes a side-by-side comparison video.

    One panel is produced per rendering mode, in ``self.all_modes`` order:
      * 'cam'   - every video frame with the hand mesh of that same frame.
      * 'full'  - every video frame with the remaining trajectory (current
                  step to the last one) drawn semi-transparently; only
                  enabled when ``render_gradual_traj`` is True.
      * 'first' - the first video frame (and its camera pose) as a fixed
                  background, with the mesh of each trajectory step drawn
                  on it in turn.

    The MANO model and all mesh tensors are moved to CUDA, so a GPU is required.
    """

    _RENDER_MODES = ('cam', 'first', 'full')

    def __init__(self, config: "Config", render_gradual_traj: bool = False):
        """
        Args:
            config: Paths and visual settings (see ``Config``).
            render_gradual_traj: When True, additionally render the 'full'
                (gradual trajectory) panel.
        """
        self.config = config
        self.render_gradual_traj = render_gradual_traj
        if self.render_gradual_traj:
            self.all_modes = ['cam', 'full', 'first']
        else:
            self.all_modes = ['cam', 'first']

        self.mano = MANO(model_path=self.config.MANO_MODEL_PATH).cuda()
        faces_right = torch.from_numpy(self.mano.faces).float().cuda()
        # Swapping the last two vertex indices of each triangle reverses its
        # winding; presumably required because left-hand vertices are obtained
        # by mirroring the right-hand model.
        self.faces_left = faces_right[:, [0, 2, 1]]
        self.faces_right = faces_right

    @staticmethod
    def _world_to_cam(verts_worldspace, R_w2c_cur, t_w2c_cur):
        """Apply one world-to-camera transform to a stack of (N, V, 3) vertex arrays."""
        return (
            R_w2c_cur @ verts_worldspace.transpose(0, 2, 1) + t_w2c_cur
        ).transpose(0, 2, 1)

    @staticmethod
    def _mesh_tensors(verts_camspace, color):
        """Build the CUDA vertex tensor and a matching per-vertex colour tensor for one hand."""
        verts = torch.from_numpy(verts_camspace).float().cuda()
        # One colour row per vertex; the count is taken from the mesh itself
        # rather than hard-coding MANO's vertex count.
        vertex_colors = (
            torch.from_numpy(color).float().unsqueeze(0).repeat(verts.shape[0], 1).cuda()
        )
        return verts, vertex_colors

    @staticmethod
    def _to_output_frame(img_overlay):
        """Convert a 0-1 float image to uint8 and swap its channel order (BGR2RGB)."""
        frame = (img_overlay * 255).astype(np.uint8)
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    def _render_hand_trajectory(self, video_frames, hand_traj_wordspace, hand_mask, extrinsics, renderer: "Renderer", mode: str):
        """
        Renders hand mesh for one frame or hand trajectory across multiple frames,
        depending on the mode ('cam', 'first', 'full').

        Args:
            video_frames: Sequence of image arrays, one per frame.
            hand_traj_wordspace: ``(verts_left, verts_right)`` world-space
                vertex arrays, indexed by frame on the first axis.
            hand_mask: ``(left_mask, right_mask)``; a hand is drawn at step
                ``k`` only when its mask entry is non-zero.
            extrinsics: ``(R_w2c, t_w2c)`` per-frame world-to-camera rotation
                and translation.
            renderer: Object whose ``render(verts, faces, colors)`` returns
                ``(image, mask)``.
            mode: One of 'cam', 'first', 'full'.

        Returns:
            List of uint8 frames, one per input frame. Empty when
            ``video_frames`` is empty.

        Raises:
            ValueError: If ``mode`` is not a known rendering mode.
        """
        if mode not in self._RENDER_MODES:
            raise ValueError(f'Unknown rendering mode: {mode}')

        verts_left_worldspace, verts_right_worldspace = hand_traj_wordspace
        left_hand_mask, right_hand_mask = hand_mask
        R_w2c, t_w2c = extrinsics

        num_total_frames = len(video_frames)
        if num_total_frames == 0:
            # Nothing to draw; 'first' mode would otherwise index frame 0.
            print(f'Finished rendering with mode: {mode}')
            return []

        if mode == 'full':
            # Each trajectory step gets its own colour from the colormap.
            left_colors, right_colors = generate_hand_colors(
                num_total_frames, self.config.LEFT_CMAP, self.config.RIGHT_CMAP
            )
        else:
            left_colors = np.tile(np.asarray(self.config.LEFT_COLOR), (num_total_frames, 1))
            right_colors = np.tile(np.asarray(self.config.RIGHT_COLOR), (num_total_frames, 1))

        # 'first' mode uses a single background frame and emits its output
        # frames from the inner trajectory loop instead.
        num_loop_frames = 1 if mode == 'first' else num_total_frames
        all_save_frames = []

        for current_frame_idx in range(num_loop_frames):
            if mode != 'first':
                print(f'Processing frame {current_frame_idx + 1}/{num_loop_frames}', end='\r')

            # Range of trajectory steps drawn for this background frame.
            if mode == 'cam':
                start_traj_idx, end_traj_idx = current_frame_idx, current_frame_idx + 1
            elif mode == 'first':
                start_traj_idx, end_traj_idx = 0, num_total_frames
            else:  # 'full'
                start_traj_idx, end_traj_idx = current_frame_idx, num_total_frames

            num_traj_steps = end_traj_idx - start_traj_idx
            if mode == 'full':
                transparency = np.linspace(0.4, 0.7, num_traj_steps)
            else:
                transparency = [1.0] * num_traj_steps

            # astype already returns a fresh array, so the source frame is untouched.
            background = video_frames[current_frame_idx].astype(np.float32) / 255.0

            # Only the steps that will actually be drawn are moved into the
            # camera space of the current frame.
            R_w2c_cur = R_w2c[current_frame_idx]
            t_w2c_cur = t_w2c[current_frame_idx]
            traj_slice = slice(start_traj_idx, end_traj_idx)
            verts_left_camspace = self._world_to_cam(
                verts_left_worldspace[traj_slice], R_w2c_cur, t_w2c_cur
            )
            verts_right_camspace = self._world_to_cam(
                verts_right_worldspace[traj_slice], R_w2c_cur, t_w2c_cur
            )

            curr_img_overlay = background
            for traj_idx, kk in enumerate(range(start_traj_idx, end_traj_idx)):
                if mode == 'first':
                    print(f'Processing frame {traj_idx + 1}/{num_total_frames}', end='\r')
                    # Every output frame starts again from the clean background;
                    # blending below builds a new array, so no copy is needed.
                    curr_img_overlay = background

                transp_k = transparency[traj_idx]

                # Left hand first, then right hand.
                verts_list, faces_list, colors_list = [], [], []
                if left_hand_mask[kk] != 0:
                    verts, vertex_colors = self._mesh_tensors(
                        verts_left_camspace[traj_idx], left_colors[kk]
                    )
                    verts_list.append(verts)
                    faces_list.append(self.faces_left)
                    colors_list.append(vertex_colors)
                if right_hand_mask[kk] != 0:
                    verts, vertex_colors = self._mesh_tensors(
                        verts_right_camspace[traj_idx], right_colors[kk]
                    )
                    verts_list.append(verts)
                    faces_list.append(self.faces_right)
                    colors_list.append(vertex_colors)

                if verts_list:
                    rend, mask = renderer.render(verts_list, faces_list, colors_list)
                    # Reverse the channel order of the render to match the
                    # overlay (presumably RGB -> BGR).
                    rend = rend[..., ::-1]

                    color_mesh = rend.astype(np.float32) / 255.0
                    valid_mask = mask[..., None].astype(np.float32)

                    # Alpha blend: outside the mesh keep the image; inside mix
                    # mesh and image with weight transp_k.
                    curr_img_overlay = (
                        curr_img_overlay[:, :, :3] * (1 - valid_mask) +
                        color_mesh[:, :, :3] * valid_mask * transp_k +
                        curr_img_overlay[:, :, :3] * valid_mask * (1 - transp_k)
                    )

                if mode == 'first':
                    # One output frame per trajectory step.
                    all_save_frames.append(self._to_output_frame(curr_img_overlay))

            if mode != 'first':
                # One output frame per video frame.
                all_save_frames.append(self._to_output_frame(curr_img_overlay))

        print(f'Finished rendering with mode: {mode}')
        return all_save_frames

    def process_episode(self, episode_name: str):
        """
        Loads data and orchestrates the visualization process for a single episode.

        The annotation is read from ``<LABEL_ROOT>/<episode_name>.npy`` and the
        video from ``<VIDEO_ROOT>/<video_name>.mp4``, where ``video_name`` is the
        episode name without its leading dataset token and its trailing two
        ``_``-separated tokens. The result is written to
        ``<SAVE_PATH>/<episode_name>.mp4``.

        The episode is skipped (with a message) when the label file is missing
        or no frames can be read from the video.
        """
        print(f'\nProcessing episode: {episode_name}')

        name_parts = episode_name.split('_')
        dataset_name = name_parts[0]
        ep_name = name_parts[-2] + '_' + name_parts[-1]
        video_name = episode_name.replace(f'{dataset_name}_', '').replace(f'_{ep_name}', '')
        video_path = os.path.join(self.config.VIDEO_ROOT, f'{video_name}.mp4')
        label_path = os.path.join(self.config.LABEL_ROOT, episode_name + '.npy')

        if not os.path.exists(label_path):
            print(f'Episode file {label_path} does not exist, skipping...')
            return

        # NOTE(security): allow_pickle=True executes arbitrary pickled code;
        # only load annotation files from trusted sources.
        episode_info = np.load(label_path, allow_pickle=True).item()

        start_frame, end_frame = get_frame_interval(episode_info)
        R_w2c, t_w2c, normalized_intrinsics = get_camera_info(episode_info)
        caption_left, caption_right, hand_type = get_caption_info(episode_info)
        (verts_left_worldspace, left_hand_mask), (verts_right_worldspace, right_hand_mask) = \
            get_hand_labels(episode_info, self.mano)

        # Always release the capture handle, even if decoding fails.
        cap = cv2.VideoCapture(video_path)
        try:
            video_frames = read_video_frames(cap, start_frame=start_frame, end_frame=end_frame, interval=1)
            resize_video_frames = resize_frames_to_long_side(video_frames, self.config.RENDER_SIZE_LONG_SIDE)
        finally:
            cap.release()

        if len(resize_video_frames) == 0:
            print(f'No frames could be read from {video_path}, skipping...')
            return

        H, W, _ = resize_video_frames[0].shape

        # Scale the normalized intrinsics to the resized resolution:
        # row 0 by the width, row 1 by the height.
        intrinsics_denorm = normalized_intrinsics.copy()
        intrinsics_denorm[0] *= W
        intrinsics_denorm[1] *= H
        fx_exo = intrinsics_denorm[0, 0]
        fy_exo = intrinsics_denorm[1, 1]

        renderer = Renderer(W, H, (fx_exo, fy_exo), 'cuda')

        hand_traj_wordspace = (verts_left_worldspace, verts_right_worldspace)
        hand_mask = (left_hand_mask, right_hand_mask)
        extrinsics = (R_w2c, t_w2c)

        # One list of rendered frames per mode, in self.all_modes order.
        all_rendered_frames = [
            self._render_hand_trajectory(
                resize_video_frames,
                hand_traj_wordspace,
                hand_mask,
                extrinsics,
                renderer,
                mode=mode
            )
            for mode in self.all_modes
        ]

        num_frames = len(all_rendered_frames[0])

        # The annotated ("primary") hand shows its first caption throughout;
        # the other hand's caption is looked up per frame by interval.
        caption_primary = caption_right if hand_type == 'right' else caption_left
        caption_opposite = caption_left if hand_type == 'right' else caption_right
        opposite_intervals = [interval for _, interval in caption_opposite]
        overlay_text_primary = caption_primary[0][0]

        final_save_frames = []
        for frame_idx in range(num_frames):
            # Place the panels of all modes side by side.
            curr_img_overlay = np.concatenate(
                [mode_frames[frame_idx] for mode_frames in all_rendered_frames],
                axis=1
            )

            opposite_idx = find_caption_index(frame_idx, opposite_intervals)
            overlay_text_opposite = caption_opposite[opposite_idx][0] if opposite_idx is not None else 'None.'

            overlay_text_full = generate_overlay_text(
                overlay_text_primary,
                overlay_text_opposite,
                hand_type
            )
            add_overlay_text(curr_img_overlay, overlay_text_full)

            final_save_frames.append(curr_img_overlay)

        os.makedirs(self.config.SAVE_PATH, exist_ok=True)
        save_file = f'{self.config.SAVE_PATH}/{episode_name}.mp4'
        save_to_video(final_save_frames, save_file, fps=self.config.FPS)
        print(f'\nSuccessfully saved episode to {save_file}')
| |
|
def find_caption_index(frame_index: int, intervals: list[tuple[int, int]]) -> Optional[int]:
    """
    Return the position of the first interval containing ``frame_index``.

    Both interval bounds are inclusive. ``None`` is returned when no interval
    matches.
    """
    return next(
        (
            position
            for position, (lower, upper) in enumerate(intervals)
            if lower <= frame_index <= upper
        ),
        None,
    )
| |
|
def generate_hand_colors(T: int, left_cmap: str, right_cmap: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Build one RGB colour per frame for each hand by sampling named colormaps.

    The colormaps are sampled at ``T`` evenly spaced positions from 0 to 0.95.
    Each returned array has shape (T, 3): the alpha column of the colormap
    output is dropped.
    """
    sample_positions = np.linspace(0, 0.95, T)

    def _sample_rgb(cmap_name):
        # The colormap yields RGBA rows; keep only the first three columns.
        return mpl.colormaps.get_cmap(cmap_name)(sample_positions)[:, :3]

    return _sample_rgb(left_cmap), _sample_rgb(right_cmap)
| |
|
def get_frame_interval(episode_info: dict) -> tuple[int, int]:
    """
    Return the ``(start, end)`` frame range covered by the episode.

    ``start`` is the first entry of ``episode_info['video_decode_frame']`` and
    ``end`` is the last entry plus one, so the range is half-open.
    """
    decoded_frames = episode_info['video_decode_frame']
    first_frame, last_frame = decoded_frames[0], decoded_frames[-1]
    return first_frame, last_frame + 1
| |
|
def normalize_camera_intrinsics(intrinsics: np.ndarray) -> np.ndarray:
    """
    Normalizes intrinsics based on the assumption that the principal point
    is at the image center (image size is 2*cx, 2*cy).

    Row 0 is divided by ``2 * cx`` and row 1 by ``2 * cy``, where ``cx`` and
    ``cy`` are read from column 2 of those rows. The input is not modified.

    Integer-valued input is promoted to float64 first, because the in-place
    true division cannot be written back into an integer array. Floating
    dtypes are preserved.
    """
    intrinsics = np.asarray(intrinsics)
    if np.issubdtype(intrinsics.dtype, np.floating):
        normalized_intrinsics = intrinsics.copy()
    else:
        normalized_intrinsics = intrinsics.astype(np.float64)

    # The divisor is evaluated to a scalar before the row is updated, so
    # dividing the row that contains it is safe.
    normalized_intrinsics[0] /= normalized_intrinsics[0, 2] * 2
    normalized_intrinsics[1] /= normalized_intrinsics[1, 2] * 2
    return normalized_intrinsics
| |
|
def get_camera_info(episode_info: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the episode's world-to-camera extrinsics and normalize its intrinsics.

    Returns:
        ``(R_w2c, t_w2c, normalized_intrinsics)`` where ``R_w2c`` is the
        ``[:, :3, :3]`` block of ``episode_info['extrinsics']``, ``t_w2c`` is
        its ``[:, :3, 3:]`` block (the column axis is kept), and the
        intrinsics are passed through ``normalize_camera_intrinsics``.
        All outputs are copies of the stored arrays.
    """
    world_to_cam = episode_info['extrinsics']
    rotation = world_to_cam[:, :3, :3].copy()
    translation = world_to_cam[:, :3, 3:].copy()

    raw_intrinsics = episode_info['intrinsics'].copy()
    return rotation, translation, normalize_camera_intrinsics(raw_intrinsics)
| |
|
def get_caption_info(episode_info: dict) -> tuple[list, list, str]:
    """
    Collect the per-hand captions and the annotated hand type.

    A hand whose caption list is missing or empty gets a single placeholder
    caption ``'None.'`` with the interval (0, 10000), so that it is treated
    as covering every frame.

    Returns:
        ``(caption_left, caption_right, hand_type)``.
    """
    hand_type = episode_info['anno_type']
    captions = episode_info['text']

    # `or` substitutes the placeholder for any empty caption list.
    caption_right = captions.get('right', []) or [['None.', (0, 10000)]]
    caption_left = captions.get('left', []) or [['None.', (0, 10000)]]

    return caption_left, caption_right, hand_type
| |
|
def get_hand_labels(episode_info: dict, mano: MANO):
    """
    Compute world-space hand vertices for both hands of an episode.

    Each hand's labels are run through ``process_single_hand_labels``; the
    left hand is processed with ``is_left=True``.

    Returns:
        ``((verts_left, left_mask), (verts_right, right_mask))`` where each
        mask is that hand's ``'kept_frames'`` entry.
    """
    labels_left, labels_right = episode_info['left'], episode_info['right']

    mask_left = labels_left['kept_frames']
    verts_left = process_single_hand_labels(labels_left, mask_left, mano, is_left=True)[0]

    mask_right = labels_right['kept_frames']
    verts_right = process_single_hand_labels(labels_right, mask_right, mano, is_left=False)[0]

    return (verts_left, mask_left), (verts_right, mask_right)
| |
|
def process_single_hand_labels(hand_labels: dict, hand_mask: np.ndarray, mano: MANO, is_left: bool = False):
    """
    Helper function to compute MANO vertices for a single hand (left or right).

    Args:
        hand_labels: Per-hand annotation dict providing 'transl_worldspace',
            'global_orient_worldspace', 'beta' and 'hand_pose'. It is not
            modified.
        hand_mask: Per-frame validity flags; frames whose flag equals 0 get
            an identity hand pose before the MANO forward pass.
        mano: MANO model, called with ``betas``, ``hand_pose`` and
            ``global_orient``; tensors are placed on CUDA, so the model is
            expected to live there as well.
        is_left: When True, the model output is mirrored along the x axis.

    Returns:
        ``(verts_worldspace, wrist_joint)``: the vertices placed in world
        space, and joint 0 of the (possibly mirrored) model output.
    """
    T = len(hand_mask)

    wrist_worldspace = hand_labels['transl_worldspace'].reshape(-1, 1, 3)
    wrist_orientation = hand_labels['global_orient_worldspace']
    beta = hand_labels['beta']
    # Work on a copy: the masked-frame overwrite below must not leak back
    # into the caller's annotation dict.
    pose = hand_labels['hand_pose'].copy()

    # Replace the pose of invalid frames with identity rotations; the 3x3
    # identity broadcasts over every joint of every masked frame.
    mask_indices = (np.asarray(hand_mask) == 0)
    if np.any(mask_indices):
        pose[mask_indices] = np.eye(3, dtype=pose.dtype)

    # Inference only: no autograd graph is needed, and the outputs can be
    # converted to NumPy directly.
    with torch.no_grad():
        # The single shape vector is shared by all T frames.
        beta_torch = torch.from_numpy(beta).float().cuda().unsqueeze(0).repeat(T, 1)
        pose_torch = torch.from_numpy(pose).float().cuda()

        # The model is evaluated with an identity global rotation; the real
        # wrist orientation is applied below.
        global_rot_placeholder = torch.eye(3).float().unsqueeze(0).unsqueeze(0).cuda().repeat(T, 1, 1, 1)

        mano_out = mano(betas=beta_torch, hand_pose=pose_torch, global_orient=global_rot_placeholder)

    verts = mano_out.vertices.cpu().numpy()
    joints = mano_out.joints.cpu().numpy()

    # Mirror along x to turn the model output into a left hand.
    if is_left:
        verts[:, :, 0] *= -1
        joints[:, :, 0] *= -1

    # Centre the vertices on joint 0 (the wrist), rotate them by the wrist
    # orientation, then translate to the wrist's world position.
    verts_worldspace = (
        wrist_orientation @
        (verts - joints[:, 0][:, None]).transpose(0, 2, 1)
    ).transpose(0, 2, 1) + wrist_worldspace

    return verts_worldspace, joints[:, 0]
| |
|
def generate_overlay_text(overlay_text: str, overlay_text_opposite: str, hand_type: str) -> str:
    """
    Build the ``'Left: ... | Right: ...'`` caption line.

    ``overlay_text`` belongs to the primary hand named by ``hand_type``; any
    value other than ``'right'`` is treated as a left primary hand.
    """
    if hand_type == 'right':
        left_text, right_text = overlay_text_opposite, overlay_text
    else:
        left_text, right_text = overlay_text, overlay_text_opposite
    return f'Left: {left_text} | Right: {right_text}'