Video-Text-to-Text
Transformers
Safetensors
English
Chinese
internvideo3
text-generation
video-understanding
multimodal
long-video
agent
custom_code
Instructions to use yanziang/InternVideo3-8B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use yanziang/InternVideo3-8B-Instruct with Transformers:
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("yanziang/InternVideo3-8B-Instruct", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """ | |
| InternVideo3-8B-Instruct Inference Demo | |
| Supports: | |
| - Text-only conversation | |
| - Video understanding | |
| - Image understanding | |
| """ | |
| import torch | |
| import time | |
| from transformers import AutoModelForCausalLM, AutoProcessor | |
| from qwen_vl_utils import process_vision_info | |
# ============ Model Loading ============
import os

# Prefer the author's local checkpoint directory when it is present; otherwise
# fall back to the Hugging Face Hub repository id so the demo also runs on
# machines that do not have that directory mounted.
_local_model_path = "/mnt/shared-storage-user/yanziang/HF_toupload/InternVideo3-8B-Instruct"
_hub_model_id = "yanziang/InternVideo3-8B-Instruct"
model_path = _local_model_path if os.path.isdir(_local_model_path) else _hub_model_id

# Load the causal LM in bfloat16 with SDPA attention on the first CUDA device.
# trust_remote_code=True is passed because the page tags the repo "custom_code".
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map="cuda:0",
    trust_remote_code=True,
)

# The processor is loaded from the same location as the model weights.
processor = AutoProcessor.from_pretrained(
    model_path,
    trust_remote_code=True,
)
# Disabled alternative kept from the original script: load the processor from
# a separate Qwen3-VL-8B-Instruct checkpoint instead.
# processor = AutoProcessor.from_pretrained("/mnt/shared-storage-user/sfteval/sfteval_models/Qwen3-VL-8B-Instruct/",trust_remote_code=True)
# ============ Shared generation helper ============
def generate_and_report(inputs, max_new_tokens=1024):
    """Generate a reply for ``inputs``, print it, then print the elapsed time.

    Uses the module-level ``model`` and ``processor``.

    Args:
        inputs: Processor output already moved to ``model.device``. It is
            unpacked as keyword arguments into ``model.generate`` and its
            ``input_ids`` are used to strip the prompt from the output.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded text of the first sequence in the batch.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    gen_output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print(output_text[0])
    print(f"Time taken: {time.perf_counter() - start_time:.2f}s\n")
    return output_text[0]


# ============ Example 1: Text-only ============
print("=" * 50)
print("Example 1: Text-only")
print("=" * 50)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please introduce yourself."},
        ],
    }
]

# Render the chat template to a prompt string first, then tokenize it with the
# processor (no image or video inputs for this example).
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)
inputs = processor(text=text, images=None, videos=None, do_resize=False, return_tensors="pt")
inputs = inputs.to(model.device)

generate_and_report(inputs)

# ============ Example 2: Video Understanding ============
print("=" * 50)
print("Example 2: Video Understanding")
print("=" * 50)

video_path = "/mnt/shared-storage-user/yanziang/space_woaudio.mp4"  # Replace with your video path
fps = 1
# Both bounds are set to the same value in this demo.
min_pixels = 128 * 32 * 32
max_pixels = 128 * 32 * 32

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "fps": fps,
            },
            {"type": "text", "text": "Please describe this video in detail."},
        ],
    }
]

# Override the video processor's size limits in place; this mutation persists
# on `processor` for any later calls.
# NOTE(review): presumably these are pixel budgets for the whole clip rather
# than per frame -- confirm against the model's video processor.
processor.video_processor.size = {"longest_edge": max_pixels * 512, "shortest_edge": min_pixels * 32}

# Here the chat template tokenizes and returns model-ready tensors directly.
# NOTE(review): unlike Example 1, enable_thinking is not passed -- confirm
# whether that difference is intentional.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    fps=fps,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

generate_and_report(inputs)
| # # ============ Example 3: Image Understanding ============ | |
| # print("=" * 50) | |
| # print("Example 3: Image Understanding") | |
| # print("=" * 50) | |
| # image_path = "/mnt/shared-storage-user/yanziang/demo.jpeg" # Replace with your image path | |
| # messages = [ | |
| # { | |
| # "role": "user", | |
| # "content": [ | |
| # { | |
| # "type": "image", | |
| # "image": image_path, | |
| # }, | |
| # {"type": "text", "text": "Please describe this image in detail."}, | |
| # ], | |
| # } | |
| # ] | |
| # text = processor.apply_chat_template( | |
| # messages, | |
| # tokenize=False, | |
| # add_generation_prompt=True, | |
| # enable_thinking=True, | |
| # ) | |
| # images, videos, video_kwargs = process_vision_info( | |
| # messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True | |
| # ) | |
| # inputs = processor( | |
| # text=text, | |
| # images=images, | |
| # videos=None, | |
| # do_resize=False, | |
| # return_tensors="pt", | |
| # ) | |
| # inputs = inputs.to(model.device) | |
| # start_time = time.time() | |
| # gen_output = model.generate( | |
| # **inputs, | |
| # max_new_tokens=1024, | |
| # use_cache=True, | |
| # ) | |
| # generated_ids_trimmed = [ | |
| # out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output) | |
| # ] | |
| # output_text = processor.batch_decode( | |
| # generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False | |
| # ) | |
| # print(output_text[0]) | |
| # print(f"Time taken: {time.time() - start_time:.2f}s\n") | |