# InternVideo3-8B-Instruct / inference_demo.py
# yanziang's picture
# Upload folder using huggingface_hub
# e3bb923 verified
"""
InternVideo3-8B-Instruct inference demo.

Supports:
- Text-only conversation (Example 1)
- Video understanding (Example 2)
- Image understanding (Example 3, commented out by default)
"""
import torch
import time
from transformers import AutoModelForCausalLM, AutoProcessor
from qwen_vl_utils import process_vision_info
# ============ Model Loading ============
# Checkpoint directory; both the model weights and the processor are read from it.
model_path = "/mnt/shared-storage-user/yanziang/HF_toupload/InternVideo3-8B-Instruct"

# Loader options gathered in one place so they are easy to tweak.
_model_load_kwargs = {
    "dtype": torch.bfloat16,
    "attn_implementation": "sdpa",
    "device_map": "cuda:0",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_path, **_model_load_kwargs)

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Alternative kept for reference: take the processor from a Qwen3-VL-8B-Instruct checkpoint.
# processor = AutoProcessor.from_pretrained("/mnt/shared-storage-user/sfteval/sfteval_models/Qwen3-VL-8B-Instruct/",trust_remote_code=True)
# ============ Example 1: Text-only ============
print("=" * 50)
print("Example 1: Text-only")
print("=" * 50)

# Single-turn conversation whose only content part is text.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please introduce yourself."},
        ],
    }
]

# Render the conversation to a prompt string first (tokenize=False); the
# processor call below turns that string into model inputs.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)
inputs = processor(text=text, images=None, videos=None, do_resize=False, return_tensors="pt")
inputs = inputs.to(model.device)

# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()
gen_output = model.generate(
    **inputs,
    max_new_tokens=1024,
    use_cache=True,
)
# Drop the first len(in_ids) tokens of every output sequence so that only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# Stop the clock before printing so console I/O is not counted as inference time.
elapsed = time.perf_counter() - start_time

print(output_text[0])
print(f"Time taken: {elapsed:.2f}s\n")
# ============ Example 2: Video Understanding ============
print("=" * 50)
print("Example 2: Video Understanding")
print("=" * 50)

video_path = "/mnt/shared-storage-user/yanziang/space_woaudio.mp4"  # Replace with your video path
fps = 1
# Base pixel budgets; they are scaled below when building the size override.
min_pixels = 128 * 32 * 32
max_pixels = 128 * 32 * 32

# One user turn: a video part (with its sampling fps) followed by the question.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                "fps": fps,
            },
            {"type": "text", "text": "Please describe this video in detail."},
        ],
    }
]

# Override the video processor's size limits in place; the new value stays on
# the shared `processor` object for anything that runs afterwards.
# NOTE(review): presumably these are pixel budgets for the whole clip rather
# than per-frame edge lengths - confirm against the video processor docs.
processor.video_processor.size = {"longest_edge": max_pixels * 512, "shortest_edge": min_pixels * 32}

# Here the chat template tokenizes directly (tokenize=True, return_dict=True),
# so no separate processor(...) call is needed. `fps` is passed both in the
# message and as a keyword argument.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    fps=fps,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# perf_counter() is the monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()
gen_output = model.generate(
    **inputs,
    max_new_tokens=1024,
    use_cache=True,
)
# Drop the first len(in_ids) tokens of every output sequence so that only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# Stop the clock before printing so console I/O is not counted as inference time.
elapsed = time.perf_counter() - start_time

print(output_text[0])
print(f"Time taken: {elapsed:.2f}s\n")
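# NOTE: Example 3 is disabled. To enable it, strip one leading "# " from every line of this
# section; it relies on `process_vision_info`, imported from `qwen_vl_utils` at the top of the file.
# # ============ Example 3: Image Understanding ============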
# print("=" * 50)
# print("Example 3: Image Understanding")
# print("=" * 50)
# image_path = "/mnt/shared-storage-user/yanziang/demo.jpeg" # Replace with your image path
# messages = [
# {
# "role": "user",
# "content": [
# {
# "type": "image",
# "image": image_path,
# },
# {"type": "text", "text": "Please describe this image in detail."},
# ],
# }
# ]
# text = processor.apply_chat_template(
# messages,
# tokenize=False,
# add_generation_prompt=True,
# enable_thinking=True,
# )
# images, videos, video_kwargs = process_vision_info(
# messages, image_patch_size=16, return_video_kwargs=True, return_video_metadata=True
# )
# inputs = processor(
# text=text,
# images=images,
# videos=None,
# do_resize=False,
# return_tensors="pt",
# )
# inputs = inputs.to(model.device)
# start_time = time.time()
# gen_output = model.generate(
# **inputs,
# max_new_tokens=1024,
# use_cache=True,
# )
# generated_ids_trimmed = [
# out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, gen_output)
# ]
# output_text = processor.batch_decode(
# generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
# )
# print(output_text[0])
# print(f"Time taken: {time.time() - start_time:.2f}s\n")