| |
|
|
| |
| |
| |
| |
|
|
| from pathlib import Path |
| from types import SimpleNamespace |
|
|
| import torchvision.transforms as transforms |
| from PIL import Image |
|
|
| from m4.models.vopt.modeling_vopt import VOPTConfig, VOPTForCausalLM |
| from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask |
| from m4.training.utils import get_tokenizer |
|
|
|
|
# Name of the tiny checkpoint; the same string is used as the output directory.
mname_tiny = "tiny-random-vopt-clip"

# Make sure the output directory exists. Missing parents are created and an
# already-existing directory is not an error.
path = Path(mname_tiny)
path.mkdir(exist_ok=True, parents=True)
|
|
| |
# Number of additional vocabulary entries, stored in the model config below.
additional_vocab_size = 2

# Field overrides applied on top of the default VOPTConfig.
tiny_overrides = {
    "ffn_dim": 64,
    "hidden_size": 16,
    "max_position_embeddings": 128,
    "num_attention_heads": 4,
    "num_hidden_layers": 2,
    "word_embed_proj_dim": 16,
    "max_new_tokens": 100,
    "use_resampler": True,
    "resampler_depth": 2,
    "resampler_head_dim": 8,
    "resampler_n_heads": 2,
    "resampler_n_latents": 16,
    "vision_embed_dim": 32,
    "vision_image_size": 30,
    "vision_model_name": "hf-internal-testing/tiny-random-clip",
    "vision_model_params": "{}",
    "vocab_size": 50265,
    "additional_vocab_size": additional_vocab_size,
}

config = VOPTConfig()
config.update(tiny_overrides)

# Instantiate the model from the configuration (no pretrained weights are
# requested here).
model = VOPTForCausalLM.from_config(config)
| |
| |
|
|
# Tokenizer settings, exposed as attributes. The string-valued entries are
# passed verbatim to get_tokenizer, which is responsible for interpreting them.
tokenizer_config = SimpleNamespace(
    tokenizer_add_special_tokens="{}",
    tokenizer_add_tokens=(
        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
        " lstrip=False)]"
    ),
    tokenizer_name="facebook/opt-13b",
    tokenizer_params='{"use_fast":True}',
)

# Build the tokenizer, taking the vocabulary sizes from the model config.
tokenizer = get_tokenizer(
    tokenizer_name=tokenizer_config.tokenizer_name,
    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
    tokenizer_params=tokenizer_config.tokenizer_params,
    additional_vocab_size=model.config.additional_vocab_size,
    model_vocab_size=model.config.vocab_size,
)

# Sanity check: the image placeholder must be part of the vocabulary.
assert "<image>" in tokenizer.get_vocab()
|
|
| |
# Smoke test: run one generation pass on a dummy text + image input.
# The prompt holds a single <image> placeholder wrapped in
# <fake_token_around_image> markers.
query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
query_tokens = tokenizer(query, return_tensors="pt")

num_images_per_ex = 1
# A blank 30x30 RGB image (same size as vision_image_size in the config).
# ToTensor gives a 3-D tensor; repeat(1, 1, 1, 1) prepends one dimension and
# unsqueeze(0) prepends another, giving a 5-D tensor.
# NOTE(review): presumably (batch, num_images, channels, height, width) - confirm.
pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)

# Keyword arguments for generate(). Named `inputs` so the builtin `input` is
# not shadowed; each key appears exactly once.
inputs = {
    "input_ids": query_tokens["input_ids"],
    "attention_mask": query_tokens["attention_mask"],
    "pixel_values": pixel_values,
    "image_attention_mask": image_attention_mask,
}

# Generate and decode; the decoded text is not used further, the calls only
# need to succeed.
out_gen = model.generate(**inputs)
text = tokenizer.batch_decode(out_gen)
| |
|
|
| |
# Convert the model to half precision before writing it out, then save the
# model and the tokenizer side by side in the output directory.
model.half()
for artifact in (model, tokenizer):
    artifact.save_pretrained(path)

# Reload from disk to check that the saved checkpoint can be loaded.
model = VOPTForCausalLM.from_pretrained(path)

print(f"Generated {mname_tiny} - Upload the generated folder to the hub")
|
|