dynaflip-base / processing_dynaflip.py
jlee-larr's picture
Upload folder using huggingface_hub
988579a verified
"""DynaFLIP processor — combines image processor and tokenizer."""
from typing import List, Optional, Union
from transformers import AutoTokenizer
from transformers.processing_utils import ProcessorMixin
from transformers.image_processing_utils import BatchFeature
class DynaFLIPProcessor(ProcessorMixin):
    """Joint image + text pre-processor for DynaFLIP models.

    Wraps an image processor and a tokenizer behind one callable so that a
    single call can produce every model input. The original author describes
    the pairing as DynaFLIPImageProcessor (images) plus T5Tokenizer (text),
    in the spirit of SiglipProcessor; the concrete classes are resolved via
    the ``AutoImageProcessor`` / ``AutoTokenizer`` names declared below.

    Example:
        >>> processor = DynaFLIPProcessor.from_pretrained("username/dynaflip-base")
        >>> inputs = processor(images=img, text="pick up the cup", return_tensors="pt")
        >>> inputs.keys()  # dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    # Names of the sub-components managed by ProcessorMixin, and the class
    # names used to resolve each of them.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Register the two sub-components with ``ProcessorMixin``.

        Args:
            image_processor: Component called on ``images`` in ``__call__``.
            tokenizer: Component called on ``text`` in ``__call__``.
            **kwargs: Accepted but not forwarded to the parent constructor.
        """
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)

    def __call__(
        self,
        images=None,
        text: Optional[Union[str, List[str]]] = None,
        padding: Union[bool, str] = True,
        truncation: bool = True,
        max_length: int = 77,
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> BatchFeature:
        """Prepare images and/or text as model inputs.

        Args:
            images: Single image or list of images; handed to the image
                processor unchanged.
            text: Single string or list of strings. A lone string is wrapped
                in a one-element list before tokenization.
            padding: Tokenizer padding strategy.
            truncation: Whether the tokenizer truncates the text.
            max_length: Maximum text length passed to the tokenizer.
            return_tensors: "pt" for PyTorch, "np" for numpy. Passed to both
                the image processor and the tokenizer.
            **kwargs: Extra options forwarded to the image processor only.

        Returns:
            BatchFeature holding the image processor's outputs and/or the
            tokenizer's outputs, merged in that order.

        Raises:
            ValueError: If both ``images`` and ``text`` are ``None``.
        """
        has_images = images is not None
        has_text = text is not None
        if not (has_images or has_text):
            raise ValueError("You must provide at least one of `images` or `text`.")

        batch = BatchFeature()

        # Images go first so their keys precede the text keys in the result.
        if has_images:
            pixel_batch = self.image_processor(
                images, return_tensors=return_tensors, **kwargs
            )
            batch.update(pixel_batch)

        if has_text:
            # The tokenizer is always given a list, even for a single prompt.
            prompts = [text] if isinstance(text, str) else text
            tokenizer_options = {
                "return_tensors": return_tensors,
                "padding": padding,
                "truncation": truncation,
                "max_length": max_length,
            }
            token_batch = self.tokenizer(prompts, **tokenizer_options)
            batch.update(token_batch)

        return batch

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Input names of both components, image names first, without duplicates."""
        combined = (
            self.image_processor.model_input_names
            + self.tokenizer.model_input_names
        )
        # dict.fromkeys drops repeated names while keeping first-seen order.
        return list(dict.fromkeys(combined))