jlee-larr
/

dynaflip-base

Zero-Shot Image Classification

feature-extraction

Model card Files Files and versions

dynaflip-base / processing_dynaflip.py

jlee-larr's picture

Upload folder using huggingface_hub

988579a verified 5 days ago

history blame contribute delete

3.05 kB

	"""DynaFLIP processor — combines image processor and tokenizer."""

	from typing import List, Optional, Union

	from transformers import AutoTokenizer
	from transformers.processing_utils import ProcessorMixin
	from transformers.image_processing_utils import BatchFeature


	class DynaFLIPProcessor(ProcessorMixin):
	    """Processor for DynaFLIP models.

	    Combines DynaFLIPImageProcessor (for images) and T5Tokenizer (for text)
	    into a single processor, similar to SiglipProcessor.

	    Example:
	        >>> processor = DynaFLIPProcessor.from_pretrained("username/dynaflip-base")
	        >>> inputs = processor(images=img, text="pick up the cup", return_tensors="pt")
	        >>> inputs.keys()  # dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
	    """

	    # Names of the sub-components and the classes used to load them.
	    attributes = ["image_processor", "tokenizer"]
	    image_processor_class = "AutoImageProcessor"
	    tokenizer_class = "AutoTokenizer"

	    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
	        """Store the image processor and tokenizer on the processor.

	        Args:
	            image_processor: Component used for the `images` input.
	            tokenizer: Component used for the `text` input.
	            **kwargs: Accepted for call-site compatibility; intentionally
	                not forwarded to the base class.
	        """
	        super().__init__(image_processor=image_processor, tokenizer=tokenizer)

	    def __call__(
	        self,
	        images=None,
	        text: Optional[Union[str, List[str]]] = None,
	        padding: Union[bool, str] = True,
	        truncation: bool = True,
	        max_length: int = 77,
	        return_tensors: Optional[str] = None,
	        **kwargs,
	    ) -> BatchFeature:
	        """Process images and/or text for DynaFLIP.

	        Args:
	            images: Single image or list of images.
	            text: Single string or list of strings. A single string is
	                wrapped in a one-element list before tokenization.
	            padding: Tokenizer padding strategy.
	            truncation: Whether to truncate text.
	            max_length: Maximum text length.
	            return_tensors: "pt" for PyTorch, "np" for numpy.
	            **kwargs: Extra options forwarded to the image processor only;
	                they are not passed to the tokenizer.

	        Returns:
	            BatchFeature holding the image processor outputs and/or the
	            tokenizer outputs, merged into one mapping.

	        Raises:
	            ValueError: If both `images` and `text` are None.
	        """
	        if images is None and text is None:
	            raise ValueError("You must provide at least one of `images` or `text`.")

	        encoding = BatchFeature()

	        if images is not None:
	            image_features = self.image_processor(
	                images, return_tensors=return_tensors, **kwargs
	            )
	            encoding.update(image_features)

	        if text is not None:
	            # Always tokenize a batch so the output layout is the same for
	            # a single string and for a list of strings.
	            if isinstance(text, str):
	                text = [text]
	            text_features = self.tokenizer(
	                text,
	                return_tensors=return_tensors,
	                padding=padding,
	                truncation=truncation,
	                max_length=max_length,
	            )
	            encoding.update(text_features)

	        return encoding

	    def batch_decode(self, *args, **kwargs):
	        """Forward all positional and keyword arguments to `tokenizer.batch_decode`."""
	        return self.tokenizer.batch_decode(*args, **kwargs)

	    def decode(self, *args, **kwargs):
	        """Forward all positional and keyword arguments to `tokenizer.decode`."""
	        return self.tokenizer.decode(*args, **kwargs)

	    @property
	    def model_input_names(self):
	        """Return image-processor input names followed by tokenizer input names.

	        Duplicates are dropped while keeping first-seen order.
	        """
	        image_names = self.image_processor.model_input_names
	        tokenizer_names = self.tokenizer.model_input_names
	        # Unpack instead of `+` so a list/tuple mix between the two
	        # components does not raise; dict.fromkeys de-duplicates in order.
	        return list(dict.fromkeys([*image_names, *tokenizer_names]))