Instructions to use jlee-larr/dynaflip-base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use jlee-larr/dynaflip-base with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="jlee-larr/dynaflip-base", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("jlee-larr/dynaflip-base", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """DynaFLIP processor — combines image processor and tokenizer.""" | |
| from typing import List, Optional, Union | |
| from transformers import AutoTokenizer | |
| from transformers.processing_utils import ProcessorMixin | |
| from transformers.image_processing_utils import BatchFeature | |
class DynaFLIPProcessor(ProcessorMixin):
    """Processor for DynaFLIP models.

    Wraps an image processor and a tokenizer behind a single callable,
    similar to SiglipProcessor. Both components are resolved through the
    ``Auto*`` classes named in ``image_processor_class`` and
    ``tokenizer_class``.

    Example:
        >>> processor = DynaFLIPProcessor.from_pretrained("username/dynaflip-base")
        >>> inputs = processor(images=img, text="pick up the cup", return_tensors="pt")
        >>> inputs.keys()  # dict_keys(['pixel_values', 'input_ids', 'attention_mask'])
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Store the two sub-processors on the instance.

        Args:
            image_processor: Image processor used for the ``images`` input.
            tokenizer: Tokenizer used for the ``text`` input.
            **kwargs: Accepted for call-site compatibility and ignored; they
                are intentionally not forwarded to ``ProcessorMixin``.
        """
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)

    def __call__(
        self,
        images=None,
        text: Optional[Union[str, List[str]]] = None,
        padding: Union[bool, str] = True,
        truncation: bool = True,
        max_length: int = 77,
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> BatchFeature:
        """Process images and/or text for DynaFLIP.

        Args:
            images: Single image or list of images.
            text: Single string or list of strings.
            padding: Tokenizer padding strategy.
            truncation: Whether to truncate text.
            max_length: Maximum text length.
            return_tensors: "pt" for PyTorch, "np" for numpy.
            **kwargs: Extra options forwarded to the image processor only;
                the tokenizer does not receive them.

        Returns:
            BatchFeature holding the image processor's outputs and/or the
            tokenizer's outputs, merged into one mapping.

        Raises:
            ValueError: If neither ``images`` nor ``text`` is provided.
        """
        if images is None and text is None:
            raise ValueError("You must provide at least one of `images` or `text`.")

        encoding = BatchFeature()

        if images is not None:
            image_features = self.image_processor(
                images, return_tensors=return_tensors, **kwargs
            )
            encoding.update(image_features)

        if text is not None:
            # Always hand the tokenizer a batch, even for a single prompt.
            if isinstance(text, str):
                text = [text]
            text_features = self.tokenizer(
                text,
                return_tensors=return_tensors,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
            )
            encoding.update(text_features)

        return encoding

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """Return the combined input names of both sub-processors.

        Exposed as a property to match ``ProcessorMixin.model_input_names``,
        which callers read as an attribute. Image processor names come
        first; duplicates are dropped while keeping first-seen order.
        """
        image_names = self.image_processor.model_input_names
        tokenizer_names = self.tokenizer.model_input_names
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys([*image_names, *tokenizer_names]))