import copy

from transformers import AutoConfig, Qwen3Config
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

from .configuration_dinov3_vit import DINOv3ViTConfig

logger = logging.get_logger(__name__)


class ProjectorConfig(PretrainedConfig):
    """Configuration of the projector placed between the vision and LLM sides.

    Args:
        visual_hidden_size: Hidden size on the vision side of the projector.
        llm_hidden_size: Hidden size on the language-model side of the projector.
        depth: Projector depth (presumably the number of layers; confirm
            against the projector implementation).
        hidden_act: Name of the activation function.
        bias: Whether the projector uses bias terms.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "projector"
    _auto_class = "AutoConfig"

    def __init__(
        self,
        visual_hidden_size=4096,
        llm_hidden_size=4096,
        depth=2,
        hidden_act="gelu",
        bias=True,
        **kwargs,
    ):
        self.visual_hidden_size = visual_hidden_size
        self.llm_hidden_size = llm_hidden_size
        self.depth = depth
        self.hidden_act = hidden_act
        self.bias = bias
        super().__init__(**kwargs)


class VectorLLMConfig(PretrainedConfig):
    """Composite configuration bundling a DINOv3 ViT, a Qwen3 LLM and a projector.

    Args:
        vision_config: Settings of the vision encoder, either a dict of
            ``DINOv3ViTConfig`` keyword arguments, a config instance, or
            ``None`` for the defaults.
        llm_config: Settings of the language model, either a dict of
            ``Qwen3Config`` keyword arguments, a config instance, or ``None``
            for the defaults.
        regression_size: Stored as-is on the config.
        projector_depth: Depth used for the projector when no serialized
            ``projector_config`` is supplied.
        pixel_idx: Stored as-is on the config.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``. A
            ``projector_config`` entry (as written by :meth:`to_dict`) is
            consumed here to restore the projector's ``depth``,
            ``hidden_act`` and ``bias``.
    """

    model_type = 'vectorllm'
    # Must be a plain string; a trailing comma here would silently turn it
    # into a one-element tuple.
    processor_class = "VectorLLMProcessor"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        regression_size=(128, 128),
        projector_depth=2,
        pixel_idx=0,
        **kwargs,
    ):
        # A serialized config carries the projector settings under
        # "projector_config". Take it out before the base class stores the raw
        # value, so a non-default depth survives a save/load round-trip.
        saved_projector = kwargs.pop("projector_config", None)

        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info('vision_config is None. Initializing the DinoV3Config with default values.')

        if llm_config is None:
            llm_config = {}
            logger.info('llm_config is None. Initializing the Qwen3 config with default values.')

        self.vision_config = self._build_sub_config(vision_config, DINOv3ViTConfig)
        self.llm_config = self._build_sub_config(llm_config, Qwen3Config)
        # Alias of the very same object, not a copy.
        self.text_config = self.llm_config

        self.hidden_size = self.llm_config.hidden_size
        self.vision_hidden_size = self.vision_config.hidden_size

        # Explicitly supplied/serialized projector settings take precedence
        # over ``projector_depth``; the two hidden sizes are always derived
        # from the sub-configs so they cannot drift out of sync.
        projector_kwargs = {"depth": projector_depth}
        if isinstance(saved_projector, PretrainedConfig):
            saved_projector = saved_projector.to_dict()
        if isinstance(saved_projector, dict):
            for key in ("depth", "hidden_act", "bias"):
                if key in saved_projector:
                    projector_kwargs[key] = saved_projector[key]

        self.projector_config = ProjectorConfig(
            visual_hidden_size=self.vision_hidden_size,
            llm_hidden_size=self.hidden_size,
            **projector_kwargs,
        )

        self.regression_size = regression_size
        self.pixel_idx = pixel_idx

        # Always forced to False, regardless of what was passed in kwargs.
        self.tie_word_embeddings = False
        self.num_cls_register_tokens = 1 + self.vision_config.num_register_tokens

    @staticmethod
    def _build_sub_config(config, config_class):
        """Return ``config`` as an instance of ``config_class``.

        An instance of ``config_class`` is used directly, any other
        ``PretrainedConfig`` is converted through its ``to_dict()``, and a
        mapping is expanded into keyword arguments.
        """
        if isinstance(config, config_class):
            return config
        if isinstance(config, PretrainedConfig):
            config = config.to_dict()
        return config_class(**config)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default
        [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this
            configuration instance, with the nested configurations converted
            to dictionaries.
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['llm_config'] = self.llm_config.to_dict()
        # Same dict object as 'llm_config', mirroring the attribute alias.
        output['text_config'] = output['llm_config']
        output['projector_config'] = self.projector_config.to_dict()
        output['model_type'] = self.__class__.model_type
        return output