Instructions for using SinclairSchneider/dbrx-base-quantization-fixed with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
- Transformers
How to use SinclairSchneider/dbrx-base-quantization-fixed with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="SinclairSchneider/dbrx-base-quantization-fixed", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("SinclairSchneider/dbrx-base-quantization-fixed", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("SinclairSchneider/dbrx-base-quantization-fixed", trust_remote_code=True)
```
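Once the pipeline is loaded, generation works like any other `text-generation` pipeline. A minimal sketch (the prompt, sampling settings, and `device_map="auto"` are illustrative additions, not part of the snippet above; DBRX is a large MoE model, so loading it requires substantial GPU memory):

```python
# Minimal generation sketch; prompt and sampling settings are illustrative.
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="SinclairSchneider/dbrx-base-quantization-fixed",
    trust_remote_code=True,
    device_map="auto",  # assumption: shard the model across available GPUs
)
out = pipe("Once upon a time,", max_new_tokens=64, do_sample=True, temperature=0.5)
print(out[0]["generated_text"])
```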
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use SinclairSchneider/dbrx-base-quantization-fixed with vLLM:
Install from pip and serve the model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "SinclairSchneider/dbrx-base-quantization-fixed"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SinclairSchneider/dbrx-base-quantization-fixed",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
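Because the server exposes an OpenAI-compatible API, the same request can be issued from Python. A sketch using the `openai` client package (the base URL follows vLLM's default; the dummy API key is a placeholder, since vLLM does not check it unless configured to):

```python
# Sketch: query the vLLM server via its OpenAI-compatible endpoint.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
completion = client.completions.create(
    model="SinclairSchneider/dbrx-base-quantization-fixed",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```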
Use Docker
```bash
docker model run hf.co/SinclairSchneider/dbrx-base-quantization-fixed
```
- SGLang
How to use SinclairSchneider/dbrx-base-quantization-fixed with SGLang:
Install from pip and serve the model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "SinclairSchneider/dbrx-base-quantization-fixed" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SinclairSchneider/dbrx-base-quantization-fixed",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
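SGLang serves the same OpenAI-compatible completions endpoint, so the curl call above translates directly to Python. A sketch with the `requests` library (the payload mirrors the curl example):

```python
# Sketch: POST the completion request to the SGLang server.
import requests

resp = requests.post(
    "http://localhost:30000/v1/completions",
    json={
        "model": "SinclairSchneider/dbrx-base-quantization-fixed",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5,
    },
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```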
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "SinclairSchneider/dbrx-base-quantization-fixed" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "SinclairSchneider/dbrx-base-quantization-fixed",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use SinclairSchneider/dbrx-base-quantization-fixed with Docker Model Runner:
```bash
docker model run hf.co/SinclairSchneider/dbrx-base-quantization-fixed
```
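Run without further arguments, `docker model run` opens an interactive chat with the model; a one-shot prompt can also be passed on the command line (the prompt text here is illustrative):

```bash
# One-shot prompt instead of an interactive chat session.
docker model run hf.co/SinclairSchneider/dbrx-base-quantization-fixed "Once upon a time,"
```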
| """Dbrx configuration.""" | |
| from typing import Any, Optional | |
| from transformers.configuration_utils import PretrainedConfig | |
| from transformers.utils import logging | |
| logger = logging.get_logger(__name__) | |
| DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {} | |

class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    This is the configuration class to store the configuration of a
    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to `None`):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (`int`, *optional*, defaults to 1):
            For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base frequency for rope.
    """
    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        for k in ['model_type']:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f'Found unknown {kwargs=}')
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str,
                        **kwargs: Any) -> 'PretrainedConfig':
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
                                                  **kwargs)

        # Pull out the nested attention config when loading from a full Dbrx config.
        if config_dict.get('model_type') == 'dbrx':
            config_dict = config_dict['attn_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                +
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)

class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    This is the configuration class to store the configuration of a
    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
    """
    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {'name': 'silu'}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        for k in ['model_type']:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f'Found unknown {kwargs=}')
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str,
                        **kwargs: Any) -> 'PretrainedConfig':
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
                                                  **kwargs)

        # Pull out the nested FFN config when loading from a full Dbrx config.
        if config_dict.get('model_type') == 'dbrx':
            config_dict = config_dict['ffn_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                +
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)

class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    This is the configuration class to store the configuration of a
    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `input_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
            The aux loss factor for the total loss.

    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    model_type = 'dbrx'
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
        'max_position_embeddings': 'max_seq_len',
    }
    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        # Accept nested configs as dicts (e.g. from config.json) or as config objects.
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop('tie_word_embeddings', False)
        if tie_word_embeddings:
            raise ValueError(
                'tie_word_embeddings is not supported for Dbrx models.')

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
````
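As a quick illustration of how the nested configs behave, here is a small sketch (all values are arbitrary; `DbrxConfig` refers to the class defined in the file above, not the stock `transformers` export):

```python
# Sketch: dict-valued attn_config/ffn_config are promoted to config objects.
config = DbrxConfig(
    d_model=1024,
    n_heads=8,
    attn_config={'kv_n_heads': 2, 'rope_theta': 500000.0},
    ffn_config={'moe_num_experts': 8, 'moe_top_k': 2},
)
print(type(config.attn_config).__name__)  # DbrxAttentionConfig
print(config.ffn_config.moe_num_experts)  # 8
print(config.num_attention_heads)         # 8, mapped to n_heads via attribute_map
```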