Instructions to use apple/OpenELM-450M-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use apple/OpenELM-450M-Instruct with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="apple/OpenELM-450M-Instruct", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("apple/OpenELM-450M-Instruct", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use apple/OpenELM-450M-Instruct with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "apple/OpenELM-450M-Instruct"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "apple/OpenELM-450M-Instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/apple/OpenELM-450M-Instruct
- SGLang
How to use apple/OpenELM-450M-Instruct with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "apple/OpenELM-450M-Instruct" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "apple/OpenELM-450M-Instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "apple/OpenELM-450M-Instruct" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "apple/OpenELM-450M-Instruct",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use apple/OpenELM-450M-Instruct with Docker Model Runner:
docker model run hf.co/apple/OpenELM-450M-Instruct
| # | |
| # For licensing see accompanying LICENSE file. | |
| # Copyright (C) 2024 Apple Inc. All Rights Reserved. | |
| # | |
| """Implements HF OpenELMConfig based on PretrainedConfig""" | |
| from numbers import Number | |
| from typing import List, Optional, Union | |
| import numpy as np | |
| from transformers import PretrainedConfig | |
def make_divisible(
    v: Union[float, int],
    divisor: Optional[int] = 8,
    min_value: Optional[Union[float, int]] = None,
) -> Union[float, int]:
    """
    Snap a value onto a multiple of ``divisor``.

    Adapted from the TensorFlow MobileNet implementation, where it keeps every
    layer's channel count divisible by the divisor:
    https://github.com/tensorflow/models/blob/2cfc99eff5e5eb729c6793d2f3d03aa1c9be2b15/research/slim/nets/mobilenet/mobilenet.py#L62

    Args:
        v: value to adjust.
        divisor: the result is built from multiples of this number (default 8).
        min_value: lower bound for the result; falls back to ``divisor`` when
            it is ``None``.

    Returns:
        The larger of ``min_value`` and ``v`` rounded onto a multiple of
        ``divisor``, bumped up by one extra ``divisor`` if that result would
        be below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    snapped = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, snapped)
    # Rounding down must never cost more than 10% of the requested value.
    if result < 0.9 * v:
        return result + divisor
    return result
def compute_heads(model_dim: int, head_dim: int) -> int:
    """Derive how many attention heads fit into the model dimension.

    Args:
        model_dim: Model dimension.
        head_dim: Head dimension.

    Returns:
        The number of heads in multi-head attention, i.e. ``model_dim // head_dim``.

    Raises:
        ValueError: if model dimension is not divisible by head dimension.
    """
    num_heads, leftover = divmod(model_dim, head_dim)
    if leftover != 0:
        raise ValueError(
            f"Model dimension should be divisible by head dimension. Got: {model_dim} and {head_dim}."
        )
    return num_heads
# Settings that are identical across every predefined OpenELM size.
_OPENELM_SHARED_SETTINGS = {
    "num_gqa_groups": 4,
    "normalize_qk_projections": True,
    "share_input_output_layers": True,
    # Vary the FFN and QKV multipliers to create variable FFN and attention layers respectively.
    "ffn_multipliers": (0.5, 4.0),
    "qkv_multipliers": (0.5, 1.0),
}

# Predefined architectures keyed by model name; the sizes differ only in
# depth, model width and head width.
OpenELM_CONFIGS = {
    name: {
        "num_transformer_layers": num_layers,
        "model_dim": model_dim,
        "head_dim": head_dim,
        **_OPENELM_SHARED_SETTINGS,
    }
    for name, num_layers, model_dim, head_dim in (
        ("OpenELM-270M", 16, 1280, 64),
        ("OpenELM-450M", 20, 1536, 64),
        ("OpenELM-1_1B", 28, 2048, 64),
        ("OpenELM-3B", 36, 3072, 128),
    )
}
class OpenELMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OpenELMModel`]. It is used to instantiate an OpenELM model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the OpenELM model.
        max_context_length (`int`, *optional*, defaults to 2048):
            Maximum number of input tokens.
        num_transformer_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer decoder.
        model_dim (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension.
        qkv_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 1.0):
            If the qkv_multipliers is a Number, then all attention layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the qkv_multipliers is a List (or tuple) of exactly two Numbers, then each attention layer has
            different latent dimensions assuming qkv_multipliers[0] != qkv_multipliers[1]. This results in
            variable allocation of parameters in attention layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
        num_query_heads (`Union[int, None]`, *optional*, defaults to None):
            The number of query heads. When `None`, `compute_heads(model_dim=model_dim, head_dim=head_dim)` is
            evaluated. In either case the stored `num_query_heads` attribute is replaced by a per-layer list
            derived from `qkv_multipliers` in `__post_init__`.
        num_gqa_groups (`int`, *optional*, defaults to 1):
            This variable allows to switch between multi-head attention, group query attention, and multi-query attention.
            When num_gqa_groups == 1, then it is multi-head attention.
            When 1 < num_gqa_groups < num_heads and num_heads is divisible by num_gqa_groups, then it is group query attention
            When num_gqa_groups == num_heads, then it is multi-query attention
        ffn_multipliers (`Union[Number, List[Number]]`, *optional*, defaults to 4.0):
            Feed-forward network (FFN) multipliers.
            If the ffn_multipliers is a Number, then all FFN layers have the same latent dimensions,
            resulting in uniform allocation of parameters.
            If the ffn_multipliers is a List (or tuple) of two Numbers, then each FFN layer has different latent
            dimensions assuming ffn_multipliers[0] != ffn_multipliers[1]. This results in variable allocation of
            parameters in FFN layer.
            This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            A List with exactly `num_transformer_layers` entries is also accepted and used as-is.
        ffn_with_glu (`bool`, *optional*, defaults to True):
            Whether to use FFN with Gated Linear Unit (GLU)
        ffn_dim_divisor (`int`, *optional*, defaults to 256):
            The ffn layer dimension divisor.
        activation_fn_name (`str` or `function`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the decoder.
        normalization_layer_name (`str` or `function`, *optional*, defaults to `"rms_norm"`):
            Type of normalization layer.
        normalize_qk_projections (`bool`, *optional*, defaults to False):
            Whether to normalize queries and keys after projections
        share_input_output_layers (`bool`, *optional*, defaults to False):
            Whether to share the embedding between input and output linear layer
        rope_freq_constant (`int`, *optional*, defaults to 10000):
            The base period of the RoPE embeddings.
        rope_max_length (`int`, *optional*, defaults to 4096):
            Maximum length for the RoPE embeddings. The default is twice the default max_context_length.
            This allows flexibility in token lengths during training or fine-tuning.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
    """

    model_type = "openelm"

    def __init__(
        self,
        vocab_size: int = 32000,
        max_context_length: int = 2048,
        num_transformer_layers: int = 12,
        model_dim: int = 2048,
        head_dim: int = 128,
        qkv_multipliers: Union[Number, List[Number]] = 1.0,
        num_query_heads: Union[int, None] = None,
        num_gqa_groups: int = 1,
        ffn_multipliers: Union[Number, List[Number]] = 4.0,
        ffn_with_glu: bool = True,
        ffn_dim_divisor: int = 256,
        activation_fn_name: str = "swish",
        normalization_layer_name: str = "rms_norm",
        normalize_qk_projections: bool = False,
        share_input_output_layers: bool = False,
        rope_freq_constant: int = 10000,
        rope_max_length: int = 4096,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ) -> None:
        """Store the hyper-parameters and derive the per-layer head/FFN settings.

        Raises:
            ValueError: if `num_query_heads` is None and `model_dim` is not
                divisible by `head_dim` (raised by `compute_heads`).
            NotImplementedError: if `qkv_multipliers` or `ffn_multipliers` has
                an unsupported type or length (see `__post_init__`).
        """
        self.vocab_size = vocab_size
        self.max_context_length = max_context_length
        self.num_transformer_layers = num_transformer_layers
        self.model_dim = model_dim
        self.head_dim = head_dim
        self.qkv_multipliers = qkv_multipliers
        self.num_gqa_groups = num_gqa_groups
        self.ffn_multipliers = ffn_multipliers
        self.ffn_with_glu = ffn_with_glu
        self.ffn_dim_divisor = ffn_dim_divisor
        self.activation_fn_name = activation_fn_name
        self.normalization_layer_name = normalization_layer_name
        self.normalize_qk_projections = normalize_qk_projections
        self.share_input_output_layers = share_input_output_layers
        self.rope_freq_constant = rope_freq_constant
        self.rope_max_length = rope_max_length
        # Provisional value only: __post_init__ replaces it with a per-layer list.
        self.num_query_heads = (
            compute_heads(model_dim=model_dim, head_dim=head_dim)
            if num_query_heads is None
            else num_query_heads
        )
        self.initializer_range = initializer_range

        # The derived per-layer attributes are computed before handing the
        # remaining arguments to PretrainedConfig.
        self.__post_init__()
        super().__init__(
            use_cache=use_cache,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

    @staticmethod
    def _layerwise_multipliers(
        start: Number, stop: Number, num_layers: int
    ) -> List[float]:
        """Linearly interpolate one multiplier per layer, rounded to 2 decimals."""
        return [
            round(v, 2)
            for v in np.linspace(start, stop, num=num_layers, dtype=float)
        ]

    def __post_init__(self) -> None:
        """Derive per-layer head counts and FFN multipliers.

        Sets `num_query_heads` and `num_kv_heads` to lists with one entry per
        transformer layer, and expands `ffn_multipliers` into a per-layer list.

        Raises:
            NotImplementedError: if `qkv_multipliers` is neither a number nor a
                two-element list/tuple, or if `ffn_multipliers` is neither a
                number nor a list/tuple.
            AssertionError: if `ffn_multipliers` is a list/tuple whose length is
                neither 2 nor `num_transformer_layers`, or if a layer's query
                head count is not divisible by its key/value head count.
        """
        # NOTE(review): None is tolerated here, but the num_kv_heads computation
        # below divides by num_gqa_groups and would raise TypeError for None --
        # confirm whether None is meant to be supported.
        if self.num_gqa_groups is not None:
            head_multiple_of = self.num_gqa_groups
        else:
            head_multiple_of = 2

        # Every query dimension must be a multiple of this so that it splits
        # evenly into heads and those heads split evenly into GQA groups.
        qkv_divisor = self.head_dim * head_multiple_of

        if isinstance(self.qkv_multipliers, Number):
            # All attention layers have the same latent dimensions, resulting in uniform allocation of parameters.
            qkv_dim = make_divisible(
                self.model_dim * self.qkv_multipliers,
                divisor=qkv_divisor,
            )
            query_dims = [int(qkv_dim)] * self.num_transformer_layers
        elif (
            isinstance(self.qkv_multipliers, (tuple, list))
            and len(self.qkv_multipliers) == 2
        ):
            # Each attention layer has different latent dimensions assuming qkv_multipliers[0] != qkv_multipliers[1].
            # This results in variable allocation of parameters in attention layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            qkv_multipliers = self._layerwise_multipliers(
                self.qkv_multipliers[0],
                self.qkv_multipliers[1],
                self.num_transformer_layers,
            )
            # Make sure that scaled model dimension is divisible by scaled head dimension.
            query_dims = [
                int(make_divisible(self.model_dim * m, divisor=qkv_divisor))
                for m in qkv_multipliers
            ]
        else:
            # Report the attribute: the local `qkv_multipliers` is not bound on
            # this path, so referencing it would mask this error.
            raise NotImplementedError(
                f"QKV multipliers should be a single number or a list containing exactly two numbers. Got: {self.qkv_multipliers}."
            )

        # compute the number of query, key, and value heads
        # For multi-head and multi-query attention, the number of heads for query, key, and value are the same.
        # For group query attention, the number of key and value heads are the same.
        self.num_query_heads = [
            int(compute_heads(q_dim, self.head_dim)) for q_dim in query_dims
        ]
        self.num_kv_heads = [
            q_heads // self.num_gqa_groups for q_heads in self.num_query_heads
        ]

        # Feed-forward network (FFN) multipliers
        if isinstance(self.ffn_multipliers, Number):
            # All FFN layers have the same latent dimensions, resulting in uniform allocation of parameters.
            self.ffn_multipliers = [self.ffn_multipliers] * self.num_transformer_layers
        elif isinstance(self.ffn_multipliers, (tuple, list)):
            # Each FFN layer has different latent dimensions assuming ffn_multipliers[0] != ffn_multipliers[1].
            # This results in variable allocation of parameters in FFN layer.
            # This scaling is known as layer-wise or block-wise scaling: https://arxiv.org/abs/2008.00623
            if len(self.ffn_multipliers) == 2:
                self.ffn_multipliers = self._layerwise_multipliers(
                    self.ffn_multipliers[0],
                    self.ffn_multipliers[1],
                    self.num_transformer_layers,
                )
            else:
                # An already expanded per-layer list is accepted unchanged.
                assert (
                    len(self.ffn_multipliers) == self.num_transformer_layers
                ), f"{len(self.ffn_multipliers)=}!={self.num_transformer_layers=}"
        else:
            raise NotImplementedError(
                f"FFN multipliers should be a single number or a list containing exactly two numbers. Got: {self.ffn_multipliers}."
            )

        # check num_query_heads divisible by num_kv_heads for every layer
        for q_heads, kv_heads in zip(self.num_query_heads, self.num_kv_heads):
            assert q_heads % kv_heads == 0