Instructions to use aidiffuser/Kimi-K2.6-MLX-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use aidiffuser/Kimi-K2.6-MLX-4bit with MLX:

# Make sure mlx-lm is installed
# pip install --upgrade mlx-lm

# Generate text with mlx-lm
from mlx_lm import load, generate

# Load the model and its tokenizer by repository id.
model, tokenizer = load("aidiffuser/Kimi-K2.6-MLX-4bit")

# Wrap the raw prompt in a single user turn and render it with the
# tokenizer's chat template; add_generation_prompt=True is passed so the
# rendered prompt ends where the assistant reply should begin.
prompt = "Write a story about Einstein"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
)

# Run generation; the result is bound to `text`.
# NOTE(review): verbose=True presumably prints the output while generating -
# confirm against the mlx-lm documentation.
text = generate(model, tokenizer, prompt=prompt, verbose=True)

Notebooks
Google Colab
Kaggle
Local Apps
LM Studio

Pi new

How to use aidiffuser/Kimi-K2.6-MLX-4bit with Pi:

Start the MLX server

# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "aidiffuser/Kimi-K2.6-MLX-4bit"

Configure the model in Pi

# Install Pi:
npm install -g @mariozechner/pi-coding-agent
# Add to ~/.pi/agent/models.json:
{
  "providers": {
    "mlx-lm": {
      "baseUrl": "http://localhost:8080/v1",
      "api": "openai-completions",
      "apiKey": "none",
      "models": [
        {
          "id": "aidiffuser/Kimi-K2.6-MLX-4bit"
        }
      ]
    }
  }
}

Run Pi

# Start Pi in your project directory:
pi

Hermes Agent new

How to use aidiffuser/Kimi-K2.6-MLX-4bit with Hermes Agent:

Start the MLX server

# Install MLX LM:
uv tool install mlx-lm
# Start a local OpenAI-compatible server:
mlx_lm.server --model "aidiffuser/Kimi-K2.6-MLX-4bit"

Configure Hermes

# Install Hermes:
curl -fsSL https://hermes-agent.nousresearch.com/install.sh | bash
hermes setup
# Point Hermes at the local server:
hermes config set model.provider custom
hermes config set model.base_url http://127.0.0.1:8080/v1
hermes config set model.default aidiffuser/Kimi-K2.6-MLX-4bit

Run Hermes

hermes

MLX LM

How to use aidiffuser/Kimi-K2.6-MLX-4bit with MLX LM:

Generate or start a chat session

# Install MLX LM
uv tool install mlx-lm
# Interactive chat REPL
mlx_lm.chat --model "aidiffuser/Kimi-K2.6-MLX-4bit"

Run an OpenAI-compatible server

# Install MLX LM
uv tool install mlx-lm
# Start the server
mlx_lm.server --model "aidiffuser/Kimi-K2.6-MLX-4bit"
# Calling the OpenAI-compatible server with curl
curl -X POST "http://localhost:8080/v1/chat/completions" \
   -H "Content-Type: application/json" \
   --data '{
     "model": "aidiffuser/Kimi-K2.6-MLX-4bit",
     "messages": [
       {"role": "user", "content": "Hello"}
     ]
   }'

Kimi-K2.6-MLX-4bit / kimi_k25_processor.py

aidiffuser

Upload folder using huggingface_hub

d00de99 verified 25 days ago

raw

history blame contribute delete

6.91 kB

	from transformers.feature_extraction_utils import BatchFeature
	from transformers.processing_utils import ProcessorMixin
	from transformers.utils import logging

	logger = logging.get_logger(__name__)


	class KimiK25Processor(ProcessorMixin):
	    r"""
	    Constructs a KimiK25 processor which wraps a KimiK25 image processor and a tokenizer into a single processor.

	    [`KimiK25Processor`] offers all the functionalities of [`KimiK25ImageProcessor`] and [`TikTokenTokenizer`]. See the
	    [`~KimiK25Processor.__call__`] and [`~KimiK25Processor.decode`] for more information.

	    Args:
	        image_processor ([`KimiK25ImageProcessor`], optional):
	            The image processor is a required input.
	        tokenizer ([`TikTokenTokenizer`], optional):
	            The tokenizer is a required input.
	        chat_template (`str`, optional): A Jinja template which will be used to convert lists of messages
	            in a chat into a tokenizable string.
	    """

	    attributes = ["image_processor", "tokenizer"]
	    valid_kwargs = ["chat_template"]
	    image_processor_class = "AutoImageProcessor"
	    tokenizer_class = "AutoTokenizer"

	    def __init__(
	        self,
	        image_processor=None,
	        tokenizer=None,
	        chat_template=None,
	        **kwargs,
	    ):
	        """
	        Store the wrapped components on the processor.

	        Any extra keyword arguments are accepted and ignored; only
	        `image_processor`, `tokenizer` and `chat_template` are forwarded to
	        `ProcessorMixin.__init__`.
	        """
	        super().__init__(image_processor,
	                         tokenizer,
	                         chat_template=chat_template)
	        # Alias used by the media handling methods below; the same object
	        # handles both images and video chunks.
	        self.media_processor = image_processor
	        # A special temporal placeholder to be replaced by actual video placeholders
	        self.video_placeholder = "<|kimi_k25_video_placeholder|>"

	    def update_raw_text(self, text: str, video_prompts: list[str]) -> str:
	        """
	        Replace each video placeholder in `text` with its video chunk prompt.

	        Args:
	            text: Text that may contain `self.video_placeholder` markers.
	            video_prompts: One replacement string per placeholder, in order
	                of appearance.

	        Returns:
	            `text` unchanged when it contains no placeholder; otherwise the
	            text with the i-th placeholder replaced by `video_prompts[i]`.

	        Raises:
	            ValueError: If the number of placeholders in `text` differs from
	                `len(video_prompts)`.
	        """
	        video_count = text.count(self.video_placeholder)
	        if video_count == 0:
	            return text
	        if video_count != len(video_prompts):
	            raise ValueError(
	                f"text contains {video_count} video placeholder(s) but "
	                f"{len(video_prompts)} video prompt(s) were provided")

	        # Splitting on the placeholder yields video_count + 1 segments;
	        # interleave them with the prompts and keep the trailing segment.
	        text_parts = text.split(self.video_placeholder)
	        pieces = []
	        for segment, video_prompt in zip(text_parts, video_prompts):
	            pieces.append(segment)
	            pieces.append(video_prompt)
	        pieces.append(text_parts[-1])
	        return "".join(pieces)

	    def preprocess_medias(
	            self, medias: list[dict]) -> tuple[list[dict], list[str]]:
	        """
	        Expand video entries into chunks and collect their prompts.

	        Image entries are passed through unchanged. Each video entry is split
	        with `self.media_processor.split_video_chunks`; the chunks replace the
	        video in the output list and their `'prompt'` values are concatenated
	        into a single prompt string for that video.

	        Args:
	            medias: Media dicts, each with a `'type'` of `'image'` or
	                `'video'` (video entries also carry a `'video'` value).

	        Returns:
	            A tuple `(updated_medias, video_prompts)`: the flattened media
	            list and one prompt string per input video, in input order.

	        Raises:
	            ValueError: If a media entry has an unsupported `'type'`.
	        """
	        updated_medias = []
	        video_prompts = []
	        for media in medias:
	            if media['type'] == 'image':
	                updated_medias.append(media)
	            elif media['type'] == 'video':
	                video_chunks = self.media_processor.split_video_chunks(
	                    media['video'])
	                updated_medias.extend(video_chunks)
	                video_prompts.append("".join(
	                    [vc['prompt'] for vc in video_chunks]))
	            else:
	                raise ValueError(f"unsupported media type: {media['type']}")
	        return updated_medias, video_prompts

	    def __call__(self,
	                 messages: list[dict] = None,
	                 medias: list[dict] = None,
	                 text: str = None,
	                 return_tensors: str = "pt",
	                 **kwargs) -> BatchFeature:
	        """
	        Process multimodal inputs for Kimi-K2.5 model.

	        This processor accepts ordered messages and extracts both media and text in a single pass.
	        text will be automatically updated if video input detected in messages

	        Args:
	            messages: List of message dicts with 'role' and 'content' fields.
	                If provided, medias and text will be extracted automatically.
	            medias: Pre-extracted list of media dicts. If None, extracted from messages.
	            text: Pre-formatted text string. If None, generated via apply_chat_template.
	            return_tensors: Format of returned tensors ('pt', 'np', 'tf'). Default: 'pt'.
	            **kwargs: Additional arguments passed to tokenizer.apply_chat_template
	                (only used when `text` is None).

	        Returns:
	            BatchFeature with fields: input_ids, attention_mask, pixel_values, grid_thws.

	        Raises:
	            ValueError: If neither `messages` nor both `medias` and `text`
	                are provided.
	        """
	        if messages is None and (medias is None or text is None):
	            raise ValueError(
	                "Provide either 'messages' or both 'medias' and 'text'")

	        # Past the guard above, anything that is still None can be derived
	        # from `messages`, so a single pipeline covers every valid input mix.
	        if medias is None:
	            medias = self._extract_medias_from_messages(messages)
	        updated_medias, video_prompts = self.preprocess_medias(medias)
	        preprocessed = self.media_processor.preprocess(
	            updated_medias, return_tensors=return_tensors)

	        # Generate text if not provided
	        if text is None:
	            text = self.tokenizer.apply_chat_template(messages, **kwargs)

	        text = self.update_raw_text(text, video_prompts)

	        text_inputs = self.tokenizer(text, return_tensors=return_tensors)
	        # Merge tokenizer outputs and media features into one flat mapping.
	        return BatchFeature(data={**text_inputs, **preprocessed.data})

	    @staticmethod
	    def _extract_medias_from_messages(messages: list[dict]) -> list[dict]:
	        """
	        Extract media items from messages in a single pass.

	        Only messages whose role is 'user' and whose content is non-empty are
	        inspected; content parts that are not dicts are skipped. Video parts
	        are read from `part['video_url']['url']` and image parts from
	        `part['image_url']`.

	        Kept as internal method since external callers should use __call__.

	        Args:
	            messages: List of message dicts with 'role' and 'content' fields.

	        Returns:
	            Media dicts in message order, each with a 'type' of 'video' or
	            'image'.
	        """
	        medias = []
	        for msg in messages:
	            if msg['role'] != 'user' or not msg.get('content'):
	                continue

	            for content_part in msg['content']:
	                if not isinstance(content_part, dict):
	                    continue

	                content_type = content_part.get('type')
	                if content_type in ('video_url', 'video'):
	                    medias.append({
	                        'type': 'video',
	                        'video': content_part['video_url']['url'],
	                        'first_frame_timestamp': 0.0
	                    })
	                elif content_type in ('image_url', 'image'):
	                    medias.append({
	                        'type': 'image',
	                        'image': content_part['image_url'],
	                    })
	        return medias

	    def apply_chat_template(self, messages, **kwargs):
	        """Forward to the tokenizer's `apply_chat_template`."""
	        return self.tokenizer.apply_chat_template(messages, **kwargs)

	    def batch_decode(self, *args, **kwargs):
	        """Forward all arguments to the tokenizer's `batch_decode`."""
	        return self.tokenizer.batch_decode(*args, **kwargs)

	    def decode(self, *args, **kwargs):
	        """Forward all arguments to the tokenizer's `decode`."""
	        return self.tokenizer.decode(*args, **kwargs)

	    @property
	    def model_input_names(self):
	        """Names of the fields this processor produces for the model."""
	        return ['input_ids', 'attention_mask', 'pixel_values', 'grid_thws']