Instructions to use microsoft/Phi-4-reasoning-plus with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use microsoft/Phi-4-reasoning-plus with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="microsoft/Phi-4-reasoning-plus")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-reasoning-plus")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-reasoning-plus")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use microsoft/Phi-4-reasoning-plus with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "microsoft/Phi-4-reasoning-plus"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "microsoft/Phi-4-reasoning-plus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/microsoft/Phi-4-reasoning-plus
- SGLang
How to use microsoft/Phi-4-reasoning-plus with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "microsoft/Phi-4-reasoning-plus" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "microsoft/Phi-4-reasoning-plus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "microsoft/Phi-4-reasoning-plus" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "microsoft/Phi-4-reasoning-plus",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use microsoft/Phi-4-reasoning-plus with Docker Model Runner:
docker model run hf.co/microsoft/Phi-4-reasoning-plus
Grok3 had a few things to add
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import re
import json
import gzip
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, validator
import uvicorn
import requests
import time
import threading
# Set up logging: INFO level, each record formatted as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create the FastAPI application object.
app = FastAPI()
class DataFormat:
    """DSL for encoding/decoding data for efficient transmission.

    A format is a name plus a list of field names. ``encode`` keeps only the
    listed keys of a dict and serialises them as gzip-compressed JSON;
    ``decode`` reverses the compression and the JSON parsing.
    """

    def __init__(self, name: str, fields: list):
        """Store the format's name and the field names it carries.

        Args:
            name: Label for this format.
            fields: Keys that ``encode`` copies from its input dict.
        """
        self.name = name
        self.fields = fields

    def encode(self, data: dict) -> bytes:
        """Encode data as compressed JSON.

        Only keys listed in ``self.fields`` and present in ``data`` are
        serialised; any other keys are dropped.

        Args:
            data: Mapping to serialise.

        Returns:
            The gzip-compressed, UTF-8 encoded JSON document.

        Raises:
            ValueError: If serialisation or compression fails; the original
                exception is attached as ``__cause__``.
        """
        try:
            selected = {key: data[key] for key in self.fields if key in data}
            return gzip.compress(json.dumps(selected).encode('utf-8'))
        except Exception as e:
            logging.error(f"Encoding error: {str(e)}")
            raise ValueError(f"Failed to encode data: {str(e)}") from e

    def decode(self, encoded_data: bytes) -> dict:
        """Decode compressed JSON data.

        The decoded document is returned as-is; it is not filtered against
        ``self.fields``.

        Args:
            encoded_data: Gzip-compressed, UTF-8 encoded JSON bytes.

        Returns:
            The parsed JSON value.

        Raises:
            ValueError: If decompression, UTF-8 decoding or JSON parsing
                fails; the original exception is attached as ``__cause__``.
        """
        try:
            return json.loads(gzip.decompress(encoded_data).decode('utf-8'))
        except Exception as e:
            logging.error(f"Decoding error: {str(e)}")
            raise ValueError(f"Failed to decode data: {str(e)}") from e
class MultilingualGenerator:
"""Class to generate and transmit text in multiple languages."""
def __init__(self, model_name: str = "google/mt5-small"):
    """Initialize with a multilingual Seq2Seq model.

    Selects CUDA when ``torch.cuda.is_available()`` and CPU otherwise,
    creates the ``DataFormat`` for prompt/language/response records, then
    loads the tokenizer and model and moves the model to the chosen device.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Raises:
        ValueError: If the tokenizer or model cannot be loaded; the
            original exception is attached as ``__cause__``.
    """
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.data_format = DataFormat("TextGeneration", ["prompt", "language", "response"])
    try:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
    except Exception as e:
        logging.error(f"Failed to load model or tokenizer: {str(e)}")
        raise ValueError(f"Failed to load model or tokenizer: {str(e)}") from e
def preprocess_text(self, text: str, language: str) -> str:
    """Apply language-specific preprocessing.

    - Persian: Arabic kaf/yeh are mapped to their Persian forms.
    - Arabic: alef variants (U+0622, U+0623, U+0625) are unified to U+0627.
    - Persian, Hebrew, Arabic, Turkish: the text is stripped and every run
      of whitespace is collapsed to a single space.
    - English: the text is stripped only.
    - Any other language: the text is returned unchanged.

    Args:
        text: Text to normalise.
        language: Language name selecting the rules above.

    Returns:
        The normalised text. Preprocessing is best-effort: on any error the
        problem is logged and the text is returned as it stood at that point.
    """
    try:
        if language == "Persian":
            # Arabic kaf -> Persian kaf, Arabic yeh -> Persian yeh.
            text = text.translate({0x0643: 0x06A9, 0x064A: 0x06CC})
        elif language == "Arabic":
            # Unify alef with madda / hamza above / hamza below to bare alef.
            text = text.translate({0x0622: 0x0627, 0x0623: 0x0627, 0x0625: 0x0627})

        if language in ("Persian", "Hebrew", "Arabic", "Turkish"):
            # Collapse whitespace runs to ONE space; replacing them with the
            # empty string would glue adjacent words together.
            text = re.sub(r'\s+', ' ', text.strip())
        elif language == "English":
            text = text.strip()
        return text
    except Exception as e:
        logging.error(f"Preprocessing error for {language}: {str(e)}")
        return text
def generate_text(self, prompt: str, language: str, max_new_tokens: int = 100) -> dict:
    """Generate text for a given prompt and language.

    The prompt is normalised with ``preprocess_text``, prefixed with
    ``"<language>: "``, tokenized (truncated to 512 tokens) and decoded
    with 5-beam search.

    Args:
        prompt: Input text; must not be empty or whitespace-only.
        language: Language name, used for preprocessing and as the prefix.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict with keys ``"prompt"`` (the preprocessed prompt),
        ``"language"`` and ``"response"``. Inference failures are not
        raised: they are logged and reported as ``"Error: <message>"`` in
        ``"response"``.

    Raises:
        ValueError: If the prompt is empty, or empty once surrounding
            whitespace is ignored.
    """
    if not prompt:
        raise ValueError("Prompt cannot be empty")
    prompt = self.preprocess_text(prompt, language)
    # A whitespace-only prompt is truthy above but carries no content;
    # reject it here instead of sending an empty prompt to the model.
    if not prompt.strip():
        raise ValueError("Prompt cannot be empty")
    try:
        inputs = self.tokenizer(
            f"{language}: {prompt}",
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)
        with torch.no_grad():
            # No `temperature` here: it only affects sampling, and this call
            # uses beam search without `do_sample=True`, so the value would
            # be ignored and only produce a warning.
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=2
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"prompt": prompt, "language": language, "response": response}
    except RuntimeError as e:
        logging.error(f"Model inference error for {language}: {str(e)}")
        return {"prompt": prompt, "language": language, "response": f"Error: {str(e)}"}
    except Exception as e:
        logging.error(f"Unexpected error generating text for {language}: {str(e)}")
        return {"prompt": prompt, "language": language, "response": f"Error: {str(e)}"}
def transmit_response(self