Instructions to use microsoft/Phi-4-reasoning-plus with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use microsoft/Phi-4-reasoning-plus with Transformers:

# High-level route: a text-generation pipeline wraps model loading and inference
from transformers import pipeline

pipe = pipeline(task="text-generation", model="microsoft/Phi-4-reasoning-plus")
# A single-turn chat: one user message
messages = [{"role": "user", "content": "Who are you?"}]
pipe(messages)

# Lower-level route: load the tokenizer and the model yourself
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-reasoning-plus")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-reasoning-plus")
messages = [{"role": "user", "content": "Who are you?"}]

# Render the chat through the tokenizer's chat template, then move the
# resulting tensors onto the same device as the model.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# The output sequence starts with the prompt tokens; slice them off so only
# the continuation is decoded and printed.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use microsoft/Phi-4-reasoning-plus with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server (the curl call below expects it on localhost:8000):
vllm serve "microsoft/Phi-4-reasoning-plus"
# Call the server using curl (OpenAI-compatible API):
# POSTs a single-turn chat request as JSON to /v1/chat/completions.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "microsoft/Phi-4-reasoning-plus",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

# Run the model from hf.co with the `docker model run` command:
docker model run hf.co/microsoft/Phi-4-reasoning-plus

SGLang

How to use microsoft/Phi-4-reasoning-plus with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server, bound to host 0.0.0.0 on port 30000:
python3 -m sglang.launch_server \
    --model-path "microsoft/Phi-4-reasoning-plus" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# POSTs a single-turn chat request as JSON to /v1/chat/completions on port 30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "microsoft/Phi-4-reasoning-plus",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image:
#   - publishes container port 30000 on host port 30000
#   - mounts the local Hugging Face cache at /root/.cache/huggingface
#   - passes HF_TOKEN into the container (replace <secret> with your own token)
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "microsoft/Phi-4-reasoning-plus" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
# POSTs a single-turn chat request as JSON to /v1/chat/completions on port 30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "microsoft/Phi-4-reasoning-plus",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use microsoft/Phi-4-reasoning-plus with Docker Model Runner:
```
# Run the model from hf.co with Docker Model Runner:
docker model run hf.co/microsoft/Phi-4-reasoning-plus
```

Grok3 had a few things to add

by ebearden - opened May 4, 2025

Discussion

ebearden

May 4, 2025

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import re
import json
import gzip
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, validator
import uvicorn
import requests
import time
import threading

# Set up logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Create a FastAPI application

app = FastAPI()

class DataFormat:
    """DSL for encoding/decoding data for efficient transmission.

    A format has a name and a list of field names. Encoding keeps only the
    listed fields, serializes them as JSON, and gzip-compresses the UTF-8
    bytes; decoding reverses those steps.
    """

    # The constructor must be spelled ``__init__`` (not ``init``) so that
    # ``DataFormat(name, fields)`` actually stores the arguments.
    def __init__(self, name: str, fields: list):
        """Store the format name and the field names kept by ``encode``."""
        self.name = name
        self.fields = fields

    def encode(self, data: dict) -> bytes:
        """Encode data as compressed JSON.

        Only keys listed in ``self.fields`` and present in ``data`` are
        serialized; every other key is dropped.

        Raises:
            ValueError: if the data cannot be serialized or compressed.
        """
        try:
            selected = {k: data[k] for k in self.fields if k in data}
            return gzip.compress(json.dumps(selected).encode('utf-8'))
        except Exception as e:
            logging.error(f"Encoding error: {str(e)}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to encode data: {str(e)}") from e

    def decode(self, encoded_data: bytes) -> dict:
        """Decode compressed JSON data.

        Raises:
            ValueError: if the payload cannot be decompressed, decoded as
                UTF-8, or parsed as JSON.
        """
        try:
            return json.loads(gzip.decompress(encoded_data).decode('utf-8'))
        except Exception as e:
            logging.error(f"Decoding error: {str(e)}")
            raise ValueError(f"Failed to decode data: {str(e)}") from e

class MultilingualGenerator:
"""Class to generate and transmit text in multiple languages."""
def __init__(self, model_name: str = "google/mt5-small"):
    """Initialize with a multilingual Seq2Seq model.

    Selects ``'cuda'`` when ``torch.cuda.is_available()`` and ``'cpu'``
    otherwise, creates the ``DataFormat`` used for prompt/language/response
    payloads, then loads the tokenizer and model for ``model_name`` and
    moves the model to the selected device.

    Note: this must be named ``__init__`` (not ``init``) for Python to run
    it when the class is instantiated.

    Raises:
        ValueError: if the tokenizer or model cannot be loaded.
    """
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.data_format = DataFormat("TextGeneration", ["prompt", "language", "response"])
    try:
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
    except Exception as e:
        logging.error(f"Failed to load model or tokenizer: {str(e)}")
        # Chain the cause so the underlying loading error stays visible.
        raise ValueError(f"Failed to load model or tokenizer: {str(e)}") from e

def preprocess_text(self, text: str, language: str) -> str:
    """Apply language-specific preprocessing.

    - Persian: Arabic kaf (U+0643) and yeh (U+064A) are replaced by their
      Persian forms (U+06A9, U+06CC).
    - Arabic: alef variants (U+0622, U+0623, U+0625) are unified to bare
      alef (U+0627).
    - Persian, Hebrew, Arabic, Turkish: the text is trimmed and every run of
      whitespace is collapsed to a single space.
    - English: the text is only trimmed.
    - Any other language: the text is returned unchanged.

    This is best-effort: if preprocessing raises, the error is logged and
    the input is returned as it was received.
    """
    try:
        if language == "English":
            return text.strip()
        if language not in ("Persian", "Hebrew", "Arabic", "Turkish"):
            return text

        normalized = text
        if language == "Persian":
            normalized = re.sub(r'[\u0643]', '\u06A9', normalized)  # Arabic kaf to Persian kaf
            normalized = re.sub(r'[\u064A]', '\u06CC', normalized)  # Arabic yeh to Persian yeh
        elif language == "Arabic":
            normalized = re.sub(r'[\u0622\u0623\u0625]', '\u0627', normalized)  # Unify alef

        # Collapse whitespace runs to ONE space. Replacing them with the
        # empty string would glue adjacent words together and corrupt the
        # prompt handed to the model.
        return re.sub(r'\s+', ' ', normalized.strip())
    except Exception as e:
        logging.error(f"Preprocessing error for {language}: {str(e)}")
        return text

def generate_text(self, prompt: str, language: str, max_new_tokens: int = 100) -> dict:
    """Generate text for a given prompt and language.

    The prompt is passed through ``self.preprocess_text`` and prefixed with
    ``"<language>: "`` before tokenization (truncated to 512 tokens). Text is
    generated with 5-beam search under ``torch.no_grad()``.

    Args:
        prompt: Text to generate from; must not be empty or blank.
        language: Language label used for preprocessing and as the prefix.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict with keys ``"prompt"`` (the preprocessed prompt),
        ``"language"`` and ``"response"``. If generation fails, the failure
        is logged and ``"response"`` holds ``"Error: <message>"`` instead of
        generated text.

    Raises:
        ValueError: if the prompt is empty, either as passed in or after
            preprocessing.
    """
    if not prompt:
        raise ValueError("Prompt cannot be empty")
    prompt = self.preprocess_text(prompt, language)
    # A whitespace-only prompt is truthy above but may come back empty from
    # preprocessing; reject it rather than sending a bare "<language>: " to
    # the model.
    if not prompt:
        raise ValueError("Prompt cannot be empty")
    try:
        inputs = self.tokenizer(
            f"{language}: {prompt}",
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(self.device)
        with torch.no_grad():
            # No ``temperature`` here: it only affects sampling-based
            # decoding, and this call uses beam search without sampling.
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=5,
                early_stopping=True,
                no_repeat_ngram_size=2
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"prompt": prompt, "language": language, "response": response}
    except Exception as e:
        # Both failure kinds produce the same payload; only the log message
        # distinguishes inference (RuntimeError) from anything else.
        if isinstance(e, RuntimeError):
            logging.error(f"Model inference error for {language}: {str(e)}")
        else:
            logging.error(f"Unexpected error generating text for {language}: {str(e)}")
        return {"prompt": prompt, "language": language, "response": f"Error: {str(e)}"}

def transmit_response(self

gugarosa changed discussion status to closed May 9, 2025

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment