File size: 4,410 Bytes
093ad9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import requests
import json
import os
import argparse

def query_endpoint(endpoint_url, api_token=None, prompt="Hello, how are you?", 
                  system_message="You are a helpful assistant.", 
                  max_tokens=256, temperature=0.7, 
                  format_type="openai", timeout=30.0):
    """
    Query the Phi-4 Mini model at the specified HuggingFace Inference Endpoint.
    
    Args:
        endpoint_url: The URL of your HuggingFace Inference Endpoint
        api_token: Your HuggingFace API token (if needed)
        prompt: The user message to send to the model
        system_message: The system message to include
        max_tokens: Maximum number of tokens to generate
        temperature: Temperature for generation (0.0 to 1.0)
        format_type: Type of request format to use:
                    "openai" - Standard OpenAI format
                    "hf_wrapped" - HuggingFace format with OpenAI format wrapped in "inputs"
                    "simple" - Simple text input in "inputs" field
        timeout: Seconds to wait for the server before giving up (default 30.0)
    
    Returns:
        The parsed JSON response from the model, or None on request failure.
    
    Raises:
        ValueError: If format_type is not one of the supported values.
    """
    # Prepare headers; requests sets Content-Type itself when json= is used,
    # but we keep it explicit for clarity.
    headers = {
        "Content-Type": "application/json"
    }
    
    if api_token:
        headers["Authorization"] = f"Bearer {api_token}"
    
    # Prepare the request payload based on format_type
    if format_type == "openai":
        # Standard OpenAI format
        payload = {
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": max_tokens,
            "temperature": temperature
        }
    elif format_type == "hf_wrapped":
        # HuggingFace wrapped format
        payload = {
            "inputs": {
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": max_tokens,
                "temperature": temperature
            }
        }
    elif format_type == "simple":
        # Simple text input
        payload = {
            "inputs": prompt
        }
    else:
        raise ValueError(f"Invalid format type: {format_type}")
    
    # Make the request
    try:
        print(f"Request payload: {json.dumps(payload, indent=2)}")
        # json= serializes the payload and avoids a manual json.dumps;
        # timeout prevents hanging forever on an unresponsive endpoint.
        response = requests.post(endpoint_url, headers=headers, json=payload,
                                 timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        
        # Parse and return the response
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        # NOTE: must compare against None — a Response with an error status
        # is falsy (Response.__bool__ returns self.ok), so `and e.response:`
        # would silently skip printing the server's error body.
        if hasattr(e, 'response') and e.response is not None:
            print(f"Response content: {e.response.text}")
        return None

if __name__ == "__main__":
    # Command-line entry point: parse options, issue the query, and print
    # either the assistant message or the raw JSON response.
    arg_parser = argparse.ArgumentParser(description="Query a Phi-4 Mini HuggingFace Inference Endpoint")
    arg_parser.add_argument("--url", type=str, required=True, help="The endpoint URL")
    arg_parser.add_argument("--token", type=str, default=os.environ.get("HF_API_TOKEN"), help="HuggingFace API token")
    arg_parser.add_argument("--prompt", type=str, default="Explain quantum computing in simple terms.", help="User prompt")
    arg_parser.add_argument("--system", type=str, default="You are a helpful assistant.", help="System message")
    arg_parser.add_argument("--max_tokens", type=int, default=256, help="Maximum tokens to generate")
    arg_parser.add_argument("--temperature", type=float, default=0.7, help="Temperature (0.0 to 1.0)")
    arg_parser.add_argument("--format", type=str, default="openai", 
                            choices=["openai", "hf_wrapped", "simple"],
                            help="Format to use for the request")

    opts = arg_parser.parse_args()

    print(f"Querying endpoint: {opts.url}")
    print(f"Prompt: {opts.prompt}")
    print(f"Format: {opts.format}")

    # Keyword arguments make the mapping to query_endpoint's signature explicit.
    result = query_endpoint(
        opts.url,
        api_token=opts.token,
        prompt=opts.prompt,
        system_message=opts.system,
        max_tokens=opts.max_tokens,
        temperature=opts.temperature,
        format_type=opts.format,
    )

    if not result:
        print("Failed to get a valid response")
    else:
        print("\nResponse:")
        choices = result.get("choices") if isinstance(result, dict) else None
        if choices:
            # OpenAI-style response: surface just the assistant's text.
            print(choices[0]["message"]["content"])
        else:
            # Unknown shape: dump the whole payload for inspection.
            print(json.dumps(result, indent=2))