"""GPU inference with a fine-tuned Qwen-2.5-7B model using vLLM, with JSON output."""

from vllm import LLM, SamplingParams
import argparse
import json
import time
import datetime

import torch


def setup_model(model_path, tensor_parallel_size=None, dtype="bfloat16", gpu_memory_utilization=0.85):
    """
    Initialize the fine-tuned Qwen-2.5-7B model from a local path with explicit GPU configuration.

    Args:
        model_path: Path to the directory containing the trained model
        tensor_parallel_size: Number of GPUs to use for tensor parallelism (None means auto-detect)
        dtype: Data type for model weights (bfloat16, float16, or float32)
        gpu_memory_utilization: Fraction of GPU memory to use (0.0 to 1.0)
    """
    # vLLM expects an integer tensor_parallel_size, so auto-detect the GPU count
    # when the caller does not supply one.
    if tensor_parallel_size is None:
        tensor_parallel_size = max(torch.cuda.device_count(), 1)

    print(f"Loading fine-tuned Qwen model from: {model_path}")
    print(f"GPU configuration: tensor_parallel_size={tensor_parallel_size}, dtype={dtype}, "
          f"gpu_memory_utilization={gpu_memory_utilization}")

    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=tensor_parallel_size,
        dtype=dtype,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=False,  # keep CUDA graph capture enabled for faster decoding
    )

    print("Model loaded successfully!")
    return llm


def generate_response(llm, prompt, temperature=0.7, max_tokens=512, top_p=0.9):
    """Generate a response for a given prompt."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def chat_completion(llm, messages, temperature=0.7, max_tokens=512):
    """Generate a chat completion from messages."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    # Prefer the tokenizer's built-in chat template; fall back to manual
    # ChatML formatting if the tokenizer does not provide one.
    tokenizer = llm.get_tokenizer()
    if hasattr(tokenizer, "apply_chat_template"):
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    else:
        prompt = format_messages_manually(messages)

    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text


def format_messages_manually(messages):
    """Format messages manually if chat template is not available."""
    formatted_prompt = ""
    for message in messages:
        role = message["role"]
        content = message["content"]
        if role == "system":
            formatted_prompt += f"<|im_start|>system\n{content}<|im_end|>\n"
        elif role == "user":
            formatted_prompt += f"<|im_start|>user\n{content}<|im_end|>\n"
        elif role == "assistant":
            formatted_prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n"
    # Open an assistant turn so the model generates the next reply.
    formatted_prompt += "<|im_start|>assistant\n"
    return formatted_prompt
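
# For illustration only: a minimal conversation such as
#   [{"role": "system", "content": "You are a helpful AI assistant."},
#    {"role": "user", "content": "Hello!"}]
# is rendered by format_messages_manually into Qwen's ChatML layout:
#   <|im_start|>system
#   You are a helpful AI assistant.<|im_end|>
#   <|im_start|>user
#   Hello!<|im_end|>
#   <|im_start|>assistant
# The trailing assistant header is what cues the model to produce the reply.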


def batch_inference(llm, prompts, temperature=0.7, max_tokens=512):
    """Run batch inference on multiple prompts."""
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=max_tokens
    )

    outputs = llm.generate(prompts, sampling_params)
    return [output.outputs[0].text for output in outputs]


def save_to_json(data, output_path=None):
    """Save results to a JSON file."""
    if not output_path:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f"qwen_inference_results_{timestamp}.json"

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Results saved to: {output_path}")
    return output_path


def main():
    parser = argparse.ArgumentParser(description="GPU inference with fine-tuned Qwen model with JSON output")
    parser.add_argument("--model_path", required=True, help="Path to the fine-tuned model directory")
    parser.add_argument("--mode", choices=["single", "chat", "batch"], default="single", help="Inference mode")
    parser.add_argument("--prompt", help="Prompt for single inference mode")
    parser.add_argument("--prompt_file", help="File containing prompts for batch mode (one per line)")
    parser.add_argument("--output_file", help="Path to save JSON results (default: auto-generated)")
    parser.add_argument("--max_tokens", type=int, default=512, help="Maximum tokens in response")
    parser.add_argument("--temperature", type=float, default=0.7, help="Temperature for sampling")
    parser.add_argument("--gpu_count", type=int, help="Number of GPUs to use (default: all available)")
    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="bfloat16", help="Data type for weights")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.85, help="GPU memory utilization (0.0-1.0)")
    args = parser.parse_args()

    llm = setup_model(
        model_path=args.model_path,
        tensor_parallel_size=args.gpu_count,
        dtype=args.dtype,
        gpu_memory_utilization=args.gpu_memory_utilization
    )

    results = {}

    if args.mode == "single":
        if not args.prompt:
            args.prompt = input("Enter your prompt: ")

        print("\nGenerating response...")
        start_time = time.time()
        response = generate_response(
            llm,
            args.prompt,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
        end_time = time.time()

        print(f"\nResponse:\n{response}")

        results = {
            "mode": "single",
            "timestamp": datetime.datetime.now().isoformat(),
            "input": args.prompt,
            "output": response,
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens
            },
            "performance": {
                "time_seconds": end_time - start_time
            }
        }

    elif args.mode == "chat":
        # Seed the conversation with a system message.
        messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
        results = {
            "mode": "chat",
            "timestamp": datetime.datetime.now().isoformat(),
            "conversation": []
        }

        print("\nChat mode. Type 'exit' or 'quit' to end the conversation and save to JSON.\n")

        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Ending conversation and saving results...")
                break

            messages.append({"role": "user", "content": user_input})

            start_time = time.time()
            response = chat_completion(
                llm,
                messages,
                temperature=args.temperature,
                max_tokens=args.max_tokens
            )
            end_time = time.time()

            print(f"\nAssistant: {response}")
            messages.append({"role": "assistant", "content": response})

            # Record each turn together with its latency.
            results["conversation"].append({
                "user": user_input,
                "assistant": response,
                "time_seconds": end_time - start_time
            })

    elif args.mode == "batch":
        if not args.prompt_file:
            print("Error: --prompt_file required for batch mode")
            return
        # Read prompts one per line, skipping blank lines.
        with open(args.prompt_file, 'r', encoding='utf-8') as f:
            prompts = [line.strip() for line in f if line.strip()]

        print(f"Running batch inference on {len(prompts)} prompts...")
        start_time = time.time()
        responses = batch_inference(
            llm,
            prompts,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )
        end_time = time.time()

        results = {
            "mode": "batch",
            "timestamp": datetime.datetime.now().isoformat(),
            "results": [{"input": p, "output": r} for p, r in zip(prompts, responses)],
            "parameters": {
                "temperature": args.temperature,
                "max_tokens": args.max_tokens
            },
            "performance": {
                "time_seconds": end_time - start_time,
                "num_prompts": len(prompts)
            }
        }

    # Persist the results for every mode (auto-named file if --output_file is omitted).
    if results:
        save_to_json(results, args.output_file)


if __name__ == "__main__":
    main()
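
# Example invocations (illustrative only; the script filename and model path
# below are placeholders, not part of this repository):
#
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode single --prompt "Summarize what tensor parallelism does."
#
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode chat --gpu_count 2 --temperature 0.5
#
#   python qwen_vllm_inference.py --model_path ./qwen2.5-7b-finetuned \
#       --mode batch --prompt_file prompts.txt --output_file results.json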