import os
import threading
from typing import List

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel, Field

# Cache Hugging Face downloads locally instead of under the default ~/.cache.
os.environ["HF_HOME"] = "./cache"

# Download the GGUF weights from the Hugging Face Hub on the first run and
# load them; Q8_0 is the 8-bit quantization, n_ctx the context window in tokens.
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
    filename="Llama-3.2-3B-Instruct-Q8_0.gguf",
    n_ctx=4096,
    cache_dir="./cache",
    n_threads=2,
)
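
# If the GGUF file is already on disk, the plain constructor skips the Hub
# download entirely; the model_path below is hypothetical and depends on
# where the file actually lives:
#
# llm = Llama(
#     model_path="./cache/Llama-3.2-3B-Instruct-Q8_0.gguf",
#     n_ctx=4096,
#     n_threads=2,
# )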


class Message(BaseModel):
    role: str  # typically "system", "user", or "assistant"
    content: str


class Validation(BaseModel):
    """Request body for /generate_response."""

    messages: List[Message] = Field(default_factory=list)
    max_tokens: int = 1024
    temperature: float = 0.01


app = FastAPI()

# A Llama instance is not safe to call from multiple threads at once, so a
# lock serializes completions across FastAPI's worker threads.
llm_lock = threading.Lock()


@app.post("/generate_response")
def generate_response(item: Validation):
    # A plain `def` endpoint runs in FastAPI's threadpool, so the blocking
    # llama.cpp call cannot stall the event loop the way it would inside
    # `async def`.
    with llm_lock:
        response = llm.create_chat_completion(
            messages=[{"role": m.role, "content": m.content} for m in item.messages],
            max_tokens=item.max_tokens,
            temperature=item.temperature,
        )
    # The result mirrors the OpenAI chat-completion schema; return only the text.
    return {"response": response["choices"][0]["message"]["content"]}
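

# A minimal way to launch the server directly, assuming this file is saved as
# main.py (so `uvicorn main:app` works too) and that uvicorn is installed.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request against the running server (values are illustrative):
#   curl -X POST http://localhost:8000/generate_response \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 128}'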