import logging

from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
| logger.info("Запускаемся... 🥳🥳🥳") | |

app = FastAPI()

model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

def messages_to_prompt(messages):
    # Convert llama-index ChatMessage objects into a prompt string via the Qwen chat template
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt

def completion_to_prompt(completion):
    # Wrap a plain completion as a single user message and apply the same chat template
    messages = [{"role": "user", "content": completion}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt

llm = LlamaCPP(
    # you can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Qwen2.5 supports a much larger context; 16384 tokens keeps memory usage in check
    context_window=16384,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU (0 means CPU only)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

@app.get("/")
def greet_json():
    return {"Hello": "World!"}

@app.post("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('post/system-prompt')
    # global SYSTEM_PROMPT
    # SYSTEM_PROMPT = text

@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    response = llm.complete(text)
    # return only the generated text so the response serializes cleanly
    return {"response": response.text}