import os
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from fastapi import FastAPI, Query
import gradio as gr
# 1. Load model via llama-cpp-python
model = Llama.from_pretrained(
    repo_id="openbmb/MiniCPM-V-2_6-gguf",
    filename="*.gguf",
    n_ctx=4096,
)
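# Note: if the repo hosts several .gguf files (e.g. multiple quantizations plus an
# mmproj file), from_pretrained may not accept the "*.gguf" glob; in that case pass
# the exact filename of the quantization you want (the specific name is up to you).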
# 2. Setup RAG
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path="chroma_db")
col = client.get_or_create_collection(
    "docs",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    ),
)
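# Both the collection-level embedding function and the standalone `embedder` use
# all-MiniLM-L6-v2: the collection function embeds documents added as plain text,
# while rag_query() below passes precomputed query embeddings explicitly.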
# Seed with example context
seed_texts = [
    "MiniCPM-V-2_6-gguf runs well on CPU via llama.cpp.",
    "This model supports RAG with ChromaDB and a FastAPI + Gradio UI.",
]
for t in seed_texts:
    col.add(documents=[t], ids=[str(hash(t))])
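# Optional sketch for ingesting your own documents: the file path and fixed-size
# chunking below are illustrative assumptions, not part of the original setup.
def ingest_file(path: str, chunk_size: int = 500) -> None:
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    col.add(
        documents=chunks,
        ids=[f"{os.path.basename(path)}-{i}" for i in range(len(chunks))],
    )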
def rag_query(q: str) -> str:
    results = col.query(
        query_embeddings=[embedder.encode(q).tolist()],
        n_results=3,
    )
    context = "\n".join(results["documents"][0])
    prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
    out = model.create_completion(prompt=prompt, max_tokens=256, temperature=0.7)
    return out["choices"][0]["text"]
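# Quick smoke test (illustrative; works without starting the server):
#   print(rag_query("Which runtime does this demo use?"))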
# 3. FastAPI app
app = FastAPI()
@app.get("/ask")
def ask(q: str = Query(...)):
    return {"answer": rag_query(q)}

@app.post("/ask")
def ask_post(body: dict):
    return ask(q=body.get("q", ""))
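# Example requests once the server is running (default port 7860):
#   curl "http://localhost:7860/ask?q=Hello"
#   curl -X POST "http://localhost:7860/ask" -H "Content-Type: application/json" -d '{"q": "Hello"}'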
# 4. Gradio UI
def chat_fn(message, history):
    reply = rag_query(message)
    history = history or []
    history.append((message, reply))  # Chatbot expects (user, assistant) pairs
    return history, ""  # clear the textbox after each turn

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    txt = gr.Textbox(placeholder="Ask me...", show_label=False)
    txt.submit(chat_fn, [txt, chatbot], [chatbot, txt])
    gr.Markdown("### 🧠 MiniCPM-V-2_6-gguf RAG Chat (GET & POST API support)")
# Mount the Gradio UI onto the FastAPI app so the UI and the /ask API share one server
app = gr.mount_gradio_app(app, demo.queue(), path="/")
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
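# To run (assuming this file is saved as app.py):
#   python app.py
# or equivalently:
#   uvicorn app:app --host 0.0.0.0 --port 7860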