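# Minimal local RAG chat server: MiniCPM-V-2_6 (GGUF) for generation, ChromaDB
# for retrieval, exposed through FastAPI (GET/POST) with a Gradio chat UI.
#
# Assumed dependency set (versions not pinned here):
#   pip install llama-cpp-python sentence-transformers chromadb fastapi gradio uvicorn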
import hashlib
import os

from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from fastapi import FastAPI, Query
import gradio as gr

# 1. Load the model via llama-cpp-python.
# Pick one quantization explicitly: a bare "*.gguf" pattern can match several
# files in this repo (including the mmproj projector) and raise an error.
model = Llama.from_pretrained(
    repo_id="openbmb/MiniCPM-V-2_6-gguf",
    filename="*Q4_K_M.gguf",  # assumption: the Q4_K_M quantization is wanted
    n_ctx=4096,
)
# 2. Set up RAG: embedder + persistent ChromaDB collection
embedder = SentenceTransformer("all-MiniLM-L6-v2")
client = chromadb.PersistentClient(path="chroma_db")
col = client.get_or_create_collection(
    "docs",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
)
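# Note: the collection embeds documents itself on add/upsert (via the
# SentenceTransformer embedding function above), while rag_query below passes
# explicit query embeddings; both use all-MiniLM-L6-v2, so the spaces match.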
# Seed with example context. Use a content hash for IDs so re-runs upsert the
# same documents instead of accumulating duplicates (Python's built-in hash()
# is randomized per process, so it is not a stable ID).
seed_texts = [
    "MiniCPM-V-2_6-gguf runs well on CPU via llama.cpp.",
    "This model supports RAG with ChromaDB and a FastAPI + Gradio UI.",
]
for t in seed_texts:
    col.upsert(documents=[t], ids=[hashlib.sha1(t.encode()).hexdigest()])

def rag_query(q: str) -> str:
    # Retrieve the top-3 most similar seeded documents.
    results = col.query(
        query_embeddings=[embedder.encode(q).tolist()],
        n_results=3,
    )
    context = "\n".join(results["documents"][0])
    prompt = f"Context:\n{context}\n\nUser: {q}\nAssistant:"
    # llama-cpp-python exposes completion via create_completion;
    # there is no Llama.create method.
    out = model.create_completion(prompt=prompt, max_tokens=256, temperature=0.7)
    return out["choices"][0]["text"]

# 3. FastAPI app
app = FastAPI()

@app.get("/ask")
def ask(q: str = Query(...)):
    return {"answer": rag_query(q)}

@app.post("/ask")
def ask_post(body: dict):
    # Accept a JSON body like {"q": "..."} for clients that prefer POST.
    return ask(q=body.get("q", ""))
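
# Example requests (assuming the server is running locally on port 7860):
#   curl "http://localhost:7860/ask?q=Does+MiniCPM+run+on+CPU"
#   curl -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"q": "Does MiniCPM run on CPU?"}'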

# 4. Gradio UI
def chat_fn(message, history):
    # gr.Chatbot expects (user_message, bot_message) pairs, not role labels.
    reply = rag_query(message)
    history = history or []
    history.append((message, reply))
    return history, ""  # second output clears the textbox after each turn

with gr.Blocks() as demo:
    gr.Markdown("### 🧠 MiniCPM-V-2_6-gguf RAG Chat (GET & POST API support)")
    chatbot = gr.Chatbot()
    txt = gr.Textbox(placeholder="Ask me...", show_label=False)
    txt.submit(chat_fn, [txt, chatbot], [chatbot, txt])

# Mount the Gradio app on the FastAPI server instead of launching a second
# HTTP server from a startup hook (demo.launch() blocks and would fight
# uvicorn for the same port).
app = gr.mount_gradio_app(app, demo.queue(), path="/")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
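
# To run (assuming this file is saved as app.py):
#   python app.py
# The Gradio UI is served at http://localhost:7860/ and the API at /ask.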