# app.py
from contextlib import asynccontextmanager
import gradio as gr
from fastapi import FastAPI
from config import settings
from rabbit_base import RabbitBase
from listener import RabbitListenerBase
from rabbit_repo import RabbitRepo
from service import LLMService
from runners.base import ILLMRunner
# =========================
# @spaces.GPU() SECTION
# =========================
# Mirrors the working Space: define a concrete GPU-decorated fn that Gradio calls.
try:
    import spaces

    ZERO_GPU_AVAILABLE = True

    @spaces.GPU(duration=120)  # trivial GPU entrypoint; detector-friendly
    def gpu_entrypoint():
        """
        Minimal GPU function so ZeroGPU sees a GPU endpoint.
        Replace the body later with real CUDA work as needed.
        """
        return "gpu: ready"
except Exception:
    ZERO_GPU_AVAILABLE = False

    def gpu_entrypoint():
        return "gpu: not available (CPU only)"
# ---------------- Runner factory (stub) ----------------
class EchoRunner(ILLMRunner):
    """No-op stand-in runner used until a real LLM runner is wired in."""
    Type = "EchoRunner"

    async def StartProcess(self, llmServiceObj: dict): pass
    async def RemoveProcess(self, sessionId: str): pass
    async def StopRequest(self, sessionId: str): pass
    async def SendInputAndGetResponse(self, llmServiceObj: dict): pass

async def runner_factory(llmServiceObj: dict) -> ILLMRunner:
    return EchoRunner()
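# A real factory would presumably dispatch on the session payload to pick a
# concrete runner. Hypothetical sketch (the "Type" key and LlamaRunner are
# illustrative, not part of this codebase):
#
#   async def runner_factory(llmServiceObj: dict) -> ILLMRunner:
#       if (llmServiceObj or {}).get("Type") == "LlamaRunner":
#           return LlamaRunner()
#       return EchoRunner()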
# ---------------- Publisher and Service ----------------
publisher = RabbitRepo(external_source="https://space.external")
service = LLMService(publisher, runner_factory)
# ---------------- Handlers (.NET FuncName -> service) ----------------
async def h_start(data): await service.StartProcess(data or {})
async def h_user(data): await service.UserInput(data or {})
async def h_remove(data): await service.RemoveSession(data or {})
async def h_stop(data): await service.StopRequest(data or {})
async def h_qir(data): await service.QueryIndexResult(data or {})
async def h_getreg(_): await service.GetFunctionRegistry(False)
async def h_getreg_f(_): await service.GetFunctionRegistry(True)
handlers = {
    "llmStartSession": h_start,
    "llmUserInput": h_user,
    "llmRemoveSession": h_remove,
    "llmStopRequest": h_stop,
    "queryIndexResult": h_qir,
    "getFunctionRegistry": h_getreg,
    "getFunctionRegistryFiltered": h_getreg_f,
}
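# Each handler receives the decoded message body as a dict. A purely
# illustrative direct call (the field names are hypothetical, not the
# actual .NET message contract):
#
#   await handlers["llmUserInput"]({"SessionId": "abc123", "Input": "hello"})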
# ---------------- Listener wiring ----------------
base = RabbitBase()
listener = RabbitListenerBase(
    base,
    instance_name=settings.RABBIT_INSTANCE_NAME,  # queue prefix like your .NET instance
    handlers=handlers,
)
# Declarations mirror your C# InitRabbitMQObjs()
DECLS = [
    {"ExchangeName": f"llmStartSession{settings.SERVICE_ID}", "FuncName": "llmStartSession",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"llmUserInput{settings.SERVICE_ID}", "FuncName": "llmUserInput",
     "MessageTimeout": 600_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"llmRemoveSession{settings.SERVICE_ID}", "FuncName": "llmRemoveSession",
     "MessageTimeout": 60_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"llmStopRequest{settings.SERVICE_ID}", "FuncName": "llmStopRequest",
     "MessageTimeout": 60_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"queryIndexResult{settings.SERVICE_ID}", "FuncName": "queryIndexResult",
     "MessageTimeout": 60_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"getFunctionRegistry{settings.SERVICE_ID}", "FuncName": "getFunctionRegistry",
     "MessageTimeout": 60_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
    {"ExchangeName": f"getFunctionRegistryFiltered{settings.SERVICE_ID}", "FuncName": "getFunctionRegistryFiltered",
     "MessageTimeout": 60_000, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]},
]
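# Equivalent, behavior-identical construction from (FuncName, timeout) pairs,
# shown only as a sketch to highlight the repeated structure:
#
#   DECLS = [
#       {"ExchangeName": f"{fn}{settings.SERVICE_ID}", "FuncName": fn,
#        "MessageTimeout": ms, "RoutingKeys": [settings.RABBIT_ROUTING_KEY]}
#       for fn, ms in [
#           ("llmStartSession", 600_000), ("llmUserInput", 600_000),
#           ("llmRemoveSession", 60_000), ("llmStopRequest", 60_000),
#           ("queryIndexResult", 60_000), ("getFunctionRegistry", 60_000),
#           ("getFunctionRegistryFiltered", 60_000),
#       ]
#   ]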
# ---------------- Gradio UI (smoke test + GPU button) ----------------
async def ping():
    return "ok"
with gr.Blocks() as demo:
    gr.Markdown("### LLM Runner (Python) — RabbitMQ listener (ZeroGPU-ready)")
    with gr.Row():
        btn = gr.Button("Ping")
        out = gr.Textbox(label="Ping result")
    btn.click(ping, inputs=None, outputs=out)

    # Reference the GPU-decorated function directly (no lambda), so the
    # ZeroGPU detector sees a GPU endpoint wired into the UI.
    with gr.Row():
        gpu_btn = gr.Button("GPU Ready Probe")
        gpu_out = gr.Textbox(label="GPU Probe Result", interactive=False)
    gpu_btn.click(gpu_entrypoint, inputs=None, outputs=gpu_out)
# ---------------- FastAPI + lifespan ----------------
@asynccontextmanager
async def lifespan(_app: FastAPI):
    # startup
    await publisher.connect()
    await service.init()
    await listener.start(DECLS)
    yield
    # shutdown (optional): stop consuming before closing the connection
    # await listener.stop()
    # await publisher.close()
app = FastAPI(lifespan=lifespan)
app = gr.mount_gradio_app(app, demo, path="/")
@app.get("/health")
async def health():
    return {"status": "ok"}
# Also expose the probe via HTTP (extra-safe for detectors)
@app.get("/gpu-probe")
def gpu_probe_route():
    return {"status": gpu_entrypoint()}
if __name__ == "__main__":
    # For local runs; on HF Spaces, the SDK manages the server.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)