johnbridges committed on
Commit ec97b47 · 1 Parent(s): 111b7a3
Files changed (3)
  1. app.py +8 -11
  2. state.py +5 -0
  3. vllm_backend.py +6 -5
app.py CHANGED
@@ -7,7 +7,8 @@ from rabbit_base import RabbitBase
 from listener import RabbitListenerBase
 from rabbit_repo import RabbitRepo
 from oa_server import OpenAIServers
-from vllm_backend import VLLMChatBackend, StubImagesBackend  # ✅ use your backend module
+from vllm_backend import VLLMChatBackend, StubImagesBackend  # ✅ our backend
+import state  # holds vllm_engine reference
 
 # ---- vLLM imports ----
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -28,14 +29,10 @@ except Exception:
     def gpu_entrypoint() -> str:
         return "gpu: not available (CPU only)"
 
-# ----------------- vLLM globals -----------------
-vllm_engine: AsyncLLMEngine | None = None
-
+# ----------------- vLLM init -----------------
 async def init_vllm():
-    """Initialize vLLM engine with a Hugging Face model."""
-    global vllm_engine
-    if vllm_engine is not None:
-        return vllm_engine
+    if state.vllm_engine is not None:
+        return state.vllm_engine
 
     model_id = getattr(settings, "LlmHFModelID", "Qwen/Qwen2.5-7B-Instruct")
     log.info(f"Loading vLLM model: {model_id}")
@@ -45,8 +42,8 @@ async def init_vllm():
         trust_remote_code=True,
         max_model_len=getattr(settings, "LlmOpenAICtxSize", 32768),
     )
-    vllm_engine = AsyncLLMEngine.from_engine_args(args)
-    return vllm_engine
+    state.vllm_engine = AsyncLLMEngine.from_engine_args(args)
+    return state.vllm_engine
 
 # ----------------- RabbitMQ wiring -----------------
 publisher = RabbitRepo(external_source="openai.mq.server")
@@ -55,7 +52,7 @@ base = RabbitBase(exchange_type_resolver=resolver)
 
 servers = OpenAIServers(
     publisher,
-    chat_backend=VLLMChatBackend(),  # ✅ now from llm_backend.py
+    chat_backend=VLLMChatBackend(),
     images_backend=StubImagesBackend()
 )
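For reference, the engine construction these hunks edit around follows vLLM's standard async API. The sketch below is an illustration of that API, not the commit's exact code (the args assignment itself sits outside the shown hunks), with literal values standing in for the settings lookups in app.py:

# Illustration only; assumes vLLM's public AsyncEngineArgs / AsyncLLMEngine API.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

args = AsyncEngineArgs(
    model="Qwen/Qwen2.5-7B-Instruct",  # app.py reads this from settings.LlmHFModelID
    trust_remote_code=True,
    max_model_len=32768,               # app.py reads this from settings.LlmOpenAICtxSize
)
engine = AsyncLLMEngine.from_engine_args(args)  # what init_vllm() stores in state.vllm_engine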
 
state.py ADDED
@@ -0,0 +1,5 @@
+# state.py
+from typing import Optional
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+
+vllm_engine: Optional[AsyncLLMEngine] = None
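The commit does not show where init_vllm() is awaited. A minimal sketch of one way the shared engine could be initialized at startup, assuming a plain asyncio entrypoint; only init_vllm and the state module are names from this commit, main() is hypothetical:

# Hypothetical entrypoint sketch.
import asyncio

import state
from app import init_vllm

async def main() -> None:
    await init_vllm()                     # idempotent: returns the cached engine on repeat calls
    assert state.vllm_engine is not None  # the engine now lives in the shared state module
    # ... start the RabbitMQ listeners / OpenAI servers defined in app.py ...

if __name__ == "__main__":
    asyncio.run(main())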
vllm_backend.py CHANGED
@@ -1,10 +1,10 @@
 # vllm_backend.py
-import time, json, logging
+import time, logging
 from typing import Any, Dict, AsyncIterable
 
 from vllm.sampling_params import SamplingParams
 from oa_server import ChatBackend, ImagesBackend
-from app import vllm_engine  # global vLLM engine created in app.py
+from state import vllm_engine  # ✅ the single source of truth
 
 logger = logging.getLogger(__name__)
 
@@ -17,14 +17,14 @@ class VLLMChatBackend(ChatBackend):
         if vllm_engine is None:
             raise RuntimeError("vLLM engine not initialized")
 
-        # Naive: just grab the last user message for now
+        # For now: just grab the last user message
         messages = request.get("messages", [])
         prompt = messages[-1]["content"] if messages else "(empty)"
 
         params = SamplingParams(
            temperature=float(request.get("temperature", 0.7)),
            max_tokens=int(request.get("max_tokens", 512)),
-           stream=True
+           stream=True,
         )
 
         rid = f"chatcmpl-local-{int(time.time())}"
@@ -47,7 +47,7 @@ class VLLMChatBackend(ChatBackend):
             logger.exception("vLLM generation failed")
             raise
 
-        # final stop signal
+        # Final stop signal
         yield {
             "id": rid,
             "object": "chat.completion.chunk",
@@ -63,4 +63,5 @@ class StubImagesBackend(ImagesBackend):
     """
     async def generate_b64(self, request: Dict[str, Any]) -> str:
         logger.warning("Image generation not supported in local vLLM backend.")
+        # 1x1 transparent PNG
        return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGP4BwQACfsD/etCJH0AAAAASUVORK5CYII="
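One Python detail worth keeping in mind with `from state import vllm_engine`: the import copies the value that state.vllm_engine holds at import time (None), so the name inside vllm_backend.py does not follow the later assignment made by init_vllm() in app.py. A minimal sketch of call-time attribute access that always sees the current engine; the _get_engine helper is illustrative, not part of the commit:

# Sketch, assuming the state module added in this commit.
import state

def _get_engine():
    # Looked up at call time, so it observes whatever init_vllm() stored
    # in state.vllm_engine after startup, rather than the import-time value.
    if state.vllm_engine is None:
        raise RuntimeError("vLLM engine not initialized")
    return state.vllm_engine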