artush-habetyan committed
Commit c537c46 · verified · 1 Parent(s): fa4ad33

Upload 8 files
lightweight_conversational_llm.py ADDED
@@ -0,0 +1,75 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import logging
+
+class LightweightConversationalLLM:
+    def __init__(self, model_name="HuggingFaceTB/SmolLM-1.7B-Instruct"):
+        self.model_name = model_name
+        self.model = None
+        self.tokenizer = None
+        self.setup_model()
+
+    def setup_model(self):
+        try:
+            # Configure 4-bit quantization for memory efficiency
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4"
+            )
+
+            # Load model with quantization
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
+            )
+
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            logging.info(f"Successfully loaded {self.model_name}")
+
+        except Exception as e:
+            logging.warning(f"Failed to load {self.model_name}: {e}")
+            self.model = None
+            self.tokenizer = None
+
+    def generate_response(self, venue_context, user_query, max_length=200):
+        if not self.model or not self.tokenizer:
+            return "I can help you find venues, but conversational features are currently unavailable."
+
+        try:
+            # Create a focused prompt for venue recommendations
+            prompt = f"""You are a helpful Yerevan venue assistant. Based on the venue information provided, give a brief, friendly response.
+
+Venue Context: {venue_context[:500]}...
+
+User: {user_query}
+Assistant:"""
+
+            inputs = self.tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
+
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    inputs,
+                    max_new_tokens=max_length,
+                    temperature=0.7,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    no_repeat_ngram_size=3
+                )
+
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extract only the assistant's response
+            assistant_response = response.split("Assistant:")[-1].strip()
+
+            return assistant_response[:max_length] if len(assistant_response) > max_length else assistant_response
+
+        except Exception as e:
+            logging.error(f"Error generating response: {e}")
+            return "I found the venues you requested, but had trouble generating a conversational response."
requirements.txt CHANGED
@@ -5,4 +5,7 @@ geopy>=2.3.0
 scikit-learn>=1.3.0
 regex>=2023.6.3
 huggingface_hub>=0.20.0
-llama-cpp-python>=0.2.0
+transformers>=4.35.0
+torch>=2.0.0
+accelerate>=0.20.0
+bitsandbytes>=0.41.0
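
A quick, hedged sanity check for the swapped dependencies (llama-cpp-python out, transformers stack in). Note that bitsandbytes 4-bit loading generally expects a CUDA GPU; without one, LightweightConversationalLLM.setup_model() falls back to its warning path:

import torch, transformers, accelerate

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
# Without a CUDA device, the 4-bit quantized load is expected to fail
# and the assistant degrades to template responses.
print("CUDA available:", torch.cuda.is_available())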
venue_ai_complete.py CHANGED
@@ -30,6 +30,15 @@ except Exception as e:
 # Import the lightweight RAG enhancer
 from lightweight_rag import LightweightRAGEnhancer
 
+# Import lightweight conversational LLM
+try:
+    from lightweight_conversational_llm import LightweightConversationalLLM
+    LIGHTWEIGHT_LLM_AVAILABLE = True
+    logger.info("Lightweight conversational LLM available")
+except ImportError as e:
+    logger.warning(f"Lightweight conversational LLM not available: {e}")
+    LIGHTWEIGHT_LLM_AVAILABLE = False
+
 class CompleteYerevanVenueAI:
     """
     Complete Bilingual (Armenian/English) AI Assistant for Yerevan Venue Recommendations
@@ -882,55 +891,47 @@ class CompleteYerevanVenueAI:
         }
 
     def _initialize_conversational_llm(self):
-        """Initialize the conversational LLM for chat-like responses"""
-        if not LLAMA_CPP_AVAILABLE:
-            logger.warning("llama-cpp-python not available. Conversational features will be limited.")
-            return
-
-        try:
-            # Use TinyLlama for CPU deployment - much smaller and faster
+        """Initialize conversational LLM with lightweight model preferred"""
+        # Try lightweight transformers-based model first (no compilation needed)
+        if LIGHTWEIGHT_LLM_AVAILABLE:
+            try:
+                logger.info("Initializing lightweight conversational LLM...")
+                self.conversational_llm = LightweightConversationalLLM()
+                logger.info("Successfully initialized lightweight conversational LLM")
+                return
+            except Exception as e:
+                logger.warning(f"Failed to initialize lightweight LLM: {e}")
+
+        # Legacy llama-cpp fallback (if available)
+        if LLAMA_CPP_AVAILABLE:
             try:
                 from huggingface_hub import hf_hub_download
                 logger.info("Downloading TinyLlama model from Hugging Face Hub...")
 
-                # Download smaller, CPU-optimized model
                 model_path = hf_hub_download(
                     repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                     cache_dir="./model_cache"
                 )
 
-                logger.info(f"TinyLlama model downloaded to: {model_path}")
-                model_paths = [model_path]
+                from llama_cpp import Llama
+                self.conversational_llm = Llama(
+                    model_path=model_path,
+                    n_ctx=1024,
+                    n_threads=2,
+                    n_gpu_layers=0,
+                    verbose=False,
+                    use_mmap=True,
+                    use_mlock=False
+                )
+                logger.info("Successfully loaded legacy TinyLlama model")
+                return
 
             except Exception as e:
-                logger.error(f"Failed to download TinyLlama from HF Hub: {e}")
-                # Fallback - no local model available
-                logger.warning("No conversational model available. Using template responses.")
-                return
-
-            for model_path in model_paths:
-                try:
-                    logger.info(f"Attempting to load conversational model: {model_path}")
-                    self.conversational_llm = Llama(
-                        model_path=model_path,
-                        n_ctx=1024,  # Smaller context window for CPU
-                        n_threads=2,  # Limit CPU threads
-                        n_gpu_layers=0,  # CPU only
-                        verbose=False,
-                        use_mmap=True,  # Memory mapping for efficiency
-                        use_mlock=False  # Don't lock memory
-                    )
-                    logger.info(f"Successfully loaded TinyLlama model: {model_path}")
-                    return  # Exit after successful load
-                except Exception as e:
-                    logger.warning(f"Failed to load model {model_path}: {e}")
-
-            logger.error("Could not load any conversational model. Using template responses.")
-
-        except Exception as e:
-            logger.error(f"Error initializing conversational LLM: {e}")
-            self.conversational_llm = None
+                logger.warning(f"Failed to initialize legacy conversational LLM: {e}")
+
+        logger.info("No conversational LLM available, using template-based responses")
+        self.conversational_llm = None
 
     def _add_to_conversation_history(self, user_message: str, ai_response: str):
         """Add a user message and AI response to the conversation history"""
@@ -986,28 +987,33 @@ class CompleteYerevanVenueAI:
             return self._generate_template_response(query, language)
 
         try:
-            context = self._get_conversation_context()
-
-            # Optimized prompt for TinyLlama
-            if language == 'armenian':
-                prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
+            # Check if this is the new lightweight model
+            if hasattr(self.conversational_llm, 'generate_response'):
+                # Use the lightweight model's generate_response method
+                return self.conversational_llm.generate_response("", query, max_length=100)
+            else:
+                # Legacy llama-cpp model
+                context = self._get_conversation_context()
+
+                if language == 'armenian':
+                    prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
 User: {query}
 Assistant:"""
-            else:
-                prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
+                else:
+                    prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
 User: {query}
 Assistant:"""
 
-            response = self.conversational_llm(
-                prompt,
-                max_tokens=50,  # Shorter responses for CPU efficiency
-                stop=["User:", "Assistant:", "\n"],
-                temperature=0.7,
-                echo=False,
-            )
-
-            generated_text = response['choices'][0]['text'].strip()
-            return generated_text if generated_text else self._generate_template_response(query, language)
+                response = self.conversational_llm(
+                    prompt,
+                    max_tokens=50,
+                    stop=["User:", "Assistant:", "\n"],
+                    temperature=0.7,
+                    echo=False,
+                )
+
+                generated_text = response['choices'][0]['text'].strip()
+                return generated_text if generated_text else self._generate_template_response(query, language)
 
         except Exception as e:
             logger.error(f"Error generating conversational response: {e}")