artush-habetyan committed
Commit c537c46 · verified · 1 Parent(s): fa4ad33

Upload 8 files
lightweight_conversational_llm.py ADDED
@@ -0,0 +1,75 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+import logging
+
+class LightweightConversationalLLM:
+    def __init__(self, model_name="HuggingFaceTB/SmolLM-1.7B-Instruct"):
+        self.model_name = model_name
+        self.model = None
+        self.tokenizer = None
+        self.setup_model()
+
+    def setup_model(self):
+        try:
+            # Configure 4-bit quantization for memory efficiency
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.bfloat16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4"
+            )
+
+            # Load model with quantization
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
+            )
+
+            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            logging.info(f"Successfully loaded {self.model_name}")
+
+        except Exception as e:
+            logging.warning(f"Failed to load {self.model_name}: {e}")
+            self.model = None
+            self.tokenizer = None
+
+    def generate_response(self, venue_context, user_query, max_length=200):
+        if not self.model or not self.tokenizer:
+            return "I can help you find venues, but conversational features are currently unavailable."
+
+        try:
+            # Create a focused prompt for venue recommendations
+            prompt = f"""You are a helpful Yerevan venue assistant. Based on the venue information provided, give a brief, friendly response.
+
+Venue Context: {venue_context[:500]}...
+
+User: {user_query}
+Assistant:"""
+
+            inputs = self.tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512)
+
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    inputs,
+                    max_new_tokens=max_length,
+                    temperature=0.7,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id,
+                    no_repeat_ngram_size=3
+                )
+
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Extract only the assistant's response
+            assistant_response = response.split("Assistant:")[-1].strip()
+
+            return assistant_response[:max_length] if len(assistant_response) > max_length else assistant_response
+
+        except Exception as e:
+            logging.error(f"Error generating response: {e}")
+            return "I found the venues you requested, but had trouble generating a conversational response."
requirements.txt CHANGED
@@ -5,4 +5,7 @@ geopy>=2.3.0
 scikit-learn>=1.3.0
 regex>=2023.6.3
 huggingface_hub>=0.20.0
-llama-cpp-python>=0.2.0
+transformers>=4.35.0
+torch>=2.0.0
+accelerate>=0.20.0
+bitsandbytes>=0.41.0
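
A quick, hedged sanity check for the swapped dependencies (llama-cpp-python out, transformers stack in). Note that bitsandbytes 4-bit loading generally expects a CUDA GPU; without one, LightweightConversationalLLM.setup_model() falls back to its warning path:

import torch, transformers, accelerate

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
# Without a CUDA device, the 4-bit quantized load is expected to fail
# and the assistant degrades to template responses.
print("CUDA available:", torch.cuda.is_available())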
venue_ai_complete.py CHANGED
@@ -30,6 +30,15 @@ except Exception as e:
 # Import the lightweight RAG enhancer
 from lightweight_rag import LightweightRAGEnhancer
 
+# Import lightweight conversational LLM
+try:
+    from lightweight_conversational_llm import LightweightConversationalLLM
+    LIGHTWEIGHT_LLM_AVAILABLE = True
+    logger.info("Lightweight conversational LLM available")
+except ImportError as e:
+    logger.warning(f"Lightweight conversational LLM not available: {e}")
+    LIGHTWEIGHT_LLM_AVAILABLE = False
+
 class CompleteYerevanVenueAI:
     """
     Complete Bilingual (Armenian/English) AI Assistant for Yerevan Venue Recommendations
@@ -882,55 +891,47 @@ class CompleteYerevanVenueAI:
         }
 
     def _initialize_conversational_llm(self):
-        """Initialize the conversational LLM for chat-like responses"""
-        if not LLAMA_CPP_AVAILABLE:
-            logger.warning("llama-cpp-python not available. Conversational features will be limited.")
-            return
-
-        try:
-            # Use TinyLlama for CPU deployment - much smaller and faster
+        """Initialize conversational LLM with lightweight model preferred"""
+        # Try lightweight transformers-based model first (no compilation needed)
+        if LIGHTWEIGHT_LLM_AVAILABLE:
+            try:
+                logger.info("Initializing lightweight conversational LLM...")
+                self.conversational_llm = LightweightConversationalLLM()
+                logger.info("Successfully initialized lightweight conversational LLM")
+                return
+            except Exception as e:
+                logger.warning(f"Failed to initialize lightweight LLM: {e}")
+
+        # Legacy llama-cpp fallback (if available)
+        if LLAMA_CPP_AVAILABLE:
             try:
                 from huggingface_hub import hf_hub_download
                 logger.info("Downloading TinyLlama model from Hugging Face Hub...")
 
-                # Download smaller, CPU-optimized model
                 model_path = hf_hub_download(
                     repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                     cache_dir="./model_cache"
                 )
 
-                logger.info(f"TinyLlama model downloaded to: {model_path}")
-                model_paths = [model_path]
+                from llama_cpp import Llama
+                self.conversational_llm = Llama(
+                    model_path=model_path,
+                    n_ctx=1024,
+                    n_threads=2,
+                    n_gpu_layers=0,
+                    verbose=False,
+                    use_mmap=True,
+                    use_mlock=False
+                )
+                logger.info("Successfully loaded legacy TinyLlama model")
+                return
 
             except Exception as e:
-                logger.error(f"Failed to download TinyLlama from HF Hub: {e}")
-                # Fallback - no local model available
-                logger.warning("No conversational model available. Using template responses.")
-                return
-
-            for model_path in model_paths:
-                try:
-                    logger.info(f"Attempting to load conversational model: {model_path}")
-                    self.conversational_llm = Llama(
-                        model_path=model_path,
-                        n_ctx=1024,  # Smaller context window for CPU
-                        n_threads=2,  # Limit CPU threads
-                        n_gpu_layers=0,  # CPU only
-                        verbose=False,
-                        use_mmap=True,  # Memory mapping for efficiency
-                        use_mlock=False  # Don't lock memory
-                    )
-                    logger.info(f"Successfully loaded TinyLlama model: {model_path}")
-                    return  # Exit after successful load
-                except Exception as e:
-                    logger.warning(f"Failed to load model {model_path}: {e}")
-
-            logger.error("Could not load any conversational model. Using template responses.")
-
-        except Exception as e:
-            logger.error(f"Error initializing conversational LLM: {e}")
-            self.conversational_llm = None
+                logger.warning(f"Failed to initialize legacy conversational LLM: {e}")
+
+        logger.info("No conversational LLM available, using template-based responses")
+        self.conversational_llm = None
 
     def _add_to_conversation_history(self, user_message: str, ai_response: str):
         """Add a user message and AI response to the conversation history"""
@@ -986,28 +987,33 @@ class CompleteYerevanVenueAI:
             return self._generate_template_response(query, language)
 
         try:
-            context = self._get_conversation_context()
-
-            # Optimized prompt for TinyLlama
-            if language == 'armenian':
-                prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
+            # Check if this is the new lightweight model
+            if hasattr(self.conversational_llm, 'generate_response'):
+                # Use the lightweight model's generate_response method
+                return self.conversational_llm.generate_response("", query, max_length=100)
+            else:
+                # Legacy llama-cpp model
+                context = self._get_conversation_context()
+
+                if language == 'armenian':
+                    prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
 User: {query}
 Assistant:"""
-            else:
-                prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
+                else:
+                    prompt = f"""You are a helpful assistant for Yerevan, Armenia. Be brief and friendly.
 User: {query}
 Assistant:"""
 
-            response = self.conversational_llm(
-                prompt,
-                max_tokens=50,  # Shorter responses for CPU efficiency
-                stop=["User:", "Assistant:", "\n"],
-                temperature=0.7,
-                echo=False,
-            )
-
-            generated_text = response['choices'][0]['text'].strip()
-            return generated_text if generated_text else self._generate_template_response(query, language)
+                response = self.conversational_llm(
+                    prompt,
+                    max_tokens=50,
+                    stop=["User:", "Assistant:", "\n"],
+                    temperature=0.7,
+                    echo=False,
+                )
+
+                generated_text = response['choices'][0]['text'].strip()
+                return generated_text if generated_text else self._generate_template_response(query, language)
 
         except Exception as e:
             logger.error(f"Error generating conversational response: {e}")