# Source: Hugging Face Space artush-habetyan/vibe_sip (status: Sleeping).
# File size: 5,004 bytes.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging

# Optional-dependency probe: record whether transformers exposes
# BitsAndBytesConfig so the loader knows if a 4-bit config can be built.
# A failed import is not fatal; it only disables the quantized path.
try:
    from transformers import BitsAndBytesConfig
except ImportError:
    QUANTIZATION_AVAILABLE = False
    logging.warning("BitsAndBytesConfig not available, quantization disabled")
else:
    QUANTIZATION_AVAILABLE = True

class LightweightConversationalLLM:
    """Small causal language model wrapper that phrases venue recommendations.

    The model and tokenizer are loaded eagerly in ``__init__``. If loading
    fails, both ``model`` and ``tokenizer`` are left as ``None`` and
    ``generate_response`` returns a fixed fallback sentence instead of raising.
    """

    # Maximum number of venue-context characters copied into the prompt.
    MAX_CONTEXT_CHARS = 800
    # Maximum number of prompt tokens handed to the model.
    MAX_PROMPT_TOKENS = 512

    def __init__(self, model_name="HuggingFaceTB/SmolLM-1.7B-Instruct"):
        """Remember ``model_name`` and immediately try to load the model.

        Args:
            model_name: Hugging Face Hub identifier (or local path) passed to
                ``from_pretrained`` for both the model and the tokenizer.
        """
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.setup_model()

    def setup_model(self):
        """Load the model and tokenizer, leaving both ``None`` on any failure.

        The 4-bit quantized load is attempted first when
        ``QUANTIZATION_AVAILABLE`` is true; if it raises, the model is loaded
        again in float16 without quantization. Any error that survives the
        fallback (or a tokenizer error) is logged as a warning and swallowed.
        """
        try:
            self.model = self._load_model()

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            # Reuse EOS as the padding token when the tokenizer defines none.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

        except Exception as e:
            # Deliberate best-effort: the caller keeps working without the LLM.
            logging.warning(f"Failed to load {self.model_name}: {e}")
            self.model = None
            self.tokenizer = None

    def _load_model(self):
        """Return a loaded model, preferring 4-bit quantization when possible."""
        if QUANTIZATION_AVAILABLE:
            try:
                return self._load_quantized()
            except Exception as quant_error:
                logging.warning(f"4-bit quantization failed: {quant_error}")
                logging.info("Falling back to regular model loading...")
        else:
            logging.info("Loading model without quantization (bitsandbytes not available)")
        return self._load_unquantized()

    def _load_quantized(self):
        """Load the model with a 4-bit NF4 bitsandbytes configuration."""
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        # NOTE(review): trust_remote_code=True allows the model repository to
        # run its own Python code at load time; confirm it is really needed
        # for the configured model before keeping it.
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=quantization_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )
        logging.info(f"Successfully loaded {self.model_name} with 4-bit quantization")
        return model

    def _load_unquantized(self):
        """Load the model in float16 without quantization."""
        # NOTE(review): trust_remote_code=True allows the model repository to
        # run its own Python code at load time; confirm it is really needed.
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            device_map="auto",
            torch_dtype=torch.float16,  # Use float16 for memory efficiency
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
        logging.info(f"Successfully loaded {self.model_name} without quantization")
        return model

    @classmethod
    def _build_prompt(cls, venue_context, user_query):
        """Return the plain-text dialogue prompt sent to the model."""
        context = venue_context or ""
        snippet = context[:cls.MAX_CONTEXT_CHARS]
        # Only mark the context as shortened when something was actually cut.
        if len(context) > cls.MAX_CONTEXT_CHARS:
            snippet += "..."
        return (
            "You are a helpful Yerevan venue assistant. Based on the venue "
            "information provided, give a brief, friendly response.\n"
            "\n"
            f"Venue Context: {snippet}\n"
            "\n"
            f"User: {user_query}\n"
            "Assistant:"
        )

    def generate_response(self, venue_context, user_query, max_length=400):
        """Generate a short conversational reply about the given venues.

        Args:
            venue_context: Text describing the venues; only the first
                ``MAX_CONTEXT_CHARS`` characters are put into the prompt.
                ``None`` is treated as an empty context.
            user_query: The user's question, inserted verbatim into the prompt.
            max_length: Used both as the maximum number of newly generated
                tokens and as the maximum number of characters returned.

        Returns:
            The assistant's reply, or a fixed fallback sentence when the model
            is not loaded or generation raises.
        """
        if self.model is None or self.tokenizer is None:
            return "I can help you find venues, but conversational features are currently unavailable."

        try:
            prompt = self._build_prompt(venue_context, user_query)

            encoded = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=self.MAX_PROMPT_TOKENS,
                return_attention_mask=True,
            )
            # The model is loaded with device_map="auto", so its weights may
            # not be on the CPU; inputs must live on the same device.
            device = self.model.device
            input_ids = encoded["input_ids"].to(device)
            # An explicit mask is needed because the pad token may be the EOS
            # token, in which case it cannot be inferred from the ids.
            attention_mask = encoded["attention_mask"].to(device)

            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=max_length,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3,
                )

            # Decode only the tokens produced after the prompt. Splitting the
            # full text on "Assistant:" picks the wrong segment when the model
            # writes further turns or when the prompt was truncated.
            prompt_length = input_ids.shape[-1]
            generated = self.tokenizer.decode(
                output_ids[0][prompt_length:], skip_special_tokens=True
            )
            # The prompt is a plain-text dialogue, so the model may keep
            # writing a follow-up "User:" turn; keep just the first reply.
            reply = generated.split("\nUser:", 1)[0].strip()

            return reply[:max_length]

        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "I found the venues you requested, but had trouble generating a conversational response."