import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging

# Try to import BitsAndBytesConfig, but don't fail if not available
try:
    from transformers import BitsAndBytesConfig
    QUANTIZATION_AVAILABLE = True
except ImportError:
    QUANTIZATION_AVAILABLE = False
    logging.warning("BitsAndBytesConfig not available, quantization disabled")

class LightweightConversationalLLM:
    def __init__(self, model_name="HuggingFaceTB/SmolLM-1.7B-Instruct"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.setup_model()

    def setup_model(self):
        try:
            # Try quantization first if available, fall back to regular loading
            if QUANTIZATION_AVAILABLE:
                try:
                    # Configure 4-bit quantization for memory efficiency
                    quantization_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.bfloat16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4"
                    )
                    # Load model with quantization
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.model_name,
                        quantization_config=quantization_config,
                        device_map="auto",
                        torch_dtype=torch.bfloat16,
                        trust_remote_code=True
                    )
                    logging.info(f"Successfully loaded {self.model_name} with 4-bit quantization")
                except Exception as quant_error:
                    logging.warning(f"4-bit quantization failed: {quant_error}")
                    logging.info("Falling back to regular model loading...")
                    # Fall back to regular loading without quantization
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.model_name,
                        device_map="auto",
                        torch_dtype=torch.float16,  # Use float16 for memory efficiency
                        trust_remote_code=True,
                        low_cpu_mem_usage=True
                    )
                    logging.info(f"Successfully loaded {self.model_name} without quantization")
            else:
                # Load without quantization
                logging.info("Loading model without quantization (bitsandbytes not available)")
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_name,
                    device_map="auto",
                    torch_dtype=torch.float16,  # Use float16 for memory efficiency
                    trust_remote_code=True,
                    low_cpu_mem_usage=True
                )
                logging.info(f"Successfully loaded {self.model_name} without quantization")

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
        except Exception as e:
            logging.warning(f"Failed to load {self.model_name}: {e}")
            self.model = None
            self.tokenizer = None

    def generate_response(self, venue_context, user_query, max_length=400):
        if not self.model or not self.tokenizer:
            return "I can help you find venues, but conversational features are currently unavailable."
        try:
            # Create a focused prompt for venue recommendations
            prompt = f"""You are a helpful Yerevan venue assistant. Based on the venue information provided, give a brief, friendly response.
Venue Context: {venue_context[:800]}...
User: {user_query}
Assistant:"""
            # Tokenize, then move inputs to the model's device
            # (device_map="auto" may have placed the model on GPU)
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,  # supplies input_ids and attention_mask
                    max_new_tokens=max_length,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3
                )
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Extract only the assistant's response
            assistant_response = response.split("Assistant:")[-1].strip()
            # max_length also serves as a character cap on the returned text
            return assistant_response[:max_length]
        except Exception as e:
            logging.error(f"Error generating response: {e}")
            return "I found the venues you requested, but had trouble generating a conversational response."