#!/usr/bin/env python3
"""
Simple script to test Sanskrit text transcription on a single image
No UI, no Gradio - just command line testing
"""

import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from peft import PeftModel
import os
import logging
import argparse

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_model():
    """Load the model and processor"""
    model_path = 'diabolic6045/qwen2-5-vl-sanskrit-ocr'
    
    logger.info("Loading processor...")
    processor = AutoProcessor.from_pretrained(model_path)
    
    logger.info("Loading merged model...")
    # Check if CUDA is available, otherwise use CPU
    device_map = "auto" if torch.cuda.is_available() else "cpu"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map=device_map
    )
    
    model.eval()
    device = next(model.parameters()).device
    logger.info(f"Merged model loaded on device: {device}")
    
    return model, processor

def transcribe_image(model, processor, image_path, prompt=None):
    """Transcribe Sanskrit text from image"""
    if prompt is None:
        prompt = "Please transcribe the Sanskrit text shown in this image:"
    
    try:
        # Load image
        image = Image.open(image_path)
        logger.info(f"Loaded image: {image_path}")
        logger.info(f"Image size: {image.size}")
        
        # Format the conversation using chat template
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt}
                ]
            }
        ]
        
        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        
        # Get model device and move inputs there
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in inputs.items()}
        
        logger.info("Generating transcription...")
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                repetition_penalty=1.1
            )
        
        # Extract only the generated part
        generated_ids_trimmed = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        
        return output_text[0] if output_text else ""
        
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error: {str(e)}"

def main():
    """Main function"""
    parser = argparse.ArgumentParser(description="Test Sanskrit text transcription on a single image")
    parser.add_argument("image_path", help="Path to the image file")
    parser.add_argument("--prompt", "-p", help="Custom prompt for transcription", 
                       default="Please transcribe the Sanskrit text shown in this image:")
    parser.add_argument("--output", "-o", help="Output file to save transcription (optional)")
    
    args = parser.parse_args()
    
    # Check if image file exists
    if not os.path.exists(args.image_path):
        logger.error(f"Image file not found: {args.image_path}")
        return
    
    try:
        # Load model
        logger.info("Loading model...")
        model, processor = load_model()
        
        # Transcribe image
        logger.info(f"Transcribing image: {args.image_path}")
        result = transcribe_image(model, processor, args.image_path, args.prompt)
        
        # Print result
        print("\n" + "="*50)
        print("TRANSCRIPTION RESULT:")
        print("="*50)
        print(result)
        print("="*50)
        
        # Save to file if requested
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
            logger.info(f"Transcription saved to: {args.output}")
        
    except Exception as e:
        logger.error(f"Error: {e}")
        return

if __name__ == "__main__":
    main()