#!/usr/bin/env python3
"""
Simple script to test Sanskrit text transcription on a single image.
No UI, no Gradio - just command-line testing.
"""

import argparse
import logging
import os

import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_model():
    """Load the merged model and its processor."""
    model_path = 'diabolic6045/qwen2-5-vl-sanskrit-ocr'

    logger.info("Loading processor...")
    processor = AutoProcessor.from_pretrained(model_path)

    logger.info("Loading merged model...")
    # Use GPU with bfloat16 if CUDA is available, otherwise fall back to CPU/float32
    device_map = "auto" if torch.cuda.is_available() else "cpu"
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        device_map=device_map,
    )
    model.eval()

    device = next(model.parameters()).device
    logger.info(f"Merged model loaded on device: {device}")
    return model, processor


def transcribe_image(model, processor, image_path, prompt=None):
    """Transcribe Sanskrit text from an image."""
    if prompt is None:
        prompt = "Please transcribe the Sanskrit text shown in this image:"

    try:
        # Load image; convert to RGB to guard against grayscale/RGBA inputs
        image = Image.open(image_path).convert("RGB")
        logger.info(f"Loaded image: {image_path}")
        logger.info(f"Image size: {image.size}")

        # Format the conversation using the chat template
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Prepare inputs for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        # Move all input tensors to the model's device
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in inputs.items()}

        logger.info("Generating transcription...")
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
                use_cache=True,
                repetition_penalty=1.1,
            )

        # Strip the prompt tokens, keeping only the newly generated part
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        return output_text[0] if output_text else ""

    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error: {str(e)}"


def main():
    """Parse arguments, run transcription, and print/save the result."""
    parser = argparse.ArgumentParser(
        description="Test Sanskrit text transcription on a single image"
    )
    parser.add_argument("image_path", help="Path to the image file")
    parser.add_argument(
        "--prompt", "-p",
        help="Custom prompt for transcription",
        default="Please transcribe the Sanskrit text shown in this image:",
    )
    parser.add_argument(
        "--output", "-o",
        help="Output file to save transcription (optional)",
    )

    args = parser.parse_args()

    # Check that the image file exists before loading the model
    if not os.path.exists(args.image_path):
        logger.error(f"Image file not found: {args.image_path}")
        return

    try:
        # Load model
        logger.info("Loading model...")
        model, processor = load_model()

        # Transcribe image
        logger.info(f"Transcribing image: {args.image_path}")
        result = transcribe_image(model, processor, args.image_path, args.prompt)

        # Print result
        print("\n" + "=" * 50)
print("TRANSCRIPTION RESULT:") print("="*50) print(result) print("="*50) # Save to file if requested if args.output: with open(args.output, 'w', encoding='utf-8') as f: f.write(result) logger.info(f"Transcription saved to: {args.output}") except Exception as e: logger.error(f"Error: {e}") return if __name__ == "__main__": main()