TianheWu
/

VisualQuality-R1-7B-preview

@@ -15,582 +15,3 @@ tags:
 The latest version can be found in [VisualQuality-R1-7B](https://huggingface.co/TianheWu/VisualQuality-R1-7B)
-# VisualQuality-R1-7B-preview
-This is a demo version of VisualQuality-R1 which is trained on the combination of KADID-10K, TID2013, and KONIQ-10K. The base model of VisualQuality-R1 is Qwen2.5-VL-7B-Instruct.
-Paper link: [arXiv](https://arxiv.org/abs/2505.14460)
-## ⚡Quick Start
-### Non-Thinking Inference
-When you execute inference with VisualQuality-R1 as a reward/evaluation model, you can only use **non-thinking** mode to reduce inference time, generating only a single output token with the following prompt:
-```
-PROMPT = (
-    "You are doing the image quality assessment task. Here is the question: "
-    "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-    "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
-)
-QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
-```
-For single image quality rating, the code is:
-<details>
-<summary>Example Code (VisualQuality-R1: Image Quality Rating with non-thinking mode)</summary>
-```python
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import torch
-import random
-import re
-import os
-def score_image(image_path, model, processor):
-    PROMPT = (
-        "You are doing the image quality assessment task. Here is the question: "
-        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality. "
-        "First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
-    )
-    QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
-    message = [
-        {
-            "role": "user",
-            "content": [
-                {'type': 'image', 'image': image_path},
-                {"type": "text", "text": PROMPT}
-            ],
-        }
-    ]
-    batch_messages = [message]
-    # Preparation for inference
-    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
-    image_inputs, video_inputs = process_vision_info(batch_messages)
-    inputs = processor(
-        text=text,
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to(device)
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    batch_output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    reasoning = None
-    try:
-        model_output_matches = re.findall(r'<answer>(.*?)</answer>', batch_output_text[0], re.DOTALL)
-        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()
-        score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
-    except:
-        print(f"================= Meet error with {img_path}, please generate again. =================")
-        score = random.randint(1, 5)
-    return reasoning, score
-random.seed(1)
-MODEL_PATH = ""
-device = torch.device("cuda:5") if torch.cuda.is_available() else torch.device("cpu")
-image_path = ""
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map=device,
-)
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-processor.tokenizer.padding_side = "left"
-reasoning, score = score_image(
-    image_path, model, processor
-)
-print(score)
-```
-</details>
-<details>
-<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with non-thinking mode)</summary>
-```python
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-from tqdm import tqdm
-import torch
-import random
-import re
-import os
-def get_image_paths(folder_path):
-    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
-    image_paths = []
-    for root, dirs, files in os.walk(folder_path):
-        for file in files:
-            _, ext = os.path.splitext(file)
-            if ext.lower() in image_extensions:
-                image_paths.append(os.path.join(root, file))
-    return image_paths
-def score_batch_image(image_paths, model, processor):
-    PROMPT = (
-        "You are doing the image quality assessment task. Here is the question: "
-        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
-    )
-    QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
-    messages = []
-    for img_path in image_paths:
-        message = [
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'image', 'image': img_path},
-                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
-                ],
-            }
-        ]
-        messages.append(message)
-    BSZ = 32
-    all_outputs = []  # List to store all answers
-    for i in tqdm(range(0, len(messages), BSZ)):
-        batch_messages = messages[i:i + BSZ]
-        # Preparation for inference
-        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
-        image_inputs, video_inputs = process_vision_info(batch_messages)
-        inputs = processor(
-            text=text,
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(device)
-        # Inference: Generation of the output
-        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        batch_output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        all_outputs.extend(batch_output_text)
-    path_score_dict = {}
-    for img_path, model_output in zip(image_paths, all_outputs):
-        try:
-            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
-            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
-            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
-        except:
-            print(f"Meet error with {img_path}, please generate again.")
-            score = random.randint(1, 5)
-        path_score_dict[img_path] = score
-    return path_score_dict
-random.seed(1)
-MODEL_PATH = ""
-device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map=device,
-)
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-processor.tokenizer.padding_side = "left"
-image_root = ""
-image_paths = get_image_paths(image_root) # It should be a list
-path_score_dict = score_batch_image(
-    image_paths, model, processor
-)
-file_name = "output.txt"
-with open(file_name, "w") as file:
-    for key, value in path_score_dict.items():
-        file.write(f"{key} {value}\n")
-print("Done!")
-```
-</details>
-### Thinking mode for inference
-<details>
-<summary>Example Code (VisualQuality-R1: Single Image Quality Rating with thinking)</summary>
-```python
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-import torch
-import random
-import re
-import os
-def score_image(image_path, model, processor):
-    PROMPT = (
-        "You are doing the image quality assessment task. Here is the question: "
-        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality. "
-        "First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
-    )
-    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
-    # QUESTION_TEMPLATE = "Please describe the quality of this image."
-    message = [
-        {
-            "role": "user",
-            "content": [
-                {'type': 'image', 'image': image_path},
-                {"type": "text", "text": PROMPT}
-            ],
-        }
-    ]
-    batch_messages = [message]
-    # Preparation for inference
-    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
-    image_inputs, video_inputs = process_vision_info(batch_messages)
-    inputs = processor(
-        text=text,
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    )
-    inputs = inputs.to(device)
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    batch_output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-    reasoning = re.findall(r'<think>(.*?)</think>', batch_output_text[0], re.DOTALL)
-    reasoning = reasoning[-1].strip()
-    try:
-        model_output_matches = re.findall(r'<answer>(.*?)</answer>', batch_output_text[0], re.DOTALL)
-        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()
-        score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
-    except:
-        print(f"================= Meet error with {img_path}, please generate again. =================")
-        score = random.randint(1, 5)
-    return reasoning, score
-random.seed(1)
-MODEL_PATH = ""
-device = torch.device("cuda:5") if torch.cuda.is_available() else torch.device("cpu")
-image_path = ""
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map=device,
-)
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-processor.tokenizer.padding_side = "left"
-reasoning, score = score_image(
-    image_path, model, processor
-)
-print(reasoning)
-print(score)
-```
-</details>
-<details>
-<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with thinking)</summary>
-```python
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
-from qwen_vl_utils import process_vision_info
-from tqdm import tqdm
-import torch
-import random
-import re
-import os
-def get_image_paths(folder_path):
-    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
-    image_paths = []
-    for root, dirs, files in os.walk(folder_path):
-        for file in files:
-            _, ext = os.path.splitext(file)
-            if ext.lower() in image_extensions:
-                image_paths.append(os.path.join(root, file))
-    return image_paths
-def score_batch_image(image_paths, model, processor):
-    PROMPT = (
-        "You are doing the image quality assessment task. Here is the question: "
-        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
-    )
-    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
-    messages = []
-    for img_path in image_paths:
-        message = [
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'image', 'image': img_path},
-                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
-                ],
-            }
-        ]
-        messages.append(message)
-    BSZ = 32
-    all_outputs = []  # List to store all answers
-    for i in tqdm(range(0, len(messages), BSZ)):
-        batch_messages = messages[i:i + BSZ]
-        # Preparation for inference
-        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
-        image_inputs, video_inputs = process_vision_info(batch_messages)
-        inputs = processor(
-            text=text,
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(device)
-        # Inference: Generation of the output
-        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        batch_output_text = processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-        all_outputs.extend(batch_output_text)
-    path_score_dict = {}
-    for img_path, model_output in zip(image_paths, all_outputs):
-        reasoning = re.findall(r'<think>(.*?)</think>', model_output, re.DOTALL)
-        reasoning = reasoning[-1].strip()
-        try:
-            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
-            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
-            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
-        except:
-            print(f"Meet error with {img_path}, please generate again.")
-            score = random.randint(1, 5)
-        path_score_dict[img_path] = score
-    return path_score_dict
-random.seed(1)
-MODEL_PATH = ""
-device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    attn_implementation="flash_attention_2",
-    device_map=device,
-)
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-processor.tokenizer.padding_side = "left"
-image_root = ""
-image_paths = get_image_paths(image_root) # It should be a list
-path_score_dict = score_batch_image(
-    image_paths, model, processor
-)
-file_name = "output.txt"
-with open(file_name, "w") as file:
-    for key, value in path_score_dict.items():
-        file.write(f"{key} {value}\n")
-print("Done!")
-```
-</details>
-## 🚀 Updated: VisualQuality-R1 high efficiency inference script with vLLM
-<details>
-<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with thinking, using vLLM)</summary>
-```python
-# Please install vLLM first: https://docs.vllm.ai/en/stable/getting_started/installation/gpu.html
-from transformers import Qwen2_5_VLProcessor, AutoProcessor
-from vllm import LLM, RequestOutput, SamplingParams
-from qwen_vl_utils import process_vision_info
-import torch
-import random
-import re
-import os
-IMAGE_PATH = "./images"
-MODEL_PATH = "TianheWu/VisualQuality-R1-7B"
-def get_image_paths(folder_path):
-    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
-    image_paths = []
-    for root, dirs, files in os.walk(folder_path):
-        for file in files:
-            _, ext = os.path.splitext(file)
-            if ext.lower() in image_extensions:
-                image_paths.append(os.path.join(root, file))
-    return image_paths
-def score_batch_image(image_paths, model: LLM, processor: Qwen2_5_VLProcessor):
-    PROMPT = (
-        "You are doing the image quality assessment task. Here is the question: "
-        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
-        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
-    )
-    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
-    messages = []
-    for img_path in image_paths:
-        message = [
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'image', 'image': img_path},
-                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
-                ],
-            }
-        ]
-        messages.append(message)
-    all_outputs = []  # List to store all answers
-    # Preparation for inference
-    print("preprocessing ...")
-    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in messages]
-    image_inputs, video_inputs = process_vision_info(messages)
-    inputs = [{
-        "prompt": texts[i],
-        "multi_modal_data": {
-            "image": image_inputs[i]
-        },
-    } for i in range(len(messages))]
-    output: list[RequestOutput] = model.generate(
-        inputs,
-        sampling_params=SamplingParams(
-            max_tokens=512,
-            temperature=0.1,
-            top_k=50,
-            top_p=1.0,
-            stop_token_ids=[processor.tokenizer.eos_token_id],
-        ),
-    )
-    batch_output_text = [o.outputs[0].text for o in output]
-    all_outputs.extend(batch_output_text)
-    path_score_dict = {}
-    for img_path, model_output in zip(image_paths, all_outputs):
-        print(f"{model_output = }")
-        try:
-            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
-            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
-            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
-        except:
-            print(f"Meet error with {img_path}, please generate again.")
-            score = random.randint(1, 5)
-        path_score_dict[img_path] = score
-    return path_score_dict
-random.seed(1)
-model = LLM(
-    model=MODEL_PATH,
-    tensor_parallel_size=1,
-    trust_remote_code=True,
-    seed=1,
-)
-processor = AutoProcessor.from_pretrained(MODEL_PATH)
-processor.tokenizer.padding_side = "left"
-image_paths = get_image_paths(IMAGE_PATH) # It should be a list
-path_score_dict = score_batch_image(
-    image_paths, model, processor
-)
-file_name = "output.txt"
-with open(file_name, "w") as file:
-    for key, value in path_score_dict.items():
-        file.write(f"{key} {value}\n")
-print("Done!")
-```
-</details>


15
16	The latest version can be found in [VisualQuality-R1-7B](https://huggingface.co/TianheWu/VisualQuality-R1-7B)
17