happyme531 committed on
Commit eb10636 · verified · 1 Parent(s): 6b5b4b0

Upload 6 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ librkllmrt.so filter=lfs diff=lfs merge=lfs -text
+ Qwen3-Reranker-0.6B_f16.rkllm filter=lfs diff=lfs merge=lfs -text
+ Qwen3-Reranker-0.6B_w8a8.rkllm filter=lfs diff=lfs merge=lfs -text
Qwen3-Reranker-0.6B_f16.rkllm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5577675b87b5d19b1fbc20360b9caa174fa9d64d08b6cbe564d74456216f117
+ size 1524801182
Qwen3-Reranker-0.6B_w8a8.rkllm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:820c0b6b2419d3799a612751d9d8b86ed0e0e852364a6de78972a3976c34eaa8
+ size 931372078
librkllmrt.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f6a9c2de93cf94bb524eb071c27190ad4c83401e01b562534f265dff4cb40da2
+ size 6710712
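
The three files above are committed as Git LFS pointers (spec version, sha256 oid, byte size); the binaries themselves live in the LFS store and are materialized on checkout. A minimal sketch for fetching them programmatically with huggingface_hub — the repo_id placeholder is an assumption, substitute this repository's actual id:

from huggingface_hub import hf_hub_download

for name in ["librkllmrt.so", "Qwen3-Reranker-0.6B_f16.rkllm", "Qwen3-Reranker-0.6B_w8a8.rkllm"]:
    # Resolves the LFS pointer and downloads the real artifact into the local cache
    path = hf_hub_download(repo_id="<namespace>/<repo>", filename=name)
    print(path)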
rkllm-convert.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ from rkllm.api import RKLLM
+
+ def convert_model(model_path, output_name, do_quantization=False):
+     """Convert a single model."""
+     llm = RKLLM()
+
+     print(f"Loading model: {model_path}")
+     ret = llm.load_huggingface(model=model_path, model_lora=None, device='cpu')
+     if ret != 0:
+         print(f'Failed to load model: {model_path}')
+         return ret
+
+     print(f"Building model: {output_name} (quantization: {do_quantization})")
+     qparams = None
+     ret = llm.build(do_quantization=do_quantization, optimization_level=1, quantized_dtype='w8a8',
+                     quantized_algorithm='normal', target_platform='rk3588', num_npu_core=3, extra_qparams=qparams)
+
+     if ret != 0:
+         print(f'Failed to build model: {output_name}')
+         return ret
+
+     # Export the .rkllm model
+     print(f"Exporting model: {output_name}")
+     ret = llm.export_rkllm(output_name)
+     if ret != 0:
+         print(f'Failed to export model: {output_name}')
+         return ret
+
+     print(f"Successfully converted: {output_name}")
+     return 0
+
+ def main():
+     """Main entry point: walk every subdirectory and convert the model inside."""
+     current_dir = '.'
+
+     # Collect all subdirectories
+     subdirs = [d for d in os.listdir(current_dir)
+                if os.path.isdir(os.path.join(current_dir, d)) and not d.startswith('.')]
+
+     print(f"Found {len(subdirs)} model folder(s): {subdirs}")
+
+     for subdir in subdirs:
+         model_path = os.path.join(current_dir, subdir)
+
+         # Derive the output file names
+         base_name = subdir.replace('/', '_').replace('\\', '_')
+         quantized_output = f"{base_name}_w8a8.rkllm"
+         unquantized_output = f"{base_name}_f16.rkllm"
+
+         print(f"\n{'='*50}")
+         print(f"Processing model folder: {subdir}")
+         print(f"{'='*50}")
+
+         # Convert the unquantized version
+         print(f"\n--- Converting unquantized version ---")
+         ret = convert_model(model_path, unquantized_output, do_quantization=False)
+         if ret != 0:
+             print(f"Unquantized conversion failed: {subdir}")
+             continue
+
+         # Convert the quantized version
+         print(f"\n--- Converting quantized version ---")
+         ret = convert_model(model_path, quantized_output, do_quantization=True)
+         if ret != 0:
+             print(f"Quantized conversion failed: {subdir}")
+             continue
+
+         print(f"\n✓ {subdir} model conversion complete!")
+         print(f"  - unquantized version: {unquantized_output}")
+         print(f"  - quantized version: {quantized_output}")
+
+ if __name__ == "__main__":
+     main()
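
For a one-off conversion without the directory scan, the same three calls from convert_model() can be driven directly; a sketch, assuming the Hugging Face checkpoint sits in ./Qwen3-Reranker-0.6B (the folder name is an assumption):

from rkllm.api import RKLLM

llm = RKLLM()
# Load the local Hugging Face checkpoint on the CPU
assert llm.load_huggingface(model="./Qwen3-Reranker-0.6B", model_lora=None, device='cpu') == 0
# Build a w8a8-quantized graph targeting the RK3588's three NPU cores
assert llm.build(do_quantization=True, optimization_level=1, quantized_dtype='w8a8',
                 quantized_algorithm='normal', target_platform='rk3588', num_npu_core=3,
                 extra_qparams=None) == 0
# Write the runnable .rkllm artifact
assert llm.export_rkllm("Qwen3-Reranker-0.6B_w8a8.rkllm") == 0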
rkllm_binding.py ADDED
@@ -0,0 +1,658 @@
+ import ctypes
+ import enum
+ import os
+
+ # Define constants from the header
+ CPU0 = (1 << 0)  # 0x01
+ CPU1 = (1 << 1)  # 0x02
+ CPU2 = (1 << 2)  # 0x04
+ CPU3 = (1 << 3)  # 0x08
+ CPU4 = (1 << 4)  # 0x10
+ CPU5 = (1 << 5)  # 0x20
+ CPU6 = (1 << 6)  # 0x40
+ CPU7 = (1 << 7)  # 0x80
+
+ # --- Enums ---
+ class LLMCallState(enum.IntEnum):
+     RKLLM_RUN_NORMAL = 0
+     RKLLM_RUN_WAITING = 1
+     RKLLM_RUN_FINISH = 2
+     RKLLM_RUN_ERROR = 3
+
+ class RKLLMInputType(enum.IntEnum):
+     RKLLM_INPUT_PROMPT = 0
+     RKLLM_INPUT_TOKEN = 1
+     RKLLM_INPUT_EMBED = 2
+     RKLLM_INPUT_MULTIMODAL = 3
+
+ class RKLLMInferMode(enum.IntEnum):
+     RKLLM_INFER_GENERATE = 0
+     RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
+     RKLLM_INFER_GET_LOGITS = 2
+
+ # --- Structures ---
+ class RKLLMExtendParam(ctypes.Structure):
+     # Base iommu domain ID; setting it to 1 is recommended for models over 1B parameters
+     base_domain_id: ctypes.c_int32
+     # Whether to keep the embedding table in flash storage
+     embed_flash: ctypes.c_int8
+     # Number of CPU cores to enable
+     enabled_cpus_num: ctypes.c_int8
+     # Bitmask of the CPU cores to enable
+     enabled_cpus_mask: ctypes.c_uint32
+     reserved: ctypes.c_uint8 * 106
+
+     _fields_ = [
+         ("base_domain_id", ctypes.c_int32),
+         ("embed_flash", ctypes.c_int8),
+         ("enabled_cpus_num", ctypes.c_int8),
+         ("enabled_cpus_mask", ctypes.c_uint32),
+         ("reserved", ctypes.c_uint8 * 106)
+     ]
+
+ class RKLLMParam(ctypes.Structure):
+     # Path to the model file
+     model_path: ctypes.c_char_p
+     # Maximum number of tokens in the context window
+     max_context_len: ctypes.c_int32
+     # Maximum number of new tokens to generate
+     max_new_tokens: ctypes.c_int32
+     # Top-K sampling parameter
+     top_k: ctypes.c_int32
+     # Number of KV-cache entries kept when the context window slides
+     n_keep: ctypes.c_int32
+     # Top-P sampling parameter
+     top_p: ctypes.c_float
+     # Sampling temperature; controls the randomness of token selection
+     temperature: ctypes.c_float
+     # Penalty for repeated tokens
+     repeat_penalty: ctypes.c_float
+     # Penalty for frequent tokens
+     frequency_penalty: ctypes.c_float
+     # Penalty for tokens already present in the input
+     presence_penalty: ctypes.c_float
+     # Mirostat sampling strategy flag (0 disables it)
+     mirostat: ctypes.c_int32
+     # Mirostat Tau parameter
+     mirostat_tau: ctypes.c_float
+     # Mirostat Eta parameter
+     mirostat_eta: ctypes.c_float
+     # Whether to skip special tokens
+     skip_special_token: ctypes.c_bool
+     # Whether to run inference asynchronously
+     is_async: ctypes.c_bool
+     # Image start token for multimodal input
+     img_start: ctypes.c_char_p
+     # Image end token for multimodal input
+     img_end: ctypes.c_char_p
+     # Pointer to the image content
+     img_content: ctypes.c_char_p
+     # Extended parameters
+     extend_param: RKLLMExtendParam
+
+     _fields_ = [
+         ("model_path", ctypes.c_char_p),        # path to the model file
+         ("max_context_len", ctypes.c_int32),    # maximum number of tokens in the context window
+         ("max_new_tokens", ctypes.c_int32),     # maximum number of new tokens to generate
+         ("top_k", ctypes.c_int32),              # Top-K sampling parameter
+         ("n_keep", ctypes.c_int32),             # KV-cache entries kept when the context window slides
+         ("top_p", ctypes.c_float),              # Top-P (nucleus) sampling parameter
+         ("temperature", ctypes.c_float),        # sampling temperature
+         ("repeat_penalty", ctypes.c_float),     # penalty for repeated tokens
+         ("frequency_penalty", ctypes.c_float),  # penalty for frequent tokens
+         ("presence_penalty", ctypes.c_float),   # penalty for tokens already present in the input
+         ("mirostat", ctypes.c_int32),           # Mirostat sampling strategy flag (0 disables it)
+         ("mirostat_tau", ctypes.c_float),       # Mirostat Tau parameter
+         ("mirostat_eta", ctypes.c_float),       # Mirostat Eta parameter
+         ("skip_special_token", ctypes.c_bool),  # whether to skip special tokens
+         ("is_async", ctypes.c_bool),            # whether to run inference asynchronously
+         ("img_start", ctypes.c_char_p),         # image start token for multimodal input
+         ("img_end", ctypes.c_char_p),           # image end token for multimodal input
+         ("img_content", ctypes.c_char_p),       # pointer to the image content
+         ("extend_param", RKLLMExtendParam)      # extended parameters
+     ]
+
+ class RKLLMLoraAdapter(ctypes.Structure):
+     lora_adapter_path: ctypes.c_char_p
+     lora_adapter_name: ctypes.c_char_p
+     scale: ctypes.c_float
+
+     _fields_ = [
+         ("lora_adapter_path", ctypes.c_char_p),
+         ("lora_adapter_name", ctypes.c_char_p),
+         ("scale", ctypes.c_float)
+     ]
+
+ class RKLLMEmbedInput(ctypes.Structure):
+     # Shape: [n_tokens, embed_size]
+     embed: ctypes.POINTER(ctypes.c_float)
+     n_tokens: ctypes.c_size_t
+
+     _fields_ = [
+         ("embed", ctypes.POINTER(ctypes.c_float)),
+         ("n_tokens", ctypes.c_size_t)
+     ]
+
+ class RKLLMTokenInput(ctypes.Structure):
+     # Shape: [n_tokens]
+     input_ids: ctypes.POINTER(ctypes.c_int32)
+     n_tokens: ctypes.c_size_t
+
+     _fields_ = [
+         ("input_ids", ctypes.POINTER(ctypes.c_int32)),
+         ("n_tokens", ctypes.c_size_t)
+     ]
+
+ class RKLLMMultiModelInput(ctypes.Structure):
+     prompt: ctypes.c_char_p
+     image_embed: ctypes.POINTER(ctypes.c_float)
+     n_image_tokens: ctypes.c_size_t
+     n_image: ctypes.c_size_t
+     image_width: ctypes.c_size_t
+     image_height: ctypes.c_size_t
+
+     _fields_ = [
+         ("prompt", ctypes.c_char_p),
+         ("image_embed", ctypes.POINTER(ctypes.c_float)),
+         ("n_image_tokens", ctypes.c_size_t),
+         ("n_image", ctypes.c_size_t),
+         ("image_width", ctypes.c_size_t),
+         ("image_height", ctypes.c_size_t)
+     ]
+
+ class _RKLLMInputUnion(ctypes.Union):
+     prompt_input: ctypes.c_char_p
+     embed_input: RKLLMEmbedInput
+     token_input: RKLLMTokenInput
+     multimodal_input: RKLLMMultiModelInput
+
+     _fields_ = [
+         ("prompt_input", ctypes.c_char_p),
+         ("embed_input", RKLLMEmbedInput),
+         ("token_input", RKLLMTokenInput),
+         ("multimodal_input", RKLLMMultiModelInput)
+     ]
+
+ class RKLLMInput(ctypes.Structure):
+     input_type: ctypes.c_int
+     _union_data: _RKLLMInputUnion
+
+     _fields_ = [
+         ("input_type", ctypes.c_int),  # Enum will be passed as int, changed RKLLMInputType to ctypes.c_int
+         ("_union_data", _RKLLMInputUnion)
+     ]
+
+     # Properties to make accessing union members easier
+     @property
+     def prompt_input(self) -> bytes:  # Assuming c_char_p maps to bytes
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
+             return self._union_data.prompt_input
+         raise AttributeError("Not a prompt input")
+
+     @prompt_input.setter
+     def prompt_input(self, value: bytes):  # Assuming c_char_p maps to bytes
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
+             self._union_data.prompt_input = value
+         else:
+             raise AttributeError("Not a prompt input")
+
+     @property
+     def embed_input(self) -> RKLLMEmbedInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
+             return self._union_data.embed_input
+         raise AttributeError("Not an embed input")
+
+     @embed_input.setter
+     def embed_input(self, value: RKLLMEmbedInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
+             self._union_data.embed_input = value
+         else:
+             raise AttributeError("Not an embed input")
+
+     @property
+     def token_input(self) -> RKLLMTokenInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
+             return self._union_data.token_input
+         raise AttributeError("Not a token input")
+
+     @token_input.setter
+     def token_input(self, value: RKLLMTokenInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
+             self._union_data.token_input = value
+         else:
+             raise AttributeError("Not a token input")
+
+     @property
+     def multimodal_input(self) -> RKLLMMultiModelInput:
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
+             return self._union_data.multimodal_input
+         raise AttributeError("Not a multimodal input")
+
+     @multimodal_input.setter
+     def multimodal_input(self, value: RKLLMMultiModelInput):
+         if self.input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
+             self._union_data.multimodal_input = value
+         else:
+             raise AttributeError("Not a multimodal input")
+
+ class RKLLMLoraParam(ctypes.Structure):  # For inference
+     lora_adapter_name: ctypes.c_char_p
+
+     _fields_ = [
+         ("lora_adapter_name", ctypes.c_char_p)
+     ]
+
+ class RKLLMPromptCacheParam(ctypes.Structure):  # For inference
+     save_prompt_cache: ctypes.c_int  # bool-like
+     prompt_cache_path: ctypes.c_char_p
+
+     _fields_ = [
+         ("save_prompt_cache", ctypes.c_int),  # bool-like
+         ("prompt_cache_path", ctypes.c_char_p)
+     ]
+
+ class RKLLMInferParam(ctypes.Structure):
+     mode: ctypes.c_int
+     lora_params: ctypes.POINTER(RKLLMLoraParam)
+     prompt_cache_params: ctypes.POINTER(RKLLMPromptCacheParam)
+     keep_history: ctypes.c_int  # bool-like
+
+     _fields_ = [
+         ("mode", ctypes.c_int),  # Enum will be passed as int, changed RKLLMInferMode to ctypes.c_int
+         ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
+         ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
+         ("keep_history", ctypes.c_int)  # bool-like
+     ]
+
+ class RKLLMResultLastHiddenLayer(ctypes.Structure):
+     # Shape: [num_tokens, embd_size]
+     hidden_states: ctypes.POINTER(ctypes.c_float)
+     # Hidden layer size
+     embd_size: ctypes.c_int
+     # Number of output tokens
+     num_tokens: ctypes.c_int
+
+     _fields_ = [
+         ("hidden_states", ctypes.POINTER(ctypes.c_float)),
+         ("embd_size", ctypes.c_int),
+         ("num_tokens", ctypes.c_int)
+     ]
+
+ class RKLLMResultLogits(ctypes.Structure):
+     # Shape: [num_tokens, vocab_size]
+     logits: ctypes.POINTER(ctypes.c_float)
+     # Vocabulary size
+     vocab_size: ctypes.c_int
+     # Number of output tokens
+     num_tokens: ctypes.c_int
+
+     _fields_ = [
+         ("logits", ctypes.POINTER(ctypes.c_float)),
+         ("vocab_size", ctypes.c_int),
+         ("num_tokens", ctypes.c_int)
+     ]
+
+ class RKLLMResult(ctypes.Structure):
+     text: ctypes.c_char_p
+     token_id: ctypes.c_int32
+     last_hidden_layer: RKLLMResultLastHiddenLayer
+     logits: RKLLMResultLogits
+
+     _fields_ = [
+         ("text", ctypes.c_char_p),
+         ("token_id", ctypes.c_int32),
+         ("last_hidden_layer", RKLLMResultLastHiddenLayer),
+         ("logits", RKLLMResultLogits)
+     ]
+
+ # --- Typedefs ---
+ LLMHandle = ctypes.c_void_p
+
+ # --- Callback Function Type ---
+ LLMResultCallback = ctypes.CFUNCTYPE(
+     None,                         # return type: void
+     ctypes.POINTER(RKLLMResult),
+     ctypes.c_void_p,              # userdata
+     ctypes.c_int                  # enum, will be passed as int. Changed LLMCallState to ctypes.c_int
+ )
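+ # A Python callback registered through this type receives three arguments:
+ #     def cb(result_ptr, userdata, state): ...
+ # where result_ptr is a ctypes.POINTER(RKLLMResult), userdata is the raw
+ # c_void_p passed to rkllm_run, and state can be wrapped back into the
+ # enum with LLMCallState(state).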
+
+
+ class RKLLMRuntime:
+     def __init__(self, library_path="./librkllmrt.so"):
+         try:
+             self.lib = ctypes.CDLL(library_path)
+         except OSError as e:
+             raise OSError(f"Failed to load RKLLM library from {library_path}. "
+                           f"Ensure it's in your LD_LIBRARY_PATH or provide the full path. Error: {e}")
+         self._setup_functions()
+         self.llm_handle = LLMHandle()
+         self._c_callback = None  # To keep the callback object alive
+
+     def _setup_functions(self):
+         # RKLLMParam rkllm_createDefaultParam();
+         self.lib.rkllm_createDefaultParam.restype = RKLLMParam
+         self.lib.rkllm_createDefaultParam.argtypes = []
+
+         # int rkllm_init(LLMHandle* handle, RKLLMParam* param, LLMResultCallback callback);
+         self.lib.rkllm_init.restype = ctypes.c_int
+         self.lib.rkllm_init.argtypes = [
+             ctypes.POINTER(LLMHandle),
+             ctypes.POINTER(RKLLMParam),
+             LLMResultCallback
+         ]
+
+         # int rkllm_load_lora(LLMHandle handle, RKLLMLoraAdapter* lora_adapter);
+         self.lib.rkllm_load_lora.restype = ctypes.c_int
+         self.lib.rkllm_load_lora.argtypes = [LLMHandle, ctypes.POINTER(RKLLMLoraAdapter)]
+
+         # int rkllm_load_prompt_cache(LLMHandle handle, const char* prompt_cache_path);
+         self.lib.rkllm_load_prompt_cache.restype = ctypes.c_int
+         self.lib.rkllm_load_prompt_cache.argtypes = [LLMHandle, ctypes.c_char_p]
+
+         # int rkllm_release_prompt_cache(LLMHandle handle);
+         self.lib.rkllm_release_prompt_cache.restype = ctypes.c_int
+         self.lib.rkllm_release_prompt_cache.argtypes = [LLMHandle]
+
+         # int rkllm_destroy(LLMHandle handle);
+         self.lib.rkllm_destroy.restype = ctypes.c_int
+         self.lib.rkllm_destroy.argtypes = [LLMHandle]
+
+         # int rkllm_run(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
+         self.lib.rkllm_run.restype = ctypes.c_int
+         self.lib.rkllm_run.argtypes = [
+             LLMHandle,
+             ctypes.POINTER(RKLLMInput),
+             ctypes.POINTER(RKLLMInferParam),
+             ctypes.c_void_p  # userdata
+         ]
+
+         # int rkllm_run_async(LLMHandle handle, RKLLMInput* rkllm_input, RKLLMInferParam* rkllm_infer_params, void* userdata);
+         # Assuming async also takes userdata for the callback context
+         self.lib.rkllm_run_async.restype = ctypes.c_int
+         self.lib.rkllm_run_async.argtypes = [
+             LLMHandle,
+             ctypes.POINTER(RKLLMInput),
+             ctypes.POINTER(RKLLMInferParam),
+             ctypes.c_void_p  # userdata
+         ]
+
+         # int rkllm_abort(LLMHandle handle);
+         self.lib.rkllm_abort.restype = ctypes.c_int
+         self.lib.rkllm_abort.argtypes = [LLMHandle]
+
+         # int rkllm_is_running(LLMHandle handle);
+         self.lib.rkllm_is_running.restype = ctypes.c_int  # 0 if running, non-zero otherwise
+         self.lib.rkllm_is_running.argtypes = [LLMHandle]
+
+         # int rkllm_clear_kv_cache(LLMHandle handle, int keep_system_prompt);
+         self.lib.rkllm_clear_kv_cache.restype = ctypes.c_int
+         self.lib.rkllm_clear_kv_cache.argtypes = [LLMHandle, ctypes.c_int]
+
+         # int rkllm_set_chat_template(LLMHandle handle, const char* system_prompt, const char* prompt_prefix, const char* prompt_postfix);
+         self.lib.rkllm_set_chat_template.restype = ctypes.c_int
+         self.lib.rkllm_set_chat_template.argtypes = [
+             LLMHandle,
+             ctypes.c_char_p,
+             ctypes.c_char_p,
+             ctypes.c_char_p
+         ]
+
+     def create_default_param(self) -> RKLLMParam:
+         """Creates a default RKLLMParam structure."""
+         return self.lib.rkllm_createDefaultParam()
+
+     def init(self, param: RKLLMParam, callback_func) -> int:
+         """
+         Initializes the LLM.
+         :param param: RKLLMParam structure.
+         :param callback_func: A Python function that matches the signature:
+             def my_callback(result_ptr, userdata_ptr, state_enum):
+                 result = result_ptr.contents  # RKLLMResult
+                 # Process result
+                 # userdata can be retrieved if passed during run, or ignored
+                 # state = LLMCallState(state_enum)
+         :return: 0 for success, non-zero for failure.
+         """
+         if not callable(callback_func):
+             raise ValueError("callback_func must be a callable Python function.")
+
+         # Keep a reference to the ctypes callback object to prevent it from being garbage collected
+         self._c_callback = LLMResultCallback(callback_func)
+
+         ret = self.lib.rkllm_init(ctypes.byref(self.llm_handle), ctypes.byref(param), self._c_callback)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_init failed with error code {ret}")
+         return ret
+
+     def load_lora(self, lora_adapter: RKLLMLoraAdapter) -> int:
+         """Loads a LoRA adapter."""
+         ret = self.lib.rkllm_load_lora(self.llm_handle, ctypes.byref(lora_adapter))
+         if ret != 0:
+             raise RuntimeError(f"rkllm_load_lora failed with error code {ret}")
+         return ret
+
+     def load_prompt_cache(self, prompt_cache_path: str) -> int:
+         """Loads a prompt cache from a file."""
+         c_path = prompt_cache_path.encode('utf-8')
+         ret = self.lib.rkllm_load_prompt_cache(self.llm_handle, c_path)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_load_prompt_cache failed for {prompt_cache_path} with error code {ret}")
+         return ret
+
+     def release_prompt_cache(self) -> int:
+         """Releases the prompt cache from memory."""
+         ret = self.lib.rkllm_release_prompt_cache(self.llm_handle)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_release_prompt_cache failed with error code {ret}")
+         return ret
+
+     def destroy(self) -> int:
+         """Destroys the LLM instance and releases resources."""
+         if self.llm_handle and self.llm_handle.value:  # Check if handle is not NULL
+             ret = self.lib.rkllm_destroy(self.llm_handle)
+             self.llm_handle = LLMHandle()  # Reset handle
+             if ret != 0:
+                 # Don't raise here as it might be called in __del__
+                 print(f"Warning: rkllm_destroy failed with error code {ret}")
+             return ret
+         return 0  # Already destroyed or not initialized
+
+     def run(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
+         """Runs an LLM inference task synchronously."""
+         # userdata can be a ctypes.py_object if you want to pass Python objects,
+         # then cast to c_void_p. Or simply None.
+         if userdata is not None:
+             # Store the userdata object to keep it alive during the call
+             self._userdata_ref = userdata
+             c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
+         else:
+             c_userdata = None
+         ret = self.lib.rkllm_run(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_run failed with error code {ret}")
+         return ret
+
+     def run_async(self, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata=None) -> int:
+         """Runs an LLM inference task asynchronously."""
+         if userdata is not None:
+             # Store the userdata object to keep it alive during the call
+             self._userdata_ref = userdata
+             c_userdata = ctypes.cast(ctypes.pointer(ctypes.py_object(userdata)), ctypes.c_void_p)
+         else:
+             c_userdata = None
+         ret = self.lib.rkllm_run_async(self.llm_handle, ctypes.byref(rkllm_input), ctypes.byref(rkllm_infer_params), c_userdata)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_run_async failed with error code {ret}")
+         return ret
+
+     def abort(self) -> int:
+         """Aborts an ongoing LLM task."""
+         ret = self.lib.rkllm_abort(self.llm_handle)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_abort failed with error code {ret}")
+         return ret
+
+     def is_running(self) -> bool:
+         """Checks if an LLM task is currently running. Returns True if running."""
+         # The C API returns 0 if running, non-zero otherwise.
+         # This is a bit counter-intuitive for a boolean "is_running".
+         return self.lib.rkllm_is_running(self.llm_handle) == 0
+
+     def clear_kv_cache(self, keep_system_prompt: bool) -> int:
+         """Clears the key-value cache."""
+         ret = self.lib.rkllm_clear_kv_cache(self.llm_handle, ctypes.c_int(1 if keep_system_prompt else 0))
+         if ret != 0:
+             raise RuntimeError(f"rkllm_clear_kv_cache failed with error code {ret}")
+         return ret
+
+     def set_chat_template(self, system_prompt: str, prompt_prefix: str, prompt_postfix: str) -> int:
+         """Sets the chat template for the LLM."""
+         c_system = system_prompt.encode('utf-8') if system_prompt else b""
+         c_prefix = prompt_prefix.encode('utf-8') if prompt_prefix else b""
+         c_postfix = prompt_postfix.encode('utf-8') if prompt_postfix else b""
+
+         ret = self.lib.rkllm_set_chat_template(self.llm_handle, c_system, c_prefix, c_postfix)
+         if ret != 0:
+             raise RuntimeError(f"rkllm_set_chat_template failed with error code {ret}")
+         return ret
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.destroy()
+
+     def __del__(self):
+         self.destroy()  # Ensure resources are freed if object is garbage collected
+
+ # --- Example Usage (Illustrative) ---
+ if __name__ == "__main__":
+     # This is a placeholder for how you might use it.
+     # You'll need a valid .rkllm model and librkllmrt.so in your path.
+
+     # Global list to store results from callback for demonstration
+     results_buffer = []
+
+     def my_python_callback(result_ptr, userdata_ptr, state_enum):
+         """
+         Callback function to be called by the C library.
+         """
+         global results_buffer
+         state = LLMCallState(state_enum)
+         result = result_ptr.contents
+
+         current_text = ""
+         if result.text:  # Check if the char_p is not NULL
+             current_text = result.text.decode('utf-8', errors='ignore')
+
+         print(f"Callback: State={state.name}, TokenID={result.token_id}, Text='{current_text}'")
+         results_buffer.append(current_text)
+
+         if state == LLMCallState.RKLLM_RUN_FINISH:
+             print("Inference finished.")
+         elif state == LLMCallState.RKLLM_RUN_ERROR:
+             print("Inference error.")
+
+         # Example: Accessing logits if available (and if mode was set to get logits)
+         # if result.logits.logits and result.logits.vocab_size > 0:
+         #     print(f"  Logits (first 5 of vocab_size {result.logits.vocab_size}):")
+         #     for i in range(min(5, result.logits.vocab_size)):
+         #         print(f"  {result.logits.logits[i]:.4f}", end=" ")
+         #     print()
+
+     # --- Attempt to use the wrapper ---
+     try:
+         print("Initializing RKLLMRuntime...")
+         # Adjust library_path if librkllmrt.so is not in default search paths
+         # e.g., library_path="./path/to/librkllmrt.so"
+         rk_llm = RKLLMRuntime()
+
+         print("Creating default parameters...")
+         params = rk_llm.create_default_param()
+
+         # --- Configure parameters ---
+         # THIS IS CRITICAL: model_path must point to an actual .rkllm file.
+         # For this example to run, you need a model file.
+         # Let's assume a dummy path for now; this will fail at init if not valid.
+         model_file = "dummy_model.rkllm"
+         if not os.path.exists(model_file):
+             print(f"Warning: Model file '{model_file}' does not exist. Init will likely fail.")
+             # Create a dummy file for the example to proceed further, though init will still fail
+             # with a real library unless it's a valid model.
+             with open(model_file, "w") as f:
+                 f.write("dummy content")
+
+         params.model_path = model_file.encode('utf-8')
+         params.max_context_len = 512
+         params.max_new_tokens = 128
+         params.top_k = 1  # Greedy
+         params.temperature = 0.7
+         params.repeat_penalty = 1.1
+         # ... set other params as needed
+
+         print(f"Initializing LLM with model: {params.model_path.decode()}...")
+         # This will likely fail if dummy_model.rkllm is not a valid model recognized by the library
+         try:
+             rk_llm.init(params, my_python_callback)
+             print("LLM Initialized.")
+         except RuntimeError as e:
+             print(f"Error during LLM initialization: {e}")
+             print("This is expected if 'dummy_model.rkllm' is not a valid model.")
+             print("Replace 'dummy_model.rkllm' with a real model path to test further.")
+             exit()
+
+         # --- Prepare input ---
+         print("Preparing input...")
+         rk_input = RKLLMInput()
+         rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
+
+         prompt_text = "Translate the following English text to French: 'Hello, world!'"
+         c_prompt = prompt_text.encode('utf-8')
+         rk_input._union_data.prompt_input = c_prompt  # Accessing union member directly
+
+         # --- Prepare inference parameters ---
+         print("Preparing inference parameters...")
+         infer_params = RKLLMInferParam()
+         infer_params.mode = RKLLMInferMode.RKLLM_INFER_GENERATE
+         infer_params.keep_history = 1  # True
+         # infer_params.lora_params = None  # or set up RKLLMLoraParam if using LoRA
+         # infer_params.prompt_cache_params = None  # or set up RKLLMPromptCacheParam
+
+         # --- Run inference ---
+         print(f"Running inference with prompt: '{prompt_text}'")
+         results_buffer.clear()
+         try:
+             rk_llm.run(rk_input, infer_params)  # Userdata is None by default
+             print("\n--- Full Response ---")
+             print("".join(results_buffer))
+             print("---------------------\n")
+         except RuntimeError as e:
+             print(f"Error during LLM run: {e}")
+
+         # --- Example: Set chat template (if model supports it) ---
+         # print("Setting chat template...")
+         # try:
+         #     rk_llm.set_chat_template("You are a helpful assistant.", "<user>: ", "<assistant>: ")
+         #     print("Chat template set.")
+         # except RuntimeError as e:
+         #     print(f"Error setting chat template: {e}")
+
+         # --- Example: Clear KV Cache ---
+         # print("Clearing KV cache (keeping system prompt if any)...")
+         # try:
+         #     rk_llm.clear_kv_cache(keep_system_prompt=True)
+         #     print("KV cache cleared.")
+         # except RuntimeError as e:
+         #     print(f"Error clearing KV cache: {e}")
+
+     except OSError as e:
+         print(f"OSError: {e}. Could not load the RKLLM library.")
+         print("Please ensure 'librkllmrt.so' is in your LD_LIBRARY_PATH or provide the full path.")
+     except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+     finally:
+         if 'rk_llm' in locals() and rk_llm.llm_handle and rk_llm.llm_handle.value:
+             print("Destroying LLM instance...")
+             rk_llm.destroy()
+             print("LLM instance destroyed.")
+         if os.path.exists(model_file) and model_file == "dummy_model.rkllm":
+             os.remove(model_file)  # Clean up dummy file
+
+     print("Example finished.")
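
Since RKLLMRuntime defines __enter__/__exit__, the wrapper can also be used as a context manager so rkllm_destroy runs even when an error is raised; a minimal sketch (the model path is a placeholder):

from rkllm_binding import RKLLMRuntime, LLMCallState

def cb(result_ptr, userdata, state):
    # Stream generated text as it arrives
    if LLMCallState(state) == LLMCallState.RKLLM_RUN_NORMAL and result_ptr.contents.text:
        print(result_ptr.contents.text.decode('utf-8', errors='ignore'), end="")

with RKLLMRuntime() as rt:
    params = rt.create_default_param()
    params.model_path = b"path/to/model.rkllm"  # placeholder path
    rt.init(params, cb)
    # ... build RKLLMInput / RKLLMInferParam and call rt.run(...) as in the example above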
test_reranker.py ADDED
@@ -0,0 +1,551 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Qwen3-Reranker inference test code.
+ Runs text-reranking inference through the RKLLM API.
+ """
+
+ import faulthandler
+ faulthandler.enable()
+ import os
+ os.environ["RKLLM_LOG_LEVEL"] = "1"
+ import numpy as np
+ import time
+ import re
+ from typing import List, Dict, Any, Tuple
+ from rkllm_binding import *
+
+
+ class Qwen3RerankerTester:
+     def __init__(self, model_path, library_path="./librkllmrt.so"):
+         """
+         Initialize the Qwen3 reranker model tester.
+
+         Args:
+             model_path: path to the model file (.rkllm format)
+             library_path: path to the RKLLM shared library
+         """
+         self.model_path = model_path
+         self.library_path = library_path
+         self.runtime = None
+         self.current_result = None
+
+         # Prompt format set according to the official README
+         self.system_prompt = "Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\"."
+
+         # Candidate token IDs for "yes" and "no" (need to be confirmed by actual testing).
+         # These are common token IDs and may need adjustment in practice.
+         self.yes_token_candidates = [9693]
+         self.no_token_candidates = [2152]
+
+     def callback_function(self, result_ptr, userdata_ptr, state_enum):
+         """
+         Inference callback.
+
+         Args:
+             result_ptr: pointer to the result
+             userdata_ptr: pointer to the user data
+             state_enum: state enum value
+         """
+         state = LLMCallState(state_enum)
+
+         if state == LLMCallState.RKLLM_RUN_NORMAL:
+             result = result_ptr.contents
+             print(f"result: {result}")
+
+             # Fetch the logits
+             if result.logits.logits and result.logits.vocab_size > 0:
+                 vocab_size = result.logits.vocab_size
+                 num_tokens = result.logits.num_tokens
+
+                 print(f"Got logits: vocab_size={vocab_size}, num_tokens={num_tokens}")
+
+                 # Extract the logits of the last token
+                 if num_tokens > 0:
+                     last_token_logits = []
+                     start_idx = (num_tokens - 1) * vocab_size
+                     for i in range(vocab_size):
+                         last_token_logits.append(result.logits.logits[start_idx + i])
+
+                     self.current_result = {
+                         'logits': last_token_logits,
+                         'vocab_size': vocab_size,
+                         'num_tokens': num_tokens
+                     }
+
+                     print(f"Last-token logits range: [{min(last_token_logits):.4f}, {max(last_token_logits):.4f}]")
+             else:
+                 print("Warning: failed to obtain logits")
+
+         elif state == LLMCallState.RKLLM_RUN_ERROR:
+             print("An error occurred during inference")
+
+     def find_best_yes_no_tokens(self, logits):
+         """
+         Find the most likely "yes" and "no" token IDs.
+
+         Args:
+             logits: logits array of vocabulary size
+
+         Returns:
+             (yes_token_id, no_token_id, yes_logit, no_logit)
+         """
+         vocab_size = len(logits)
+
+         # Find the "yes" candidate with the highest logit
+         best_yes_id = None
+         best_yes_logit = float('-inf')
+         for token_id in self.yes_token_candidates:
+             if token_id < vocab_size:
+                 if logits[token_id] > best_yes_logit:
+                     best_yes_logit = logits[token_id]
+                     best_yes_id = token_id
+
+         # Find the "no" candidate with the highest logit
+         best_no_id = None
+         best_no_logit = float('-inf')
+         for token_id in self.no_token_candidates:
+             if token_id < vocab_size:
+                 if logits[token_id] > best_no_logit:
+                     best_no_logit = logits[token_id]
+                     best_no_id = token_id
+
+         # Fall back to a heuristic if the predefined tokens cannot be found
+         if best_yes_id is None or best_no_id is None:
+             print("Warning: falling back to a heuristic search for the yes/no tokens")
+
+             # Take the tokens with the highest logits
+             sorted_indices = np.argsort(logits)[::-1]
+             top_tokens = sorted_indices[:20]  # top 20 logits
+
+             # Simple heuristic: assume a higher logit corresponds to "yes" and a lower one to "no"
+             if best_yes_id is None:
+                 best_yes_id = top_tokens[0]
+                 best_yes_logit = logits[best_yes_id]
+
+             if best_no_id is None:
+                 # Pick a comparatively low but still plausible logit as "no"
+                 best_no_id = top_tokens[min(10, len(top_tokens)-1)]
+                 best_no_logit = logits[best_no_id]
+
+         return best_yes_id, best_no_id, best_yes_logit, best_no_logit
+
+     def calculate_reranker_score(self, logits):
+         """
+         Compute the reranking score (softmax probability over the "yes" and "no" tokens).
+
+         Args:
+             logits: logits array of vocabulary size
+
+         Returns:
+             Relevance score (between 0 and 1; higher means more relevant)
+         """
+         try:
+             # Locate the yes and no token logits
+             yes_id, no_id, yes_logit, no_logit = self.find_best_yes_no_tokens(logits)
+
+             print(f"Yes token ID: {yes_id}, logit: {yes_logit:.4f}")
+             print(f"No token ID: {no_id}, logit: {no_logit:.4f}")
+
+             # Compute softmax probabilities,
+             # considering only the relative probability of the yes and no tokens
+             max_logit = max(yes_logit, no_logit)
+             yes_exp = np.exp(yes_logit - max_logit)  # for numerical stability
+             no_exp = np.exp(no_logit - max_logit)
+
+             sum_exp = yes_exp + no_exp
+             yes_prob = yes_exp / sum_exp
+             no_prob = no_exp / sum_exp
+
+             print(f"Yes probability: {yes_prob:.4f}, No probability: {no_prob:.4f}")
+
+             # Return the yes probability as the relevance score
+             return float(yes_prob)
+
+         except Exception as e:
+             print(f"Error while computing the reranker score: {e}")
+             # Fall back to the simple heuristic method
+             return self.fallback_score_calculation(logits)
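+
+     # Mathematical note: a softmax restricted to the {yes, no} pair is
+     # equivalent to a sigmoid of the logit difference,
+     #     P(yes) = 1 / (1 + exp(no_logit - yes_logit)),
+     # so the score above is monotonic in (yes_logit - no_logit).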
+
+     def fallback_score_calculation(self, logits):
+         """
+         Fallback score computation (used when the yes/no tokens cannot be found).
+
+         Args:
+             logits: logits array of vocabulary size
+
+         Returns:
+             Relevance score (between 0 and 1)
+         """
+         print("Using the fallback score computation")
+
+         # Derive a score from distributional features of the logits
+         logits_array = np.array(logits)
+
+         # Entropy of the softmax distribution
+         softmax_probs = np.exp(logits_array - np.max(logits_array))
+         softmax_probs = softmax_probs / np.sum(softmax_probs)
+
+         # Lower entropy means the model is more confident (more relevant)
+         entropy = -np.sum(softmax_probs * np.log(softmax_probs + 1e-10))
+         max_entropy = np.log(len(logits))
+         normalized_entropy = entropy / max_entropy
+
+         # Convert to a relevance score (low entropy = high relevance)
+         confidence_score = 1.0 - normalized_entropy
+
+         # Blend in information from the maximum logit
+         max_logit_score = (np.max(logits_array) - np.mean(logits_array)) / (np.std(logits_array) + 1e-8)
+         max_logit_score = max(0, min(1, max_logit_score / 10))  # normalize
+
+         # Combined score
+         final_score = 0.7 * confidence_score + 0.3 * max_logit_score
+         final_score = max(0.0, min(1.0, final_score))
+
+         print(f"Fallback - entropy score: {confidence_score:.4f}, max-logit score: {max_logit_score:.4f}, final score: {final_score:.4f}")
+
+         return final_score
+
+     def init_model(self):
+         """Initialize the model."""
+         try:
+             print(f"Initializing the RKLLM runtime, library path: {self.library_path}")
+             self.runtime = RKLLMRuntime(self.library_path)
+
+             print("Creating default parameters...")
+             params = self.runtime.create_default_param()
+
+             # Configure parameters
+             params.model_path = self.model_path.encode('utf-8')
+             params.max_context_len = 1024
+             params.max_new_tokens = 1  # the reranker only needs to generate one token
+             params.temperature = 0.0   # deterministic output
+             params.top_k = 1           # greedy decoding
+             params.top_p = 1.0         # disable nucleus sampling
+
+             # Extended parameter configuration
+             params.extend_param.base_domain_id = 1
+             params.extend_param.embed_flash = 0
+             params.extend_param.enabled_cpus_num = 4
+             params.extend_param.enabled_cpus_mask = 0x0F
+
+             print(f"Initializing model: {self.model_path}")
+             self.runtime.init(params, self.callback_function)
+
+             # Set the chat template
+             self.runtime.set_chat_template(
+                 "",
+                 "",  # prefix
+                 ""   # suffix
+             )
+
+             print("Model initialized successfully!")
+
+         except Exception as e:
+             print(f"Model initialization failed: {e}")
+             raise
+
+     def format_rerank_input(self, instruction, query, document):
+         """
+         Format the reranking input (following the official README).
+
+         Args:
+             instruction: task instruction
+             query: query text
+             document: document text
+
+         Returns:
+             Formatted input text
+         """
+         if instruction is None:
+             instruction = 'Given a web search query, retrieve relevant passages that answer the query'
+
+         # Format from the official README
+         formatted_input = f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {document}"
+         return formatted_input
+
+     def get_reranker_score(self, instruction, query, document):
+         """
+         Get the reranking score (via logits).
+
+         Args:
+             instruction: task instruction
+             query: query text
+             document: document text
+
+         Returns:
+             Relevance score (between 0 and 1)
+         """
+         try:
+             # Format the input
+             input_text = self.format_rerank_input(instruction, query, document)
+             print(f"\nReranking input: {input_text[:200]}{'...' if len(input_text) > 200 else ''}")
+
+             # Prepare the input
+             rk_input = RKLLMInput()
+             rk_input.input_type = RKLLMInputType.RKLLM_INPUT_PROMPT
+             c_prompt = input_text.encode('utf-8')
+             rk_input._union_data.prompt_input = c_prompt
+
+             # Prepare the inference parameters -- use GET_LOGITS mode
+             infer_params = RKLLMInferParam()
+             infer_params.mode = RKLLMInferMode.RKLLM_INFER_GET_LOGITS  # fetch logits
+             infer_params.keep_history = 0
+
+             # Clear the previous result
+             self.current_result = None
+             self.runtime.clear_kv_cache(False)
+
+             # Run inference
+             start_time = time.time()
+             self.runtime.run(rk_input, infer_params)
+             end_time = time.time()
+
+             print(f"\nInference took {end_time - start_time:.3f}s")
+
+             if self.current_result and 'logits' in self.current_result:
+                 # Compute the reranker score the proper way
+                 logits = self.current_result['logits']
+                 score = self.calculate_reranker_score(logits)
+
+                 print(f"Computed score: {score:.4f}")
+                 return score
+             else:
+                 print("Warning: no valid logits obtained; returning the default score")
+                 return 0.0
+
+         except Exception as e:
+             print(f"Error while scoring: {e}")
+             import traceback
+             traceback.print_exc()
+             return 0.0
+
+     def rerank_documents(self, query, documents, instruction=None):
+         """
+         Rerank a list of documents.
+
+         Args:
+             query: query text
+             documents: list of documents
+             instruction: optional task instruction
+
+         Returns:
+             List of (document, score) tuples sorted by relevance in descending order
+         """
+         print(f"\nReranking {len(documents)} documents")
+         print(f"Query: {query}")
+
+         if instruction:
+             print(f"Instruction: {instruction}")
+
+         scored_docs = []
+         for i, doc in enumerate(documents):
+             print(f"\n--- Processing document {i+1}/{len(documents)} ---")
+             print(f"Document: {doc[:100]}{'...' if len(doc) > 100 else ''}")
+
+             score = self.get_reranker_score(instruction, query, doc)
+             scored_docs.append((doc, score))
+             print(f"Score: {score:.4f}")
+
+         # Sort by score in descending order
+         scored_docs.sort(key=lambda x: x[1], reverse=True)
+         return scored_docs
+
+     def test_basic_reranking(self):
+         """Test basic reranking."""
+         print("\n" + "="*60)
+         print("Testing basic reranking")
+         print("="*60)
+
+         # Test query
+         query = "What is the capital of China?"
+
+         # Candidate documents (a mix of relevant and irrelevant)
+         documents = [
+             "Beijing is the capital city of China, located in northern China.",
+             "The Great Wall of China is an ancient fortification built to protect Chinese states.",
+             "Python is a high-level programming language used for software development.",
+             "China's capital Beijing is home to over 21 million people.",
+             "Machine learning is a subset of artificial intelligence that uses algorithms."
+         ]
+
+         # Run the reranking
+         instruction = "Given a web search query, retrieve relevant passages that answer the query"
+         ranked_docs = self.rerank_documents(query, documents, instruction)
+
+         # Show the results
+         print(f"\nReranking results (query: {query}):")
+         print("-" * 80)
+         for i, (doc, score) in enumerate(ranked_docs):
+             print(f"Rank {i+1}: score {score:.4f}")
+             print(f"Document: {doc}")
+             print()
+
+         return ranked_docs
+
+     def test_multilingual_reranking(self):
+         """Test multilingual reranking (the Chinese strings below are deliberate test inputs)."""
+         print("\n" + "="*60)
+         print("Testing multilingual reranking")
+         print("="*60)
+
+         # Chinese query
+         query = "中国的首都是什么?"
+
+         documents = [
+             "北京是中华人民共和国的首都,位于中国北部。",
+             "上海是中国的经济中心,人口超过2400万。",
+             "Python 是一种高级编程语言。",
+             "The capital of China is Beijing.",
+             "长城是中国古代的军事防御工程。"
+         ]
+
+         instruction = "Given a web search query, retrieve relevant passages that answer the query"
+         ranked_docs = self.rerank_documents(query, documents, instruction)
+
+         print(f"\nMultilingual reranking results (query: {query}):")
+         print("-" * 80)
+         for i, (doc, score) in enumerate(ranked_docs):
+             print(f"Rank {i+1}: score {score:.4f}")
+             print(f"Document: {doc}")
+             print()
+
+         return ranked_docs
+
+     def test_domain_specific_reranking(self):
+         """Test domain-specific reranking."""
+         print("\n" + "="*60)
+         print("Testing domain-specific reranking (technical documents)")
+         print("="*60)
+
+         query = "How to implement a neural network in Python?"
+
+         documents = [
+             "PyTorch is a deep learning framework that provides tensor computations with GPU acceleration.",
+             "TensorFlow is an open-source machine learning library developed by Google.",
+             "Neural networks are computing systems inspired by biological neural networks.",
+             "Python is a programming language with simple syntax and powerful libraries.",
+             "To implement a neural network in Python, you can use libraries like PyTorch or TensorFlow to define layers, loss functions, and optimization algorithms.",
+             "Cooking recipes often require precise measurements and cooking times.",
+             "Backpropagation is the algorithm used to train neural networks by computing gradients."
+         ]
+
+         # Use a custom instruction
+         instruction = "Given a technical query and a document, determine if the document provides practical information for implementing the requested technical solution"
+
+         ranked_docs = self.rerank_documents(query, documents, instruction)
+
+         print(f"\nTechnical-document reranking results (query: {query}):")
+         print("-" * 80)
+         for i, (doc, score) in enumerate(ranked_docs):
+             print(f"Rank {i+1}: score {score:.4f}")
+             print(f"Document: {doc}")
+             print()
+
+         return ranked_docs
+
+     def test_comparison_with_official_example(self):
+         """Compare against the official example."""
+         print("\n" + "="*60)
+         print("Comparison against the official example")
+         print("="*60)
+
+         # Example from the official README
+         task = 'Given a web search query, retrieve relevant passages that answer the query'
+
+         queries = [
+             "What is the capital of China?",
+             "Explain gravity",
+         ]
+
+         documents = [
+             "The capital of China is Beijing.",
+             "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+         ]
+
+         print("Testing the official example's query-document pairs:")
+         for i, (query, doc) in enumerate(zip(queries, documents)):
+             print(f"\n=== Query-document pair {i+1} ===")
+             print(f"Query: {query}")
+             print(f"Document: {doc}")
+
+             score = self.get_reranker_score(task, query, doc)
+             print(f"Relevance score: {score:.4f}")
+
+     def cleanup(self):
+         """Release resources."""
+         if self.runtime:
+             try:
+                 self.runtime.destroy()
+                 print("Model resources released")
+             except Exception as e:
+                 print(f"Error while releasing resources: {e}")
+
+
+ def main():
+     """Main entry point."""
+     import argparse
+
+     # Parse command-line arguments
+     parser = argparse.ArgumentParser(description='Qwen3-Reranker-0.6B inference test')
+     parser.add_argument('model_path', help='path to the model file (.rkllm format)')
+     parser.add_argument('--library_path', default="./librkllmrt.so", help='path to the RKLLM library (default: ./librkllmrt.so)')
+     args = parser.parse_args()
+
+     # Check that the files exist
+     if not os.path.exists(args.model_path):
+         print(f"Error: model file does not exist: {args.model_path}")
+         print("Please make sure that:")
+         print("1. the Qwen3-Reranker-0.6B model has been downloaded")
+         print("2. the model has been converted to .rkllm format with rkllm-convert.py")
+         return
+
+     if not os.path.exists(args.library_path):
+         print(f"Error: RKLLM library file does not exist: {args.library_path}")
+         print("Please make sure librkllmrt.so is in the current directory or on LD_LIBRARY_PATH")
+         return
+
+     print("Qwen3-Reranker-0.6B inference test")
+     print("=" * 60)
+     print("Implementation based on the official README")
+     print("=" * 60)
+
+     # Create the tester
+     tester = Qwen3RerankerTester(args.model_path, args.library_path)
+
+     try:
+         # Initialize the model
+         tester.init_model()
+
+         # Run the tests
+         print("\nStarting the reranking tests...")
+
+         # Comparison against the official example
+         tester.test_comparison_with_official_example()
+
+         # Basic reranking
+         tester.test_basic_reranking()
+
+         # Multilingual reranking
+         tester.test_multilingual_reranking()
+
+         # Domain-specific reranking
+         tester.test_domain_specific_reranking()
+
+         print("\n" + "="*60)
+         print("All reranking tests finished!")
+         print("="*60)
+
+     except KeyboardInterrupt:
+         print("\nTest interrupted by the user")
+     except Exception as e:
+         print(f"\nAn error occurred during testing: {e}")
+         import traceback
+         traceback.print_exc()
+     finally:
+         # Release resources
+         tester.cleanup()
+
+
+ if __name__ == "__main__":
+     main()
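
To exercise this script against the files in this commit, pass the converted model on the command line (python test_reranker.py Qwen3-Reranker-0.6B_w8a8.rkllm). The tester class can also be driven directly; a minimal sketch reusing the API defined above:

from test_reranker import Qwen3RerankerTester

tester = Qwen3RerankerTester("Qwen3-Reranker-0.6B_w8a8.rkllm", "./librkllmrt.so")
try:
    tester.init_model()
    ranked = tester.rerank_documents(
        "What is the capital of China?",
        ["The capital of China is Beijing.",
         "Gravity is a force that attracts two bodies towards each other."],
    )
    for doc, score in ranked:
        print(f"{score:.4f}  {doc}")
finally:
    tester.cleanup()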