Tom Aarsen committed
Commit d713204 · 1 Parent(s): 9ae8623

Integrate with transformers, sentence transformers

Files changed (4)
  1. README.md +1 -1
  2. config.json +7 -1
  3. modeling_zeranker.py +128 -206
  4. tokenizer_config.json +4 -1
README.md CHANGED
@@ -41,8 +41,8 @@ query_documents = [
 ]
 
 scores = model.predict(query_documents)
-
 print(scores)
+# [0.7531883 0.28894895]
 ```
 
 The model can also be inferenced using ZeroEntropy's [/models/rerank](https://docs.zeroentropy.dev/api-reference/models/rerank) endpoint, and on [AWS Marketplace](https://aws.amazon.com/marketplace/pp/prodview-o7avk66msiukc).
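The hunk above only shows the tail of the README snippet; the lines before 41 construct the model and the query-document pairs. A minimal sketch of how the full snippet reads after this change, assuming the repository id `zeroentropy/zerank-2` (the `MODEL_PATH` constant in the removed `modeling_zeranker.py`) and that the custom classes added in this commit require `trust_remote_code=True`; the example pairs below are placeholders, not the README's own:

```python
from sentence_transformers import CrossEncoder

# Assumed model id and loading flags; the custom ZeroEntropy* classes are
# registered via auto_map, so remote code must be trusted.
model = CrossEncoder("zeroentropy/zerank-2", trust_remote_code=True)

query = "What is the capital of France?"
query_documents = [
    (query, "Paris is the capital and most populous city of France."),
    (query, "Berlin is the capital of Germany."),
]

scores = model.predict(query_documents)  # one relevance score in (0, 1) per pair
print(scores)
```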
config.json CHANGED
@@ -1,9 +1,13 @@
 {
   "architectures": [
-    "Qwen3ForCausalLM"
+    "ZeroEntropyForSequenceClassification"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "modeling_zeranker.ZeroEntropyConfig",
+    "AutoModelForSequenceClassification": "modeling_zeranker.ZeroEntropyForSequenceClassification"
+  },
   "bos_token_id": 151643,
   "dtype": "bfloat16",
   "eos_token_id": 151645,
@@ -56,6 +60,8 @@
   "num_attention_heads": 32,
   "num_hidden_layers": 36,
   "num_key_value_heads": 8,
+  "num_labels": 1,
+  "pad_token_id": 151643,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000,
modeling_zeranker.py CHANGED
@@ -1,216 +1,138 @@
-from sentence_transformers import CrossEncoder as _CE
-
-import math
-from typing import cast, Any
-import types
+from torch import nn
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from transformers.utils import auto_docstring
+from transformers.utils.generic import TransformersKwargs, can_return_tuple
 
+from typing import Optional, Union
 
+from transformers.processing_utils import Unpack
 import torch
-from transformers.configuration_utils import PretrainedConfig
-
-from transformers.models.auto.configuration_auto import AutoConfig
-from transformers.models.auto.modeling_auto import AutoModelForCausalLM
-from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.models.gemma3.modeling_gemma3 import (
-    Gemma3ForCausalLM,
-    Gemma3ForConditionalGeneration,
-)
-from transformers.models.llama.modeling_llama import LlamaForCausalLM
-from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM
-from transformers.tokenization_utils_base import BatchEncoding
+from transformers import Cache, Qwen3Config
+from transformers.models.qwen3.modeling_qwen3 import Qwen3PreTrainedModel, Qwen3Model
 from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 
-# pyright: reportUnknownMemberType=false
-# pyright: reportUnknownVariableType=false
-
-MODEL_PATH = "zeroentropy/zerank-2"
-PER_DEVICE_BATCH_SIZE_TOKENS = 15_000
-global_device = (
-    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-)
-
-
-def format_pointwise_datapoints(
-    tokenizer: PreTrainedTokenizerFast,
-    query_documents: list[tuple[str, str]],
-) -> BatchEncoding:
-    input_texts: list[str] = []
-    for query, document in query_documents:
-        system_prompt = f"""
-{query}
-""".strip()
-        user_message = f"""
-{document}
-""".strip()
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_message},
-        ]
-        input_text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True,
-        )
-        assert isinstance(input_text, str)
-        input_texts.append(input_text)
-
-    batch_inputs = tokenizer(
-        input_texts,
-        padding=True,
-        return_tensors="pt",
-    )
-    return batch_inputs
-
-
-def load_model(
-    device: torch.device | None = None,
-) -> tuple[
-    PreTrainedTokenizerFast,
-    LlamaForCausalLM
-    | Gemma3ForConditionalGeneration
-    | Gemma3ForCausalLM
-    | Qwen3ForCausalLM,
-]:
-    if device is None:
-        device = global_device
-
-    config = AutoConfig.from_pretrained(MODEL_PATH)
-    assert isinstance(config, PretrainedConfig)
-
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_PATH,
-        torch_dtype="auto",
-        quantization_config=None,
-        device_map={"": device},
-    )
-    if config.model_type == "llama":
-        model.config.attn_implementation = "flash_attention_2"
-    assert isinstance(
-        model,
-        LlamaForCausalLM
-        | Gemma3ForConditionalGeneration
-        | Gemma3ForCausalLM
-        | Qwen3ForCausalLM,
-    )
-
-    tokenizer = cast(
-        AutoTokenizer,
-        AutoTokenizer.from_pretrained(
-            MODEL_PATH,
-            padding_side="right",
-        ),
-    )
-    assert isinstance(tokenizer, PreTrainedTokenizerFast)
-
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    return tokenizer, model
-
-
-def predict(
-    self,
-    query_documents: list[tuple[str, str]] | None = None,
-    *,
-    sentences: Any = None,
-    batch_size: Any = None,
-    show_progress_bar: Any = None,
-    activation_fn: Any = None,
-    apply_softmax: Any = None,
-    convert_to_numpy: Any = None,
-    convert_to_tensor: Any = None,
-) -> list[float]:
-    if query_documents is None:
-        if sentences is None:
-            raise ValueError("query_documents or sentences must be provided")
-        query_documents = [[sentence[0], sentence[1]] for sentence in sentences]
-
-    if not hasattr(self, "inner_model"):
-        self.inner_tokenizer, self.inner_model = load_model(global_device)
-        self.inner_model.gradient_checkpointing_enable()
-        self.inner_model.eval()
-        self.inner_yes_token_id = self.inner_tokenizer.encode(
-            "Yes", add_special_tokens=False
-        )[0]
-
-    model = self.inner_model
-    tokenizer = self.inner_tokenizer
-
-    query_documents = [
-        (query[:2_000], document[:10_000]) for query, document in query_documents
-    ]
-    # Sort
-    permutation = list(range(len(query_documents)))
-    permutation.sort(
-        key=lambda i: -len(query_documents[i][0]) - len(query_documents[i][1])
-    )
-    query_documents = [query_documents[i] for i in permutation]
-
-    # Extract document batches from this line of datapoints
-    max_length = 0
-    batches: list[list[tuple[str, str]]] = []
-    for query, document in query_documents:
-        if (
-            len(batches) == 0
-            or (len(batches[-1]) + 1) * max(max_length, len(query) + len(document))
-            > PER_DEVICE_BATCH_SIZE_TOKENS
-        ):
-            batches.append([])
-            max_length = 0
-
-        batches[-1].append((query, document))
-        max_length = max(max_length, 20 + len(query) + len(document))
-
-    # Inference all of the document batches
-    all_logits: list[float] = []
-    for batch in batches:
-        batch_inputs = format_pointwise_datapoints(
-            tokenizer,
-            batch,
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class ZeroEntropyTokenizer(PreTrainedTokenizerFast):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def __call__(self, pairs, *args, **kwargs):
+        input_texts: list[str] = []
+        for query, document in pairs:
+            messages = [
+                {"role": "system", "content": query.strip()},
+                {"role": "user", "content": document.strip()},
+            ]
+            input_text = self.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+            assert isinstance(input_text, str)
+            input_texts.append(input_text)
+
+        batch_inputs = super().__call__(input_texts, *args, **kwargs)
+        return batch_inputs
+
+
+class ZeroEntropyConfig(Qwen3Config):
+    model_type = "zeroentropy"
+
+    def __init__(self, yes_token_id: int = 9454, **kwargs):
+        super().__init__(**kwargs)
+        self.yes_token_id = yes_token_id
+
+
+class ZeroEntropyForSequenceClassification(Qwen3PreTrainedModel):
+    config: ZeroEntropyConfig
+
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Qwen3Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM
+
+        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
+        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
         )
 
-        batch_inputs = batch_inputs.to(global_device)
-
-        try:
-            outputs = model(**batch_inputs, use_cache=False)
-        except torch.OutOfMemoryError:
-            print(f"GPU OOM! {torch.cuda.memory_reserved()}")
-            torch.cuda.empty_cache()
-            print(f"GPU After OOM Cache Clear: {torch.cuda.memory_reserved()}")
-            outputs = model(**batch_inputs, use_cache=False)
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = (
+            slice(-logits_to_keep, None)
+            if isinstance(logits_to_keep, int)
+            else logits_to_keep
+        )
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
 
-        # Extract the logits
-        logits = cast(torch.Tensor, outputs.logits)
-        attention_mask = cast(torch.Tensor, batch_inputs.attention_mask)
         last_positions = attention_mask.sum(dim=1) - 1
-
         batch_size = logits.shape[0]
-        batch_indices = torch.arange(batch_size, device=global_device)
-        last_logits = logits[batch_indices, last_positions]
-
-        yes_logits = last_logits[:, self.inner_yes_token_id]
-        all_logits.extend([float(logit) / 5.0 for logit in yes_logits])
-
-    def sigmoid(x: float) -> float:
-        return 1 / (1 + math.exp(-x))
-
-    scores = [sigmoid(logit) for logit in all_logits]
-
-    # Unsort by indices
-    scores = [score for _, score in sorted(zip(permutation, scores, strict=True))]
-
-    return scores
-
-
-def to_device(self: _CE, new_device: torch.device) -> None:
-    global global_device
-    global_device = new_device
-
-
-_CE.predict = predict
-
-from transformers import Qwen3Config
-
-ZEConfig = Qwen3Config
-
-_CE.to = to_device
+        batch_indices = torch.arange(batch_size, device=logits.device)
+        yes_logits = logits[batch_indices, last_positions, self.config.yes_token_id]
+        yes_logits = yes_logits / 5.0
+        yes_logits = yes_logits.unsqueeze(-1)
+
+        return SequenceClassifierOutputWithPast(
+            loss=None,
+            logits=yes_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
tokenizer_config.json CHANGED
@@ -226,6 +226,9 @@
     "<|image_pad|>",
     "<|video_pad|>"
   ],
+  "auto_map": {
+    "AutoTokenizer": [null, "modeling_zeranker.ZeroEntropyTokenizer"]
+  },
   "bos_token": null,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
@@ -235,6 +238,6 @@
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
   "split_special_tokens": false,
-  "tokenizer_class": "Qwen2Tokenizer",
+  "tokenizer_class": "ZeroEntropyTokenizer",
   "unk_token": null
 }
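Taken together with the model changes, the tokenizer-side `auto_map` means `AutoTokenizer` now resolves to `ZeroEntropyTokenizer`, which renders each (query, document) pair through the chat template with the query as the system message and the document as the user message; the model then scores the "Yes" token (`yes_token_id`, default 9454) at the last non-padded position. A small sketch for inspecting that prompt, assuming the same repository id and `trust_remote_code=True`; the pair is a placeholder:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zeroentropy/zerank-2", trust_remote_code=True)

# The same formatting ZeroEntropyTokenizer.__call__ applies to each pair internally.
messages = [
    {"role": "system", "content": "placeholder query"},
    {"role": "user", "content": "placeholder document"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # the text whose final "Yes"-token logit becomes the relevance logit
```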