Add new ColBERT model
Browse files
- README.md +13 -12
- config_sentence_transformers.json +2 -2
- sentence_bert_config.json +1 -1
- tokenizer.json +1 -1
README.md
CHANGED
|
@@ -5,7 +5,8 @@ license: apache-2.0
|
|
| 5 |
tags:
|
| 6 |
- colbert
|
| 7 |
- PyLate
|
| 8 |
-
- feature-
|
|
|
|
| 9 |
- sentence-pair-classification
|
| 10 |
- semantic-similarity
|
| 11 |
- semantic-search
|
|
@@ -30,8 +31,8 @@ This is a [PyLate](https://github.com/lightonai/pylate) model finetuned from [li
|
|
| 30 |
### Model Description
|
| 31 |
- **Model Type:** PyLate model
|
| 32 |
- **Base model:** [lightonai/GTE-ModernColBERT-v1](https://huggingface.co/lightonai/GTE-ModernColBERT-v1) <!-- at revision 6605e431bed9b582d3eff7699911d2b64e8ccd3f -->
|
| 33 |
-
- **Document Length:**
|
| 34 |
-
- **Query Length:**
|
| 35 |
- **Output Dimensionality:** 128 dimensions
|
| 36 |
- **Similarity Function:** MaxSim
|
| 37 |
- **Training Dataset:**
|
|
@@ -49,7 +50,7 @@ This is a [PyLate](https://github.com/lightonai/pylate) model finetuned from [li
|
|
| 49 |
|
| 50 |
```
|
| 51 |
ColBERT(
|
| 52 |
-
(0): Transformer({'max_seq_length':
|
| 53 |
(1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
|
| 54 |
)
|
| 55 |
```
|
|
@@ -224,10 +225,10 @@ You can finetune this model on your own dataset.
|
|
| 224 |
* Size: 1,452,533 training samples
|
| 225 |
* Columns: <code>anchor</code>, <code>positive</code>, and <code>negative_1</code>
|
| 226 |
* Approximate statistics based on the first 1000 samples:
|
| 227 |
-
| | anchor
|
| 228 |
-
|
| 229 |
-
| type | string
|
| 230 |
-
| details | <ul><li>min: 9 tokens</li><li>mean:
|
| 231 |
* Samples:
|
| 232 |
| anchor | positive | negative_1 |
|
| 233 |
|:-----------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------|
|
|
@@ -244,10 +245,10 @@ You can finetune this model on your own dataset.
|
|
| 244 |
* Size: 110,066 evaluation samples
|
| 245 |
* Columns: <code>anchor</code>, <code>positive</code>, and <code>negative_1</code>
|
| 246 |
* Approximate statistics based on the first 1000 samples:
|
| 247 |
-
| | anchor
|
| 248 |
-
|
| 249 |
-
| type | string
|
| 250 |
-
| details | <ul><li>min: 5 tokens</li><li>mean:
|
| 251 |
* Samples:
|
| 252 |
| anchor | positive | negative_1 |
|
| 253 |
|:----------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|
|
|
|
|
| 5 |
tags:
|
| 6 |
- colbert
|
| 7 |
- PyLate
|
| 8 |
+
- feature-extraction
|
| 9 |
+
- text-classification
|
| 10 |
- sentence-pair-classification
|
| 11 |
- semantic-similarity
|
| 12 |
- semantic-search
|
|
|
|
| 31 |
### Model Description
|
| 32 |
- **Model Type:** PyLate model
|
| 33 |
- **Base model:** [lightonai/GTE-ModernColBERT-v1](https://huggingface.co/lightonai/GTE-ModernColBERT-v1) <!-- at revision 6605e431bed9b582d3eff7699911d2b64e8ccd3f -->
|
| 34 |
+
- **Document Length:** 512 tokens
|
| 35 |
+
- **Query Length:** 512 tokens
|
| 36 |
- **Output Dimensionality:** 128 dimensions
|
| 37 |
- **Similarity Function:** MaxSim
|
| 38 |
- **Training Dataset:**
|
|
|
|
| 50 |
|
| 51 |
```
|
| 52 |
ColBERT(
|
| 53 |
+
(0): Transformer({'max_seq_length': 511, 'do_lower_case': False, 'architecture': 'ModernBertModel'})
|
| 54 |
(1): Dense({'in_features': 768, 'out_features': 128, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity', 'use_residual': False})
|
| 55 |
)
|
| 56 |
```
|
|
|
|
| 225 |
* Size: 1,452,533 training samples
|
| 226 |
* Columns: <code>anchor</code>, <code>positive</code>, and <code>negative_1</code>
|
| 227 |
* Approximate statistics based on the first 1000 samples:
|
| 228 |
+
| | anchor | positive | negative_1 |
|
| 229 |
+
|:--------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|
|
| 230 |
+
| type | string | string | string |
|
| 231 |
+
| details | <ul><li>min: 9 tokens</li><li>mean: 28.67 tokens</li><li>max: 79 tokens</li></ul> | <ul><li>min: 8 tokens</li><li>mean: 28.51 tokens</li><li>max: 57 tokens</li></ul> | <ul><li>min: 5 tokens</li><li>mean: 24.02 tokens</li><li>max: 50 tokens</li></ul> |
|
| 232 |
* Samples:
|
| 233 |
| anchor | positive | negative_1 |
|
| 234 |
|:-----------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------|
|
|
|
|
| 245 |
* Size: 110,066 evaluation samples
|
| 246 |
* Columns: <code>anchor</code>, <code>positive</code>, and <code>negative_1</code>
|
| 247 |
* Approximate statistics based on the first 1000 samples:
|
| 248 |
+
| | anchor | positive | negative_1 |
|
| 249 |
+
|:--------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:----------------------------------------------------------------------------------|
|
| 250 |
+
| type | string | string | string |
|
| 251 |
+
| details | <ul><li>min: 5 tokens</li><li>mean: 26.68 tokens</li><li>max: 104 tokens</li></ul> | <ul><li>min: 5 tokens</li><li>mean: 26.34 tokens</li><li>max: 104 tokens</li></ul> | <ul><li>min: 6 tokens</li><li>mean: 20.39 tokens</li><li>max: 69 tokens</li></ul> |
|
| 252 |
* Samples:
|
| 253 |
| anchor | positive | negative_1 |
|
| 254 |
|:----------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|
|
config_sentence_transformers.json
CHANGED
|
@@ -12,8 +12,8 @@
|
|
| 12 |
"similarity_fn_name": "MaxSim",
|
| 13 |
"query_prefix": "[Q] ",
|
| 14 |
"document_prefix": "[D] ",
|
| 15 |
-
"query_length":
|
| 16 |
-
"document_length":
|
| 17 |
"attend_to_expansion_tokens": false,
|
| 18 |
"skiplist_words": [
|
| 19 |
"!",
|
|
|
|
| 12 |
"similarity_fn_name": "MaxSim",
|
| 13 |
"query_prefix": "[Q] ",
|
| 14 |
"document_prefix": "[D] ",
|
| 15 |
+
"query_length": 512,
|
| 16 |
+
"document_length": 512,
|
| 17 |
"attend_to_expansion_tokens": false,
|
| 18 |
"skiplist_words": [
|
| 19 |
"!",
|
sentence_bert_config.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
{
|
| 2 |
-
"max_seq_length":
|
| 3 |
"do_lower_case": false
|
| 4 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"max_seq_length": 511,
|
| 3 |
"do_lower_case": false
|
| 4 |
}
|
tokenizer.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"version": "1.0",
|
| 3 |
"truncation": {
|
| 4 |
"direction": "Right",
|
| 5 |
-
"max_length":
|
| 6 |
"strategy": "LongestFirst",
|
| 7 |
"stride": 0
|
| 8 |
},
|
|
|
|
| 2 |
"version": "1.0",
|
| 3 |
"truncation": {
|
| 4 |
"direction": "Right",
|
| 5 |
+
"max_length": 511,
|
| 6 |
"strategy": "LongestFirst",
|
| 7 |
"stride": 0
|
| 8 |
},
|