mratsim commited on
Commit
c6e1b37
·
verified ·
1 Parent(s): 17e2020

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: text-generation
3
+ license: other
4
+ license_name: modified-mit
5
+ license_link: https://github.com/MiniMax-AI/MiniMax-M2.1/blob/main/LICENSE
6
+ library_name: llm-compressor
7
+ tags:
8
+ - fp8
9
+ - awq
10
+ - conversational
11
+ - vllm
12
+ - code
13
+ - devops
14
+ - software engineering
15
+ - engineer
16
+ - developer
17
+ - architect
18
+ - stem
19
+ - agent
20
+ datasets:
21
+ - HuggingFaceH4/ultrachat_200k
22
+ - databricks/databricks-dolly-15k
23
+ - neuralmagic/calibration
24
+ - HuggingFaceH4/no_robots
25
+ - nvidia/HelpSteer
26
+ - garage-bAInd/Open-Platypus
27
+ - PJMixers/grimulkan_physical-reasoning-ShareGPT
28
+ - PJMixers/grimulkan_theory-of-mind-ShareGPT
29
+ - HuggingFaceH4/Multilingual-Thinking
30
+ - ServiceNow-AI/M2Lingual
31
+ - interstellarninja/hermes_reasoning_tool_use
32
+ - deepmind/code_contests
33
+ - dh02391735/stackoverflow-kubernetes-questions
34
+ - diversoailab/humaneval-rust
35
+ - ammarnasr/the-stack-rust-clean
36
+ - CSJianYang/CodeArena
37
+ - nvidia/OpenCodeInstruct
38
+ - nvidia/Llama-Nemotron-Post-Training-Dataset
39
+ - nvidia/Nemotron-Competitive-Programming-v1
40
+ - rombodawg/code_bagel_hermes-2.5
41
+ - MathArena/project_euler
42
+ - nvidia/Nemotron-Math-Proofs-v1
43
+ - nvidia/OpenMathInstruct-2
44
+ - nvidia/OpenScienceReasoning-2
45
+ - MegaScience/MegaScience
46
+ - OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B
47
+ - ccdv/pubmed-summarization
48
+ - gbharti/finance-alpaca
49
+ - vladlen32230/summarization-yahoo-stock-finance-article-text
50
+ - fka/awesome-chatgpt-prompts
51
+ - theoldmandthesea/17k_business_book
52
+ - ruggsea/stanford-encyclopedia-of-philosophy_instruct
53
+ - mlfoundations-dev/stackexchange_philosophy
54
+ - FreedomIntelligence/SocraticChat
55
+ - Gryphe/Opus-WritingPrompts
56
+ - anthracite-org/nopm_claude_writing_fixed
57
+ - zerofata/Roleplay-Anime-Characters
58
+ - zerofata/Instruct-Anime
59
+ - zerofata/Instruct-Anime-CreativeWriting
60
+ - sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
61
+ - PocketDoc/Dans-Prosemaxx-Adventure
62
+ - anthracite-org/stheno-filtered-v1.1
63
+ - KaraKaraWitch/TvTroper-2025
64
+ - AquaV/US-Army-Survival-Sharegpt
65
+ - AquaV/Interrogation-Sharegpt
66
+ - AquaV/Multi-Environment-Operations-Sharegpt
67
+ - AquaV/Resistance-Sharegpt
68
+ - PocketDoc/Dans-Kinomaxx-VanillaBackrooms
69
+ base_model:
70
+ - MiniMaxAI/MiniMax-M2.1
71
+ ---
72
+
73
+ # MiniMax M2.1 (Mixed-Precision BF16 + INT4 AWQ)
74
+
75
+ This strives to be the highest quality quant that can run on 192GiB VRAM
76
+
77
+ > [!TIP]
78
+ > 💡This is a sister model to [mratsim/MiniMax-M2.1-FP8-INT4-AWQ](https://huggingface.co/mratsim/MiniMax-M2.1-FP8-INT4-AWQ)
79
+ > with the original model FP8 weights pre-dequantized to BF16.
80
+ >
81
+ > This makes it compatible with 8x3090 systems (which don't have hardware FP8)
82
+ > and also compatible with SGLang.
83
+
84
+ It features:
85
+ - This model ensures that all experts are calibrated; not doing so is extremely detrimental. PR: https://github.com/vllm-project/llm-compressor/pull/2171
86
+ <details>
87
+ <summary>Visual showcase of why ensuring quantization of all MoE experts is important</summary>
88
+
89
+ - Source: https://avtc.github.io/aquarium-side-by-side/
90
+ - Context: https://github.com/ModelCloud/GPTQModel/pull/2235
91
+
92
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/67f26fd2c7b14380431d1f5a/BDc3-0m3_WLl3ZmbBMhmd.png)
93
+
94
+ </details>
95
+ - Mixed precision with:
96
+ - self-attention weights copied directly from the official version (default FP8 with 2D-blocks)
97
+ - experts weights quantized using AWQ W4A16G32 scheme (4-bit weights, 16-bit activations, scaling factor per group of 32 weights)
98
+ - High-quality large and diverse dataset with programming and devops focus
99
+ as well as domain-specific knowledge (math, sciences, medical, finance, business, humanities, philosophy, creative writing), general knowledge, pop culture and behavioral situations because we never code in a vacuum. And we want to make sure all experts are calibrated to the full range of their activations.
100
+ - Calibration explicitly tests multilingual capabilities:
101
+ - Asia: Chinese, Hindi, Korean, Japanese
102
+ - Europe: French, German, Portuguese, Russian, Spanish
103
+ - Middle-East: Arabic, Hebrew, Turkish
104
+ - Calibration explicitly tests 60 programming languages and not just Python:
105
+ - Imperative programming: C, C++, Go, Zig, ...
106
+ - Functional programming: Haskell, F#, OCaml, Erlang, Lisp, Clojure ...
107
+ - Web-focused: HTML/CSS, Typescript, PHP, ...
108
+ - Mixed paradigm: D, Kotlin, Nim, Rust, Swift, ...
109
+ - Theorem provers: Coq, Lean
110
+ - Low-level: ARM64 assembly, x86-64 assembly, LLVM IR
111
+ - GPU Programming: Cuda, Vulkan, Apple Metal
112
+ - Game Programming: GDScript, GLSL
113
+ - Domain-specific: MATLAB, Julia, Solidity, R
114
+ - Calibration tries to ensure coverage for a wide variety of experience (from explaining concepts to your grandmother to debugging Kubernetes logs)
115
+ - Built by a dev, for devs (and it looks very good for STEM as well)
116
+
117
+ It uses my new declarative quantization framework https://github.com/mratsim/quantizers which facilitates highly-tuned calibration sets: [calibrate_software_engineer.yaml](./calibrate_software_engineer.yaml)
118
+
119
+ <details>
120
+ <summary>This has taken several days and contribution and bug reports to the ecosystem, I hope you find it useful.</summary>
121
+
122
+ - https://github.com/vllm-project/llm-compressor/pull/2171
123
+ - https://github.com/vllm-project/llm-compressor/issues/2172
124
+ - https://github.com/vllm-project/vllm/issues/31623
125
+ - https://github.com/sgl-project/sglang/issues/16276
126
+ - https://github.com/sgl-project/sglang/issues/16295
127
+
128
+ </details>
129
+
130
+ ## 📥 Usage & Running Instructions
131
+
132
+ The model was tested with SGLang + 2x RTX Pro 6000, here is a script suitable for such configuration with the maximum 196,608 context length. This uses 92.5GiB of VRAM with the flashinfer backend.
133
+
134
+ Please refer to [mratsim/MiniMax-M2.1-FP8-INT4-AWQ#running-script](https://huggingface.co/mratsim/MiniMax-M2.1-FP8-INT4-AWQ#running-script)
135
+ for running it in vLLM
136
+
137
+ ### Running script
138
+
139
+ `--trust-remote-code` is necessary until the transformers team merges github.com/huggingface/transformers/pull/42028
140
+
141
+ You have 2 reasoning parsers:
142
+ - `minimax`, puts the reasoning content in a special field like DeepSeek models that is usually rendered in a specific manner in frontends.
143
+ - `minimax_append_think`, puts the reasoning into `<think>reasoning_content</think>` and that is sent as normal text. Few frontends properly render that, I'm aware of [Cherry Studio](https://github.com/CherryHQ/cherry-studio) on Desktop and [ChatterUI](https://github.com/Vali-98/ChatterUI) on Android.
144
+
145
+ The reason `minimax_append_think` was introduced is Interleaved Thinking: having the model build upon its previous thinking (frontends usually discard the thinking trace)
146
+
147
+ > [!TIP]
148
+ > 💡In the sister model, I mentioned that with the recommended parameters the model tends to get stuck in repetition loops.\
149
+ > This does not seem to happen with SGLang hence "repetition_penalty: 1.10, frequency_penalty: 0.40" are not used. \
150
+ > There is no way to override such settings without editing generation_config.json anyway: https://github.com/sgl-project/sglang/issues/15487
151
+
152
+ > [!NOTE]
153
+ > I have not yet found a way to enable speculative decoding together with the maximum 196,608 context size and over 10 concurrent requests within 192 GiB of VRAM
154
+
155
+ ```bash
156
+ # Model configuration (Mandatory)
157
+ MODEL="mratsim/MiniMax-M2.1-BF16-INT4-AWQ"
158
+ MODELNAME="MiniMax-M2.1"
159
+ GPU_UTIL=0.97
160
+ SGLANG_PORT=8000
161
+
162
+ # Prevent memory fragmentation
163
+ export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
164
+
165
+ python3 -m sglang.launch_server \
166
+ --host 0.0.0.0 \
167
+ --port "${SGLANG_PORT}" \
168
+ --sleep-on-idle \
169
+ --disable-custom-all-reduce \
170
+ --max-running-requests 12 \
171
+ --cuda-graph-max-bs 12 \
172
+ --attention-backend flashinfer \
173
+ --served-model-name "${MODELNAME}" \
174
+ --model-path "${MODEL}" \
175
+ --tool-call-parser minimax-m2 \
176
+ --reasoning-parser minimax \
177
+ --trust-remote-code \
178
+ --tp 2 \
179
+ --mem-fraction-static ${GPU_UTIL} \
180
+ "$@"
181
+ # --reasoning-parser minimax-append-think \
182
+ ```
183
+
184
+ ## 🔬 Quantization method
185
+
186
+ Quantization was quite complex for this model and was done in 3 steps:
187
+ 1. Original weights are in FP8; they were dequantized to BF16 because llm-compressor cannot process FP8 directly.
188
+ 2. llm-compressor was used to quantize the MLP experts projection using AWQ, with [PR #2171](https://github.com/vllm-project/llm-compressor/pull/2171) to ensure they were all activated.
189
+ 3. Stitching the FrankenQuant: I combined the original weights, including the 2D-block FP8, with the experts-only AWQ weights.
190
+
191
+ The llmcompressor library was used with the following recipe:
192
+
193
+ ```yaml
194
+ default_stage:
195
+ default_modifiers:
196
+ AWQModifier:
197
+ config_groups:
198
+ mlp_experts_projections:
199
+ # Include only MLP expert weights for 4-bit quantization
200
+ targets: ["re:.*block_sparse_moe\\.experts\\.\\d+\\.(w1|w2|w3)$"]
201
+ weights:
202
+ num_bits: 4
203
+ type: int
204
+ symmetric: true
205
+ group_size: 32
206
+ strategy: group
207
+ dynamic: false
208
+ # actorder: group
209
+ observer: memoryless_minmax
210
+
211
+ mappings:
212
+ - smooth_layer: re:.*post_attention_layernorm$
213
+ balance_layers: ["re:.*w1$", "re:.*w3$"]
214
+ - smooth_layer: re:.*w3$
215
+ balance_layers: ["re:.*w2$"]
216
+ duo_scaling: true
217
+ ```
218
+
219
+ The calibration set had 590 examples, 8192 sequence length, 60 programming languages, 12 spoken languages and is detailed at [calibrate_software_engineer.yaml](./calibrate_software_engineer.yaml)
220
+
221
+ ## Quantization theory and heuristics for manual tuning
222
+
223
+ <details>
224
+ <summary>In-depth overview of quantization theory and heuristics for manual tuning</summary>
225
+
226
+ ### Layers to quantize
227
+
228
+ Quantization should be focused on Linear layers (also called Dense or Fully-Connected layers i.e. MatMul+Bias)
229
+ In particular quantizing LayerNorm/RMSnorm layer is strongly discouraged, see [1]
230
+ > LayerNorm in Quantization. Kovaleva et al. (2021); Wei et al. (2022) find that outliers in the
231
+ > LayerNorm parameters of BERT (Devlin et al., 2019) cause difficulties in model compression.
232
+ > Given the importance of LayerNorm, all the quantization methods we discuss above leave LayerNorm unquantized.
233
+
234
+ This is also reported in Intel and Nvidia repo:
235
+ - https://github.com/intel/neural-compressor/issues/1963#issuecomment-2274873441
236
+ - https://github.com/NVIDIA/TensorRT/issues/4084#issuecomment-2294513950
237
+
238
+ ### Tensors to up-quantize
239
+
240
+ If there are enough bits, down projections should be prioritized.
241
+
242
+ According to [4]
243
+ > Fig. 3: Maximum absolute value over layers for a LLaMA3-8B.
244
+ > Each color represent a different projection and we clearly see that down_proj has the biggest
245
+ > spikes in input and output. We also observe that RMSNorm propagate spikes through the entire model
246
+
247
+ According to [5]
248
+ > Figure 5(a) illustrates the extremal ratio across layers and modules in LLaMA2-7B, highlighting
249
+ > that weight outliers are concentrated in the down-projection matrices Wdown
250
+ > ℓ of the second layer and
251
+ > the last two layers. Figures 5(b) and 5(c) provide detailed visualizations of these outliers in the last
252
+ > two layers.
253
+
254
+ ### Mixture-of-Experts quantization (MoE)
255
+
256
+ Mixture-of-Experts models require specific quantization techniques.
257
+
258
+ #### Mixed-precision quantization
259
+
260
+ Some layers have a higher impact on LLM performance.
261
+ According to [2], spending more bits in attention layers results in a larger gain compared to spending them in FFN layers.
262
+ According to [3] on 2-bit quantization:
263
+ - quantizing expert FFN layers do not seriously impact model quality
264
+ - quantizing cross-attention has some impact
265
+ - quantizing self-attention has a large impact
266
+ - quantizing dense FFN has a very significant impact
267
+
268
+ Hence to preserve model quality we should choose not to quantize dense FFN layers and self-attention layers.
269
+
270
+ We notice that:
271
+ - official MXFP4 weights of gpt-oss-120b from OpenAI keep self-attention in BF16:
272
+ - https://huggingface.co/openai/gpt-oss-120b/blob/main/model.safetensors.index.json
273
+ - NVFP4 weights of DeepSeek-R1 quantized by Nvidia also keep self-attention in BF16:
274
+ - https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4/blob/main/model.safetensors.index.json
275
+
276
+ #### Layers with high-impact
277
+
278
+ According to [2], giving more bits to the first `k` blocks has a significantly higher impact on model quality than giving them to the last `k` blocks.
279
+
280
+ #### Expert quantization
281
+
282
+ When quantizing MoE, quantizing activations is tricky as only a subset of experts are activated per request. You have to make sure all experts are calibrated.
283
+
284
+ <details>
285
+ <summary>Visual showcase of why ensuring quantization of all MoE experts is important</summary>
286
+
287
+ - Source: https://avtc.github.io/aquarium-side-by-side/
288
+ - Context: https://github.com/ModelCloud/GPTQModel/pull/2235
289
+
290
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/67f26fd2c7b14380431d1f5a/BDc3-0m3_WLl3ZmbBMhmd.png)
291
+
292
+ </details>
293
+
294
+ ## References
295
+
296
+ 1. Why Do Some Inputs Break Low-Bit LLM Quantization? (2025)\
297
+ Ting-Yun Chang, Muru Zhang, Jesse Thomason, Robin Jia\
298
+ https://arxiv.org/pdf/2506.12044
299
+
300
+ 2. Examining Post-Training Quantization for Mixture-of-Experts: A Benchmark (2024)\
301
+ Pingzhi Li, Xiaolong Jin, Yu Cheng, Tianlong Chen\
302
+ https://arxiv.org/pdf/2406.08155v1
303
+
304
+ 3. Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit Quantization and Robustness (2023)\
305
+ Young Jin Kim, Raffy Fahim, Hany Hassan Awadalla\
306
+ https://arxiv.org/pdf/2310.02410
307
+
308
+ 4. Precision Where It Matters: A Novel Spike\
309
+ Aware Mixed-Precision Quantization Strategy for\
310
+ LLaMA-based Language Models (2025)\
311
+ Lucas Maisonnave, Cyril Moineau, Olivier Bichler, and Fabrice Rastello\
312
+ https://arxiv.org/pdf/2504.21553
313
+
314
+ 5. Systematic Outliers in Large Language Models (2025)\
315
+ Yongqi An, Xu Zhao, Tao Yu, Ming Tang, Jinqiao Wang\
316
+ https://arxiv.org/pdf/2502.06415v2
317
+
318
+ </details>
added_tokens.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</minimax:tool_call>": 200053,
3
+ "</think>": 200051,
4
+ "<add_file>": 200036,
5
+ "<code_context>": 200043,
6
+ "<code_interpreter>": 200023,
7
+ "<commit_after>": 200018,
8
+ "<commit_before>": 200016,
9
+ "<commit_message>": 200040,
10
+ "<commit_msg>": 200017,
11
+ "<delete_file>": 200037,
12
+ "<edit_file>": 200039,
13
+ "<empty_output>": 200015,
14
+ "<empty_source_file>": 200041,
15
+ "<file_content>": 200044,
16
+ "<file_sep>": 200049,
17
+ "<filename>": 200006,
18
+ "<filepath>": 200048,
19
+ "<fim_middle>": 200002,
20
+ "<fim_pad>": 200004,
21
+ "<fim_prefix>": 200001,
22
+ "<fim_suffix>": 200003,
23
+ "<function_call>": 200022,
24
+ "<gh_stars>": 200007,
25
+ "<issue_closed>": 200010,
26
+ "<issue_comment>": 200009,
27
+ "<issue_start>": 200008,
28
+ "<jupyter_code>": 200013,
29
+ "<jupyter_error>": 200035,
30
+ "<jupyter_output>": 200014,
31
+ "<jupyter_start>": 200011,
32
+ "<jupyter_text>": 200012,
33
+ "<minimax:tool_call>": 200052,
34
+ "<pr_start>": 200046,
35
+ "<rename_file>": 200038,
36
+ "<repo_struct>": 200042,
37
+ "<reponame>": 200005,
38
+ "<review_comment>": 200047,
39
+ "<source_files>": 200045,
40
+ "<think>": 200050,
41
+ "[e~[": 200020,
42
+ "]!d~[": 200021,
43
+ "]!p~[": 200000,
44
+ "]<]end of image[>[": 200030,
45
+ "]<]end of speech[>[": 200028,
46
+ "]<]end of video[>[": 200032,
47
+ "]<]image[>[": 200025,
48
+ "]<]speech[>[": 200024,
49
+ "]<]start of image[>[": 200029,
50
+ "]<]start of speech[>[": 200027,
51
+ "]<]start of video[>[": 200031,
52
+ "]<]video[>[": 200026,
53
+ "]<]vision pad[>[": 200033,
54
+ "]~!b[": 200034,
55
+ "]~b]": 200019
56
+ }
chat_template.jinja ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# ----------‑‑‑ special token variables ‑‑‑---------- #}
2
+ {%- set toolcall_begin_token = '<minimax:tool_call>' -%}
3
+ {%- set toolcall_end_token = '</minimax:tool_call>' -%}
4
+ {#- Tool Rendering Functions ============================================== -#}
5
+ {%- macro render_tool_namespace(namespace_name, tool_list) -%}
6
+ {%- for tool in tool_list -%}
7
+ <tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
8
+ {% endfor -%}
9
+ {%- endmacro -%}
10
+ {%- macro visible_text(content) -%}
11
+ {%- if content is string -%}
12
+ {{ content }}
13
+ {%- elif content is iterable and content is not mapping -%}
14
+ {%- for item in content -%}
15
+ {%- if item is mapping and item.type == 'text' -%}
16
+ {{- item.text }}
17
+ {%- elif item is string -%}
18
+ {{- item }}
19
+ {%- endif -%}
20
+ {%- endfor -%}
21
+ {%- elif content is none -%}
22
+ {{- '' }}
23
+ {%- else -%}
24
+ {{- content }}
25
+ {%- endif -%}
26
+ {%- endmacro -%}
27
+ {#- System Message Construction ============================================ -#}
28
+ {%- macro build_system_message(system_message) -%}
29
+ {%- if system_message and system_message.content -%}
30
+ {{- visible_text(system_message.content) }}
31
+ {%- else -%}
32
+ {%- if model_identity is not defined -%}
33
+ {%- set model_identity = "You are a helpful assistant. Your name is MiniMax-M2.1 and is built by MiniMax." -%}
34
+ {%- endif -%}
35
+ {{- model_identity }}
36
+ {%- endif -%}
37
+
38
+ {#- Handle current_date -#}
39
+ {%- if system_message and system_message.current_date -%}
40
+ {{- '\n' ~ 'Current date: ' + system_message.current_date }}
41
+ {%- endif -%}
42
+ {#- Handle current_location -#}
43
+ {%- if system_message and system_message.current_location -%}
44
+ {{- '\n' ~ 'Current location: ' + system_message.current_location }}
45
+ {%- endif -%}
46
+ {%- endmacro -%}
47
+ {#- Main Template Logic ================================================= -#}
48
+ {#- Extract system message (only first message if it's system) -#}
49
+ {%- set system_message = none -%}
50
+ {%- set conversation_messages = messages -%}
51
+ {%- if messages and messages[0].role == "system" -%}
52
+ {%- set system_message = messages[0] -%}
53
+ {%- set conversation_messages = messages[1:] -%}
54
+ {%- endif -%}
55
+ {#- Get the last user message turn, for interleved thinking -#}
56
+ {%- set ns = namespace(last_user_index=-1) %}
57
+ {% for m in conversation_messages %}
58
+ {%- if m.role == 'user' %}
59
+ {% set ns.last_user_index = loop.index0 -%}
60
+ {%- endif %}
61
+ {%- endfor %}
62
+ {#- Render system message -#}
63
+ {{- ']~!b[' ~ ']~b]system' ~ '\n' }}
64
+ {{- build_system_message(system_message) }}
65
+ {#- Render tools if available -#}
66
+ {%- if tools -%}
67
+ {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
68
+ {{- '\n' ~ '<tools>' ~ '\n' }}
69
+ {{- render_tool_namespace("functions", tools) }}
70
+ {{- '</tools>' ~ '\n\n' }}
71
+ {{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
72
+ {{- '\n' ~ toolcall_begin_token }}
73
+ <invoke name="tool-name-1">
74
+ <parameter name="param-key-1">param-value-1</parameter>
75
+ <parameter name="param-key-2">param-value-2</parameter>
76
+ ...
77
+ </invoke>
78
+ {{- '\n' ~ toolcall_end_token }}
79
+ {%- endif -%}
80
+ {{- '[e~[\n' }}
81
+
82
+ {#- Render messages -#}
83
+ {%- set last_tool_call = namespace(name=none) -%}
84
+ {%- for message in conversation_messages -%}
85
+ {%- if message.role == 'assistant' -%}
86
+ {#- Only render reasoning_content if no user message follows -#}
87
+ {{- ']~b]ai' ~ '\n' }}
88
+
89
+ {%- set reasoning_content = '' %}
90
+ {%- set content = visible_text(message.content) %}
91
+ {%- if message.reasoning_content is string %}
92
+ {%- set reasoning_content = message.reasoning_content %}
93
+ {%- else %}
94
+ {%- if '</think>' in content %}
95
+ {%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
96
+ {%- set content = content.split('</think>')[-1].strip('\n') %}
97
+ {%- endif %}
98
+ {%- endif %}
99
+ {%- if reasoning_content and loop.index0 > ns.last_user_index -%}
100
+ {{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
101
+ {%- endif -%}
102
+ {%- if content -%}
103
+ {{- content }}
104
+ {%- endif -%}
105
+ {%- if message.tool_calls -%}
106
+ {{- '\n' ~ toolcall_begin_token ~ '\n' }}
107
+
108
+ {%- for tool_call in message.tool_calls -%}
109
+ {%- if tool_call.function %}
110
+ {%- set tool_call = tool_call.function %}
111
+ {%- endif %}
112
+ {{- '<invoke name="' + tool_call.name + '">' }}
113
+ {% set _args = tool_call.arguments %}
114
+ {%- for k, v in _args.items() %}
115
+ {{- '<parameter name="' + k + '">' }}
116
+ {{- v | tojson(ensure_ascii=False) if v is not string else v }}
117
+ {{- '</parameter>' }}
118
+ {% endfor %}
119
+ {{- '</invoke>' ~ '\n' }}
120
+ {%- endfor -%}
121
+
122
+ {{- toolcall_end_token}}
123
+ {%- if message.tool_calls[-1].function -%}
124
+ {%- set last_tool_call.name = message.tool_calls[-1].function.name -%}
125
+ {%- else -%}
126
+ {%- set last_tool_call.name = message.tool_calls[-1].name -%}
127
+ {%- endif -%}
128
+ {%- else -%}
129
+ {%- set last_tool_call.name = none -%}
130
+ {%- endif -%}
131
+ {{- '[e~[' ~ '\n' }}
132
+
133
+ {%- elif message.role == 'tool' -%}
134
+ {%- if last_tool_call.name is none -%}
135
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
136
+ {%- endif -%}
137
+ {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
138
+ {{- ']~b]tool' }}
139
+ {%- endif -%}
140
+ {%- if message.content is string -%}
141
+ {{- '\n<response>' }}
142
+ {{- message.content }}
143
+ {{- '</response>' }}
144
+ {%- else -%}
145
+ {%- for tr in message.content -%}
146
+ {{- '\n<response>' }}
147
+ {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
148
+ {{- '\n</response>' }}
149
+ {%- endfor -%}
150
+ {%- endif -%}
151
+ {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
152
+ {{- '[e~[\n' -}}
153
+ {%- endif -%}
154
+
155
+ {%- elif message.role == 'user' -%}
156
+ {{- ']~b]user' ~ '\n' }}
157
+ {{- visible_text(message.content) }}
158
+ {{- '[e~[' ~ '\n' }}
159
+ {%- endif -%}
160
+ {%- endfor -%}
161
+
162
+ {#- Generation prompt -#}
163
+ {%- if add_generation_prompt -%}
164
+ {{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
165
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniMaxM2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_type_list": [
7
+ 1,
8
+ 1,
9
+ 1,
10
+ 1,
11
+ 1,
12
+ 1,
13
+ 1,
14
+ 1,
15
+ 1,
16
+ 1,
17
+ 1,
18
+ 1,
19
+ 1,
20
+ 1,
21
+ 1,
22
+ 1,
23
+ 1,
24
+ 1,
25
+ 1,
26
+ 1,
27
+ 1,
28
+ 1,
29
+ 1,
30
+ 1,
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1,
37
+ 1,
38
+ 1,
39
+ 1,
40
+ 1,
41
+ 1,
42
+ 1,
43
+ 1,
44
+ 1,
45
+ 1,
46
+ 1,
47
+ 1,
48
+ 1,
49
+ 1,
50
+ 1,
51
+ 1,
52
+ 1,
53
+ 1,
54
+ 1,
55
+ 1,
56
+ 1,
57
+ 1,
58
+ 1,
59
+ 1,
60
+ 1,
61
+ 1,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1
69
+ ],
70
+ "auto_map": {
71
+ "AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
72
+ "AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
73
+ },
74
+ "bos_token_id": 1,
75
+ "dtype": "bfloat16",
76
+ "eos_token_id": 2,
77
+ "head_dim": 128,
78
+ "hidden_act": "silu",
79
+ "hidden_size": 3072,
80
+ "initializer_range": 0.02,
81
+ "intermediate_size": 1536,
82
+ "max_position_embeddings": 196608,
83
+ "model_type": "minimax_m2",
84
+ "mtp_transformer_layers": 1,
85
+ "num_attention_heads": 48,
86
+ "num_experts_per_tok": 8,
87
+ "num_hidden_layers": 62,
88
+ "num_key_value_heads": 8,
89
+ "num_local_experts": 256,
90
+ "num_mtp_modules": 3,
91
+ "output_router_logits": false,
92
+ "partial_rotary_factor": 0.5,
93
+ "qk_norm_type": "per_layer",
94
+ "quantization_config": {
95
+ "config_groups": {
96
+ "group_0": {
97
+ "format": "pack-quantized",
98
+ "input_activations": null,
99
+ "output_activations": null,
100
+ "targets": [
101
+ "Linear",
102
+ "re:.*block_sparse_moe\\.experts\\.\\d+\\.(w1|w2|w3)$",
103
+ "re:.*self_attn\\.qkv_proj$"
104
+ ],
105
+ "weights": {
106
+ "actorder": null,
107
+ "block_structure": null,
108
+ "dynamic": false,
109
+ "group_size": 32,
110
+ "num_bits": 4,
111
+ "observer": "memoryless_minmax",
112
+ "observer_kwargs": {},
113
+ "strategy": "group",
114
+ "symmetric": true,
115
+ "type": "int"
116
+ }
117
+ }
118
+ },
119
+ "format": "pack-quantized",
120
+ "global_compression_ratio": null,
121
+ "ignore": [
122
+ "model.layers.0.self_attn.q_proj",
123
+ "model.layers.0.self_attn.k_proj",
124
+ "model.layers.0.self_attn.v_proj",
125
+ "model.layers.0.self_attn.o_proj",
126
+ "model.layers.0.self_attn.qkv_proj",
127
+ "model.layers.0.block_sparse_moe.gate",
128
+ "model.layers.1.self_attn.q_proj",
129
+ "model.layers.1.self_attn.k_proj",
130
+ "model.layers.1.self_attn.v_proj",
131
+ "model.layers.1.self_attn.o_proj",
132
+ "model.layers.1.self_attn.qkv_proj",
133
+ "model.layers.1.block_sparse_moe.gate",
134
+ "model.layers.2.self_attn.q_proj",
135
+ "model.layers.2.self_attn.k_proj",
136
+ "model.layers.2.self_attn.v_proj",
137
+ "model.layers.2.self_attn.o_proj",
138
+ "model.layers.2.self_attn.qkv_proj",
139
+ "model.layers.2.block_sparse_moe.gate",
140
+ "model.layers.3.self_attn.q_proj",
141
+ "model.layers.3.self_attn.k_proj",
142
+ "model.layers.3.self_attn.v_proj",
143
+ "model.layers.3.self_attn.o_proj",
144
+ "model.layers.3.self_attn.qkv_proj",
145
+ "model.layers.3.block_sparse_moe.gate",
146
+ "model.layers.4.self_attn.q_proj",
147
+ "model.layers.4.self_attn.k_proj",
148
+ "model.layers.4.self_attn.v_proj",
149
+ "model.layers.4.self_attn.o_proj",
150
+ "model.layers.4.self_attn.qkv_proj",
151
+ "model.layers.4.block_sparse_moe.gate",
152
+ "model.layers.5.self_attn.q_proj",
153
+ "model.layers.5.self_attn.k_proj",
154
+ "model.layers.5.self_attn.v_proj",
155
+ "model.layers.5.self_attn.o_proj",
156
+ "model.layers.5.self_attn.qkv_proj",
157
+ "model.layers.5.block_sparse_moe.gate",
158
+ "model.layers.6.self_attn.q_proj",
159
+ "model.layers.6.self_attn.k_proj",
160
+ "model.layers.6.self_attn.v_proj",
161
+ "model.layers.6.self_attn.o_proj",
162
+ "model.layers.6.self_attn.qkv_proj",
163
+ "model.layers.6.block_sparse_moe.gate",
164
+ "model.layers.7.self_attn.q_proj",
165
+ "model.layers.7.self_attn.k_proj",
166
+ "model.layers.7.self_attn.v_proj",
167
+ "model.layers.7.self_attn.o_proj",
168
+ "model.layers.7.self_attn.qkv_proj",
169
+ "model.layers.7.block_sparse_moe.gate",
170
+ "model.layers.8.self_attn.q_proj",
171
+ "model.layers.8.self_attn.k_proj",
172
+ "model.layers.8.self_attn.v_proj",
173
+ "model.layers.8.self_attn.o_proj",
174
+ "model.layers.8.self_attn.qkv_proj",
175
+ "model.layers.8.block_sparse_moe.gate",
176
+ "model.layers.9.self_attn.q_proj",
177
+ "model.layers.9.self_attn.k_proj",
178
+ "model.layers.9.self_attn.v_proj",
179
+ "model.layers.9.self_attn.o_proj",
180
+ "model.layers.9.self_attn.qkv_proj",
181
+ "model.layers.9.block_sparse_moe.gate",
182
+ "model.layers.10.self_attn.q_proj",
183
+ "model.layers.10.self_attn.k_proj",
184
+ "model.layers.10.self_attn.v_proj",
185
+ "model.layers.10.self_attn.o_proj",
186
+ "model.layers.10.self_attn.qkv_proj",
187
+ "model.layers.10.block_sparse_moe.gate",
188
+ "model.layers.11.self_attn.q_proj",
189
+ "model.layers.11.self_attn.k_proj",
190
+ "model.layers.11.self_attn.v_proj",
191
+ "model.layers.11.self_attn.o_proj",
192
+ "model.layers.11.self_attn.qkv_proj",
193
+ "model.layers.11.block_sparse_moe.gate",
194
+ "model.layers.12.self_attn.q_proj",
195
+ "model.layers.12.self_attn.k_proj",
196
+ "model.layers.12.self_attn.v_proj",
197
+ "model.layers.12.self_attn.o_proj",
198
+ "model.layers.12.self_attn.qkv_proj",
199
+ "model.layers.12.block_sparse_moe.gate",
200
+ "model.layers.13.self_attn.q_proj",
201
+ "model.layers.13.self_attn.k_proj",
202
+ "model.layers.13.self_attn.v_proj",
203
+ "model.layers.13.self_attn.o_proj",
204
+ "model.layers.13.self_attn.qkv_proj",
205
+ "model.layers.13.block_sparse_moe.gate",
206
+ "model.layers.14.self_attn.q_proj",
207
+ "model.layers.14.self_attn.k_proj",
208
+ "model.layers.14.self_attn.v_proj",
209
+ "model.layers.14.self_attn.o_proj",
210
+ "model.layers.14.self_attn.qkv_proj",
211
+ "model.layers.14.block_sparse_moe.gate",
212
+ "model.layers.15.self_attn.q_proj",
213
+ "model.layers.15.self_attn.k_proj",
214
+ "model.layers.15.self_attn.v_proj",
215
+ "model.layers.15.self_attn.o_proj",
216
+ "model.layers.15.self_attn.qkv_proj",
217
+ "model.layers.15.block_sparse_moe.gate",
218
+ "model.layers.16.self_attn.q_proj",
219
+ "model.layers.16.self_attn.k_proj",
220
+ "model.layers.16.self_attn.v_proj",
221
+ "model.layers.16.self_attn.o_proj",
222
+ "model.layers.16.self_attn.qkv_proj",
223
+ "model.layers.16.block_sparse_moe.gate",
224
+ "model.layers.17.self_attn.q_proj",
225
+ "model.layers.17.self_attn.k_proj",
226
+ "model.layers.17.self_attn.v_proj",
227
+ "model.layers.17.self_attn.o_proj",
228
+ "model.layers.17.self_attn.qkv_proj",
229
+ "model.layers.17.block_sparse_moe.gate",
230
+ "model.layers.18.self_attn.q_proj",
231
+ "model.layers.18.self_attn.k_proj",
232
+ "model.layers.18.self_attn.v_proj",
233
+ "model.layers.18.self_attn.o_proj",
234
+ "model.layers.18.self_attn.qkv_proj",
235
+ "model.layers.18.block_sparse_moe.gate",
236
+ "model.layers.19.self_attn.q_proj",
237
+ "model.layers.19.self_attn.k_proj",
238
+ "model.layers.19.self_attn.v_proj",
239
+ "model.layers.19.self_attn.o_proj",
240
+ "model.layers.19.self_attn.qkv_proj",
241
+ "model.layers.19.block_sparse_moe.gate",
242
+ "model.layers.20.self_attn.q_proj",
243
+ "model.layers.20.self_attn.k_proj",
244
+ "model.layers.20.self_attn.v_proj",
245
+ "model.layers.20.self_attn.o_proj",
246
+ "model.layers.20.self_attn.qkv_proj",
247
+ "model.layers.20.block_sparse_moe.gate",
248
+ "model.layers.21.self_attn.q_proj",
249
+ "model.layers.21.self_attn.k_proj",
250
+ "model.layers.21.self_attn.v_proj",
251
+ "model.layers.21.self_attn.o_proj",
252
+ "model.layers.21.self_attn.qkv_proj",
253
+ "model.layers.21.block_sparse_moe.gate",
254
+ "model.layers.22.self_attn.q_proj",
255
+ "model.layers.22.self_attn.k_proj",
256
+ "model.layers.22.self_attn.v_proj",
257
+ "model.layers.22.self_attn.o_proj",
258
+ "model.layers.22.self_attn.qkv_proj",
259
+ "model.layers.22.block_sparse_moe.gate",
260
+ "model.layers.23.self_attn.q_proj",
261
+ "model.layers.23.self_attn.k_proj",
262
+ "model.layers.23.self_attn.v_proj",
263
+ "model.layers.23.self_attn.o_proj",
264
+ "model.layers.23.self_attn.qkv_proj",
265
+ "model.layers.23.block_sparse_moe.gate",
266
+ "model.layers.24.self_attn.q_proj",
267
+ "model.layers.24.self_attn.k_proj",
268
+ "model.layers.24.self_attn.v_proj",
269
+ "model.layers.24.self_attn.o_proj",
270
+ "model.layers.24.self_attn.qkv_proj",
271
+ "model.layers.24.block_sparse_moe.gate",
272
+ "model.layers.25.self_attn.q_proj",
273
+ "model.layers.25.self_attn.k_proj",
274
+ "model.layers.25.self_attn.v_proj",
275
+ "model.layers.25.self_attn.o_proj",
276
+ "model.layers.25.self_attn.qkv_proj",
277
+ "model.layers.25.block_sparse_moe.gate",
278
+ "model.layers.26.self_attn.q_proj",
279
+ "model.layers.26.self_attn.k_proj",
280
+ "model.layers.26.self_attn.v_proj",
281
+ "model.layers.26.self_attn.o_proj",
282
+ "model.layers.26.self_attn.qkv_proj",
283
+ "model.layers.26.block_sparse_moe.gate",
284
+ "model.layers.27.self_attn.q_proj",
285
+ "model.layers.27.self_attn.k_proj",
286
+ "model.layers.27.self_attn.v_proj",
287
+ "model.layers.27.self_attn.o_proj",
288
+ "model.layers.27.self_attn.qkv_proj",
289
+ "model.layers.27.block_sparse_moe.gate",
290
+ "model.layers.28.self_attn.q_proj",
291
+ "model.layers.28.self_attn.k_proj",
292
+ "model.layers.28.self_attn.v_proj",
293
+ "model.layers.28.self_attn.o_proj",
294
+ "model.layers.28.self_attn.qkv_proj",
295
+ "model.layers.28.block_sparse_moe.gate",
296
+ "model.layers.29.self_attn.q_proj",
297
+ "model.layers.29.self_attn.k_proj",
298
+ "model.layers.29.self_attn.v_proj",
299
+ "model.layers.29.self_attn.o_proj",
300
+ "model.layers.29.self_attn.qkv_proj",
301
+ "model.layers.29.block_sparse_moe.gate",
302
+ "model.layers.30.self_attn.q_proj",
303
+ "model.layers.30.self_attn.k_proj",
304
+ "model.layers.30.self_attn.v_proj",
305
+ "model.layers.30.self_attn.o_proj",
306
+ "model.layers.30.self_attn.qkv_proj",
307
+ "model.layers.30.block_sparse_moe.gate",
308
+ "model.layers.31.self_attn.q_proj",
309
+ "model.layers.31.self_attn.k_proj",
310
+ "model.layers.31.self_attn.v_proj",
311
+ "model.layers.31.self_attn.o_proj",
312
+ "model.layers.31.self_attn.qkv_proj",
313
+ "model.layers.31.block_sparse_moe.gate",
314
+ "model.layers.32.self_attn.q_proj",
315
+ "model.layers.32.self_attn.k_proj",
316
+ "model.layers.32.self_attn.v_proj",
317
+ "model.layers.32.self_attn.o_proj",
318
+ "model.layers.32.self_attn.qkv_proj",
319
+ "model.layers.32.block_sparse_moe.gate",
320
+ "model.layers.33.self_attn.q_proj",
321
+ "model.layers.33.self_attn.k_proj",
322
+ "model.layers.33.self_attn.v_proj",
323
+ "model.layers.33.self_attn.o_proj",
324
+ "model.layers.33.self_attn.qkv_proj",
325
+ "model.layers.33.block_sparse_moe.gate",
326
+ "model.layers.34.self_attn.q_proj",
327
+ "model.layers.34.self_attn.k_proj",
328
+ "model.layers.34.self_attn.v_proj",
329
+ "model.layers.34.self_attn.o_proj",
330
+ "model.layers.34.self_attn.qkv_proj",
331
+ "model.layers.34.block_sparse_moe.gate",
332
+ "model.layers.35.self_attn.q_proj",
333
+ "model.layers.35.self_attn.k_proj",
334
+ "model.layers.35.self_attn.v_proj",
335
+ "model.layers.35.self_attn.o_proj",
336
+ "model.layers.35.self_attn.qkv_proj",
337
+ "model.layers.35.block_sparse_moe.gate",
338
+ "model.layers.36.self_attn.q_proj",
339
+ "model.layers.36.self_attn.k_proj",
340
+ "model.layers.36.self_attn.v_proj",
341
+ "model.layers.36.self_attn.o_proj",
342
+ "model.layers.36.self_attn.qkv_proj",
343
+ "model.layers.36.block_sparse_moe.gate",
344
+ "model.layers.37.self_attn.q_proj",
345
+ "model.layers.37.self_attn.k_proj",
346
+ "model.layers.37.self_attn.v_proj",
347
+ "model.layers.37.self_attn.o_proj",
348
+ "model.layers.37.self_attn.qkv_proj",
349
+ "model.layers.37.block_sparse_moe.gate",
350
+ "model.layers.38.self_attn.q_proj",
351
+ "model.layers.38.self_attn.k_proj",
352
+ "model.layers.38.self_attn.v_proj",
353
+ "model.layers.38.self_attn.o_proj",
354
+ "model.layers.38.self_attn.qkv_proj",
355
+ "model.layers.38.block_sparse_moe.gate",
356
+ "model.layers.39.self_attn.q_proj",
357
+ "model.layers.39.self_attn.k_proj",
358
+ "model.layers.39.self_attn.v_proj",
359
+ "model.layers.39.self_attn.o_proj",
360
+ "model.layers.39.self_attn.qkv_proj",
361
+ "model.layers.39.block_sparse_moe.gate",
362
+ "model.layers.40.self_attn.q_proj",
363
+ "model.layers.40.self_attn.k_proj",
364
+ "model.layers.40.self_attn.v_proj",
365
+ "model.layers.40.self_attn.o_proj",
366
+ "model.layers.40.self_attn.qkv_proj",
367
+ "model.layers.40.block_sparse_moe.gate",
368
+ "model.layers.41.self_attn.q_proj",
369
+ "model.layers.41.self_attn.k_proj",
370
+ "model.layers.41.self_attn.v_proj",
371
+ "model.layers.41.self_attn.o_proj",
372
+ "model.layers.41.self_attn.qkv_proj",
373
+ "model.layers.41.block_sparse_moe.gate",
374
+ "model.layers.42.self_attn.q_proj",
375
+ "model.layers.42.self_attn.k_proj",
376
+ "model.layers.42.self_attn.v_proj",
377
+ "model.layers.42.self_attn.o_proj",
378
+ "model.layers.42.self_attn.qkv_proj",
379
+ "model.layers.42.block_sparse_moe.gate",
380
+ "model.layers.43.self_attn.q_proj",
381
+ "model.layers.43.self_attn.k_proj",
382
+ "model.layers.43.self_attn.v_proj",
383
+ "model.layers.43.self_attn.o_proj",
384
+ "model.layers.43.self_attn.qkv_proj",
385
+ "model.layers.43.block_sparse_moe.gate",
386
+ "model.layers.44.self_attn.q_proj",
387
+ "model.layers.44.self_attn.k_proj",
388
+ "model.layers.44.self_attn.v_proj",
389
+ "model.layers.44.self_attn.o_proj",
390
+ "model.layers.44.self_attn.qkv_proj",
391
+ "model.layers.44.block_sparse_moe.gate",
392
+ "model.layers.45.self_attn.q_proj",
393
+ "model.layers.45.self_attn.k_proj",
394
+ "model.layers.45.self_attn.v_proj",
395
+ "model.layers.45.self_attn.o_proj",
396
+ "model.layers.45.self_attn.qkv_proj",
397
+ "model.layers.45.block_sparse_moe.gate",
398
+ "model.layers.46.self_attn.q_proj",
399
+ "model.layers.46.self_attn.k_proj",
400
+ "model.layers.46.self_attn.v_proj",
401
+ "model.layers.46.self_attn.o_proj",
402
+ "model.layers.46.self_attn.qkv_proj",
403
+ "model.layers.46.block_sparse_moe.gate",
404
+ "model.layers.47.self_attn.q_proj",
405
+ "model.layers.47.self_attn.k_proj",
406
+ "model.layers.47.self_attn.v_proj",
407
+ "model.layers.47.self_attn.o_proj",
408
+ "model.layers.47.self_attn.qkv_proj",
409
+ "model.layers.47.block_sparse_moe.gate",
410
+ "model.layers.48.self_attn.q_proj",
411
+ "model.layers.48.self_attn.k_proj",
412
+ "model.layers.48.self_attn.v_proj",
413
+ "model.layers.48.self_attn.o_proj",
414
+ "model.layers.48.self_attn.qkv_proj",
415
+ "model.layers.48.block_sparse_moe.gate",
416
+ "model.layers.49.self_attn.q_proj",
417
+ "model.layers.49.self_attn.k_proj",
418
+ "model.layers.49.self_attn.v_proj",
419
+ "model.layers.49.self_attn.o_proj",
420
+ "model.layers.49.self_attn.qkv_proj",
421
+ "model.layers.49.block_sparse_moe.gate",
422
+ "model.layers.50.self_attn.q_proj",
423
+ "model.layers.50.self_attn.k_proj",
424
+ "model.layers.50.self_attn.v_proj",
425
+ "model.layers.50.self_attn.o_proj",
426
+ "model.layers.50.self_attn.qkv_proj",
427
+ "model.layers.50.block_sparse_moe.gate",
428
+ "model.layers.51.self_attn.q_proj",
429
+ "model.layers.51.self_attn.k_proj",
430
+ "model.layers.51.self_attn.v_proj",
431
+ "model.layers.51.self_attn.o_proj",
432
+ "model.layers.51.self_attn.qkv_proj",
433
+ "model.layers.51.block_sparse_moe.gate",
434
+ "model.layers.52.self_attn.q_proj",
435
+ "model.layers.52.self_attn.k_proj",
436
+ "model.layers.52.self_attn.v_proj",
437
+ "model.layers.52.self_attn.o_proj",
438
+ "model.layers.52.self_attn.qkv_proj",
439
+ "model.layers.52.block_sparse_moe.gate",
440
+ "model.layers.53.self_attn.q_proj",
441
+ "model.layers.53.self_attn.k_proj",
442
+ "model.layers.53.self_attn.v_proj",
443
+ "model.layers.53.self_attn.o_proj",
444
+ "model.layers.53.self_attn.qkv_proj",
445
+ "model.layers.53.block_sparse_moe.gate",
446
+ "model.layers.54.self_attn.q_proj",
447
+ "model.layers.54.self_attn.k_proj",
448
+ "model.layers.54.self_attn.v_proj",
449
+ "model.layers.54.self_attn.o_proj",
450
+ "model.layers.54.self_attn.qkv_proj",
451
+ "model.layers.54.block_sparse_moe.gate",
452
+ "model.layers.55.self_attn.q_proj",
453
+ "model.layers.55.self_attn.k_proj",
454
+ "model.layers.55.self_attn.v_proj",
455
+ "model.layers.55.self_attn.o_proj",
456
+ "model.layers.55.self_attn.qkv_proj",
457
+ "model.layers.55.block_sparse_moe.gate",
458
+ "model.layers.56.self_attn.q_proj",
459
+ "model.layers.56.self_attn.k_proj",
460
+ "model.layers.56.self_attn.v_proj",
461
+ "model.layers.56.self_attn.o_proj",
462
+ "model.layers.56.self_attn.qkv_proj",
463
+ "model.layers.56.block_sparse_moe.gate",
464
+ "model.layers.57.self_attn.q_proj",
465
+ "model.layers.57.self_attn.k_proj",
466
+ "model.layers.57.self_attn.v_proj",
467
+ "model.layers.57.self_attn.o_proj",
468
+ "model.layers.57.self_attn.qkv_proj",
469
+ "model.layers.57.block_sparse_moe.gate",
470
+ "model.layers.58.self_attn.q_proj",
471
+ "model.layers.58.self_attn.k_proj",
472
+ "model.layers.58.self_attn.v_proj",
473
+ "model.layers.58.self_attn.o_proj",
474
+ "model.layers.58.self_attn.qkv_proj",
475
+ "model.layers.58.block_sparse_moe.gate",
476
+ "model.layers.59.self_attn.q_proj",
477
+ "model.layers.59.self_attn.k_proj",
478
+ "model.layers.59.self_attn.v_proj",
479
+ "model.layers.59.self_attn.o_proj",
480
+ "model.layers.59.self_attn.qkv_proj",
481
+ "model.layers.59.block_sparse_moe.gate",
482
+ "model.layers.60.self_attn.q_proj",
483
+ "model.layers.60.self_attn.k_proj",
484
+ "model.layers.60.self_attn.v_proj",
485
+ "model.layers.60.self_attn.o_proj",
486
+ "model.layers.60.self_attn.qkv_proj",
487
+ "model.layers.60.block_sparse_moe.gate",
488
+ "model.layers.61.self_attn.q_proj",
489
+ "model.layers.61.self_attn.k_proj",
490
+ "model.layers.61.self_attn.v_proj",
491
+ "model.layers.61.self_attn.o_proj",
492
+ "model.layers.61.self_attn.qkv_proj",
493
+ "model.layers.61.block_sparse_moe.gate",
494
+ "lm_head"
495
+ ],
496
+ "kv_cache_scheme": null,
497
+ "quant_method": "compressed-tensors",
498
+ "quantization_status": "compressed",
499
+ "sparsity_config": {},
500
+ "transform_config": {},
501
+ "version": "0.13.1.dev0+g797d301.d20251228"
502
+ },
503
+ "rms_norm_eps": 1e-06,
504
+ "rope_theta": 5000000,
505
+ "rotary_dim": 64,
506
+ "router_aux_loss_coef": 0.001,
507
+ "router_jitter_noise": 0.0,
508
+ "scoring_func": "sigmoid",
509
+ "shared_intermediate_size": 0,
510
+ "sliding_window": null,
511
+ "tie_word_embeddings": false,
512
+ "transformers_version": "4.57.3",
513
+ "use_cache": true,
514
+ "use_mtp": true,
515
+ "use_qk_norm": true,
516
+ "use_routing_bias": true,
517
+ "vocab_size": 200064
518
+ }
configuration_minimax_m2.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/minimax_m2/modular_minimax_m2.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_minimax_m2.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 the HuggingFace Team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+
23
+ from transformers.configuration_utils import PretrainedConfig
24
+
25
+
26
class MiniMaxM2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MiniMaxM2Model`]. It is used to instantiate an
    MiniMaxM2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MiniMaxM2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MiniMaxM2Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
        head_dim (`int`, *optional*, defaults to `None`):
            The attention head dimension. When left as `None`, downstream code is expected to derive it
            (conventionally `hidden_size // num_attention_heads`); this class stores the value as given.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. `None` disables sliding-window attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter
        num_local_experts (`int`, *optional*, defaults to 8):
            Number of experts per Sparse MLP layer.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            The aux loss factor for the total loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Amount of noise to add to the router.
        use_qk_norm (`bool`, *optional*, defaults to `False`, passed via `kwargs`):
            Whether query/key normalization is enabled.
        rotary_dim (`int`, *optional*, passed via `kwargs`, defaults to `head_dim`):
            Number of dimensions the rotary embedding is applied to.
        partial_rotary_factor (`float`, *optional*, passed via `kwargs`, defaults to 1):
            Fraction of the head dimension covered by RoPE. Recomputed as `rotary_dim / head_dim`
            whenever `head_dim` is provided.

    ```python
    >>> from transformers import MiniMaxM2Model, MiniMaxM2Config

    >>> # Initializing a MiniMaxM2 style configuration
    >>> configuration = MiniMaxM2Config()

    >>> # Initializing a model from the MiniMaxM2 style configuration
    >>> model = MiniMaxM2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "minimax_m2"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.block_sparse_moe.gate": "colwise_rep",  # we need to replicate here to correctly route experts
        "layers.*.block_sparse_moe.experts.*.w1": "colwise",
        "layers.*.block_sparse_moe.experts.*.w2": "rowwise",
        "layers.*.block_sparse_moe.experts.*.w3": "colwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        head_dim=None,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=8,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.head_dim = head_dim

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise

        # Extra MiniMax-M2 knobs arrive through kwargs so the signature stays
        # compatible with the upstream template this config was generated from.
        self.use_qk_norm = kwargs.pop("use_qk_norm", False)
        self.rotary_dim = kwargs.pop("rotary_dim", self.head_dim)
        self.partial_rotary_factor = kwargs.pop("partial_rotary_factor", 1)
        if self.head_dim is not None:
            # Keep partial_rotary_factor consistent with an explicit head_dim.
            self.partial_rotary_factor = self.rotary_dim / self.head_dim

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def __repr__(self):
        """
        Overriding __repr__ to ensure that quantization_config does not contain Enum objects.
        This prevents SyntaxError in torch.fx generated code which relies on repr() of the config.

        Note: sanitization mutates `self.quantization_config` in place so subsequent
        reprs see plain values instead of Enum members.
        """
        q_config = getattr(self, "quantization_config", None)
        if isinstance(q_config, dict):
            for k, v in list(q_config.items()):
                if hasattr(v, "value"):
                    # Replace Enum with its underlying value.
                    q_config[k] = v.value
        return super().__repr__()
213
+
214
+ __all__ = ["MiniMaxM2Config"]
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 200019,
3
+ "do_sample": true,
4
+ "eos_token_id": 200020,
5
+ "top_k": 40,
6
+ "top_p": 0.95,
7
+ "transformers_version": "4.57.3"
8
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
minimax_to_bf16.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import json
4
+ from argparse import ArgumentParser
5
+ from glob import glob
6
+ from tqdm import tqdm
7
+
8
+ import torch
9
+ from safetensors.torch import load_file, save_file
10
+
11
+
12
def weight_dequant_fp8(weight_fp8, scale_inv):
    """
    Dequantize FP8 weights to BF16 using per-block inverse scales.

    Args:
        weight_fp8: 2D FP8 weight tensor (any dtype convertible to float32 works).
        scale_inv: Inverse scale tensor (float32). If 2D, it holds one scale per
            (row-block, column-block) tile of ``weight_fp8``, and each weight
            dimension must be an exact multiple of the corresponding
            ``scale_inv`` dimension. Otherwise it is broadcast-multiplied.

    Returns:
        ``torch.Tensor`` in bfloat16 with the same shape as ``weight_fp8``.

    Raises:
        ValueError: If a 2D ``scale_inv`` does not evenly tile ``weight_fp8``
            (previously this failed with a cryptic broadcasting error).
    """
    # FP8 cannot be multiplied directly with good precision; widen to float32 first.
    weight_f32 = weight_fp8.to(torch.float32)

    if scale_inv.dim() == 2:
        # scale_inv shape is [row_blocks, col_blocks]; expand each per-block
        # scale so that it covers its whole tile of the weight matrix.
        out_blocks, in_blocks = scale_inv.shape
        rows, cols = weight_f32.shape[0], weight_f32.shape[1]
        if rows % out_blocks != 0 or cols % in_blocks != 0:
            raise ValueError(
                f"scale_inv shape {tuple(scale_inv.shape)} does not evenly tile "
                f"weight shape {tuple(weight_fp8.shape)}"
            )
        block_rows = rows // out_blocks
        block_cols = cols // in_blocks

        scale_inv_expanded = scale_inv.repeat_interleave(block_rows, dim=0)
        scale_inv_expanded = scale_inv_expanded.repeat_interleave(block_cols, dim=1)

        weight_f32 = weight_f32 * scale_inv_expanded
    else:
        # Scalar or 1D scale: rely on standard broadcasting semantics.
        weight_f32 = weight_f32 * scale_inv

    # Convert to BF16
    return weight_f32.to(torch.bfloat16)
45
+
46
+
47
def main(fp8_path, bf16_path):
    """
    Convert a sharded FP8 safetensors checkpoint to BF16.

    Reads ``model.safetensors.index.json`` from ``fp8_path``, dequantizes every
    FP8 weight using its companion ``<name>_scale_inv`` tensor, writes the
    converted shards plus an updated index (with ``_scale_inv`` entries removed)
    to ``bf16_path``.

    Args:
        fp8_path: Directory containing the FP8 checkpoint and its index file.
        bf16_path: Output directory for the BF16 checkpoint (created if missing).

    Note: tensors are loaded with ``device="cuda"``, so a CUDA device is required.
    """
    torch.set_default_dtype(torch.bfloat16)
    os.makedirs(bf16_path, exist_ok=True)

    model_index_file = os.path.join(fp8_path, "model.safetensors.index.json")
    with open(model_index_file, "r") as f:
        model_index = json.load(f)

    weight_map = model_index["weight_map"]

    # Cache for loaded safetensor files (bounded below to limit GPU memory use)
    loaded_files = {}
    fp8_weight_names = []

    # Helper function to get a tensor from whichever shard the index maps it to.
    def get_tensor(tensor_name):
        if tensor_name not in weight_map:
            return None
        file_name = weight_map[tensor_name]
        if file_name not in loaded_files:
            file_path = os.path.join(fp8_path, file_name)
            loaded_files[file_name] = load_file(file_path, device="cuda")
        return loaded_files[file_name][tensor_name]

    safetensor_files = list(glob(os.path.join(fp8_path, "*.safetensors")))
    # Defensive: exclude index files even though the glob should not match them.
    safetensor_files = [f for f in safetensor_files if not f.endswith(".index.json")]
    safetensor_files.sort()

    print(f"Found {len(safetensor_files)} safetensor files to convert")

    for safetensor_file in tqdm(safetensor_files, desc="Converting files"):
        file_name = os.path.basename(safetensor_file)
        current_state_dict = load_file(safetensor_file, device="cuda")
        loaded_files[file_name] = current_state_dict

        new_state_dict = {}

        for weight_name, weight in current_state_dict.items():
            # Skip scale_inv tensors; they are consumed during dequantization.
            if weight_name.endswith("_scale_inv"):
                continue

            # Check if this is an FP8 weight (F8_E4M3 has element_size of 1)
            if weight.dtype == torch.float8_e4m3fn or weight.element_size() == 1:
                scale_inv_name = f"{weight_name}_scale_inv"
                scale_inv = get_tensor(scale_inv_name)

                if scale_inv is not None:
                    fp8_weight_names.append(weight_name)
                    new_state_dict[weight_name] = weight_dequant_fp8(weight, scale_inv)
                else:
                    print(f"Warning: Missing scale_inv tensor for {weight_name}, keeping as-is")
                    new_state_dict[weight_name] = weight
            else:
                # Already BF16 or F32, keep as-is
                new_state_dict[weight_name] = weight

        # Save converted weights
        new_safetensor_file = os.path.join(bf16_path, file_name)
        save_file(new_state_dict, new_safetensor_file)

        # Memory management: keep only the 2 most recently loaded files.
        # `while` (not `if`): get_tensor may have pulled in several extra shards
        # during this iteration, so more than one eviction can be needed.
        # Eviction is by insertion order (oldest load first), not strict LRU.
        while len(loaded_files) > 2:
            oldest_file = next(iter(loaded_files))
            del loaded_files[oldest_file]
            torch.cuda.empty_cache()

    # Update model index - remove all _scale_inv entries
    print("Updating model index...")
    new_weight_map = {}
    for weight_name, file_name in weight_map.items():
        if not weight_name.endswith("_scale_inv"):
            new_weight_map[weight_name] = file_name

    # NOTE(review): metadata is copied unchanged; its total_size (if present)
    # presumably still counts the removed _scale_inv tensors — confirm if exact
    # sizes matter downstream.
    new_model_index = {
        "metadata": model_index.get("metadata", {}),
        "weight_map": new_weight_map
    }

    new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")
    with open(new_model_index_file, "w") as f:
        json.dump(new_model_index, f, indent=2)

    print(f"Conversion complete! Converted {len(fp8_weight_names)} FP8 weights to BF16")
    print(f"Output saved to: {bf16_path}")
132
+
133
+
134
if __name__ == "__main__":
    # CLI entry point: convert an FP8 checkpoint directory to BF16.
    arg_parser = ArgumentParser(description="Convert MiniMax-M2 from FP8 to BF16")
    arg_parser.add_argument(
        "--input-fp8-hf-path",
        type=str,
        required=True,
        help="Path to the FP8 model directory",
    )
    arg_parser.add_argument(
        "--output-bf16-hf-path",
        type=str,
        required=True,
        help="Path to save the BF16 model",
    )
    cli_args = arg_parser.parse_args()
    main(cli_args.input_fp8_hf_path, cli_args.output_bf16_hf_path)
model-00001-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd947e3a34495c361fa273f04cd1ddcdcd652dfd97cb1c830777ee0f7192e17
3
+ size 5000235232
model-00002-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf2068a3e58418d0c28a49ae0277bcf80bbe48fe7b58c2bbbe288b8578f198f3
3
+ size 4999565832
model-00003-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5192599924c00ca7cc0b7eeba473001ec87662297f082245de350fa56f863dc2
3
+ size 5000140792
model-00004-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7ad9284db7d17eeaf648b1b24e8f76574b12ba0c9b634d64345d969cc692c14
3
+ size 5000141504
model-00005-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03b08a5bd89174904b6b264f35289cd78eba8c2e447481188023b51cb55a6847
3
+ size 4999568344
model-00006-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1b09414abf5089823fd17c7bdac530b2b953398e3ff8893f98f05e60cdebc16
3
+ size 5000146376
model-00007-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ca955c56a35c0474cf135607f5e1ccc986dc5fc9d922ec24a5c2a80734bf225
3
+ size 5000146968
model-00008-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eea2dd55548cc9e9a54c513c9a7a14a72d973a8e73dc75472d43176a2432292
3
+ size 4999570984
model-00009-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3849ce508301c78a4b9df9b553a25c5421901f0be35d03b6520405a76aacc12
3
+ size 5000146488
model-00010-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5461b48f80774f438967b0fbe614aaaa757e8c1312386af5f8b5b510ea9e17ec
3
+ size 5000146968
model-00011-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc655de5faaa6c6ca3b58fa72cc156911ede65e2abd3e18b6d7b51b47f579f33
3
+ size 4999570864
model-00012-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58decbdba998a995cdfe4c8aea8f4d8b2936f49e50b9964a14042eee250d5b73
3
+ size 5000146600
model-00013-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecce84d9d57d2e269832b4d9e80a9e855e28061dd009748a331fe60b18f5373
3
+ size 5000146968
model-00014-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:686ba28eebe91f8e0e15d8a3940bc47dce4a9008cad4893d4a79f7da94404d7f
3
+ size 4999570752
model-00015-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a751935c3135867ca8b7d1a2ccc10f788ff3e23c36c3e264fa98c22f9f1bf290
3
+ size 5000146712
model-00016-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7b10931032f3fd098e0278f1e0ce9ee6e064e1ba183ba0d1dd56f5509b52e7
3
+ size 4981577024
model-00017-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b22fddf120bd654b496aed01fedacbbea07d55a877fcd830fe9011b5875f6f
3
+ size 4999558272
model-00018-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c24b4f46ca4910889fc10809e81202581036e68090891e723830d4526b3dc15
3
+ size 5000146808
model-00019-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deed0a529777b09ff6b2a04ec53d9afea5ad0b54fa67eccd829dd28e87258127
3
+ size 4999571464
model-00020-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b0ab9cefc9b4671f78713d9966e8ab509037a5a0df4c13f8a8f4eefae5d35e
3
+ size 5000146048
model-00021-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03dd85c6ad514ad8b65a844f67d54275031c444984879593ef71f268a5c3703c
3
+ size 5000146920
model-00022-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a19326e82ca0b741e37e75e41a022f4628fd003d5acc7a3840001ec84f2dc453
3
+ size 4999571272
model-00023-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5dc2b75fd3a288a43a6bafd712d6eaad0c54f00feb669e902739ac752312fa5
3
+ size 5000146192
model-00024-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11e49d2957c2784937c166bd55e6baf60002527d1a5236e48102f7ce8185922b
3
+ size 5000146968
model-00025-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763a7e002a0a8ed2316d73ba3ea744e83784ce7bdf3fb25b0983845770f914f0
3
+ size 4999571160
model-00026-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28a16be45fdcb11e636b5741d995b8804980c392079537bbfedd7cd34b65ddcc
3
+ size 5000146312
model-00027-of-00027.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c12ec8e012ef4d311a0fdd81d51896432a01752c51346a945fd50edd39c2593
3
+ size 4440722160
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf9222d462bd87f371ed549bbb43ead4f6d4cbd8ab53f95d5b089818e4b91c9e
3
+ size 14872848
modeling_minimax_m2.py ADDED
@@ -0,0 +1,725 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/minimax_m2/modular_minimax_m2.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_minimax_m2.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 the HuggingFace Team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+
23
+ from collections.abc import Callable
24
+ from typing import Optional, Union, Unpack
25
+
26
+ import torch
27
+ from torch import nn
28
+
29
+ from transformers.activations import ACT2FN
30
+ from transformers.cache_utils import Cache, DynamicCache
31
+ from transformers.generation import GenerationMixin
32
+ from transformers.integrations import use_kernel_forward_from_hub
33
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
34
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
+ from transformers.modeling_layers import (
36
+ GenericForQuestionAnswering,
37
+ GenericForSequenceClassification,
38
+ GenericForTokenClassification,
39
+ GradientCheckpointingLayer,
40
+ )
41
+ from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
42
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
43
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
44
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
45
+ from transformers.utils.deprecation import deprecate_kwarg
46
+ from transformers.utils.generic import OutputRecorder, check_model_inputs
47
+ from .configuration_minimax_m2 import MiniMaxM2Config
48
+
49
+ def _sanitize_config(config):
50
+ """
51
+ Helper function to sanitize the configuration object.
52
+ Specifically, it converts Enum values in `quantization_config` to their string values.
53
+ This prevents SyntaxErrors during torch.fx tracing where repr(Enum) (<Enum.Val: 'val'>)
54
+ is emitted into the generated code.
55
+ """
56
+ print("Config sanitized in modeling_minimax_m2.py")
57
+ q_config = getattr(config, "quantization_config", None)
58
+ if isinstance(q_config, dict):
59
+ for k, v in list(q_config.items()):
60
+ # Check for Enum by looking for 'value' attr
61
+ if hasattr(v, "value"):
62
+ q_config[k] = v.value
63
+ print(f" Enum: q_config[{k}] = {v.value}")
64
+
65
+ class MiniMaxM2MLP(nn.Module):
66
+ def __init__(self, config: MiniMaxM2Config):
67
+ super().__init__()
68
+ self.ffn_dim = config.intermediate_size
69
+ self.hidden_dim = config.hidden_size
70
+
71
+ self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
72
+ self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
73
+ self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
74
+
75
+ self.act_fn = ACT2FN[config.hidden_act]
76
+
77
+ def forward(self, hidden_states):
78
+ current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
79
+ current_hidden_states = self.w2(current_hidden_states)
80
+ return current_hidden_states
81
+
82
+
83
+ class MiniMaxM2Experts(nn.ModuleList):
84
+ """
85
+ ModuleList of experts.
86
+ """
87
+
88
+ def __init__(self, config: MiniMaxM2Config):
89
+ super().__init__()
90
+ self.top_k = config.num_experts_per_tok
91
+ self.num_experts = config.num_local_experts
92
+ for _ in range(self.num_experts):
93
+ self.append(MiniMaxM2MLP(config))
94
+
95
+ def forward(
96
+ self, hidden_states: torch.Tensor, top_k_index: torch.Tensor, top_k_weights: torch.Tensor
97
+ ) -> torch.Tensor:
98
+ """
99
+ Args:
100
+ hidden_states: (batch_size * sequence_length, hidden_dim)
101
+ selected_experts: (batch_size * sequence_length, top_k)
102
+ routing_weights: (batch_size * sequence_length, top_k)
103
+ Returns:
104
+ (batch_size * sequence_length, hidden_dim)
105
+ """
106
+ final_hidden_states = torch.zeros_like(hidden_states)
107
+ expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts).permute(2, 1, 0)
108
+
109
+ expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
110
+ for expert_idx in expert_hit:
111
+ idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
112
+ current_state = hidden_states[None, top_x].reshape(-1, hidden_states.shape[-1])
113
+ current_hidden_states = self[expert_idx](current_state) * top_k_weights[top_x, idx, None]
114
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
115
+ return final_hidden_states
116
+
117
+
118
+ class MiniMaxM2SparseMoeBlock(nn.Module):
119
+ def __init__(self, config):
120
+ super().__init__()
121
+ self.top_k = config.num_experts_per_tok
122
+ self.jitter_noise = config.router_jitter_noise
123
+ self.gate = nn.Linear(config.hidden_size, config.num_local_experts, bias=False)
124
+ self.experts = MiniMaxM2Experts(config)
125
+ self.register_buffer("e_score_correction_bias", torch.zeros(config.num_local_experts))
126
+
127
+ def route_tokens_to_experts(self, router_logits):
128
+ routing_weights = torch.nn.functional.sigmoid(router_logits.float())
129
+ scores_for_choice = routing_weights + self.e_score_correction_bias
130
+ _, top_k_index = torch.topk(scores_for_choice, self.top_k, dim=-1, sorted=False)
131
+ top_k_weights = routing_weights.gather(1, top_k_index)
132
+ top_k_weights /= top_k_weights.sum(dim=-1, keepdim=True)
133
+ return top_k_index, top_k_weights.to(router_logits.dtype)
134
+
135
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
136
+ batch_size, sequence_length, hidden_dim = hidden_states.shape
137
+ if self.training and self.jitter_noise > 0:
138
+ hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
139
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
140
+ router_logits = self.gate(hidden_states)
141
+ top_k_index, top_k_weights = self.route_tokens_to_experts(router_logits)
142
+ hidden_states = self.experts(hidden_states, top_k_index, top_k_weights.to(hidden_states.dtype))
143
+ hidden_states = hidden_states.reshape(batch_size, sequence_length, hidden_dim)
144
+ return hidden_states, router_logits
145
+
146
+
147
+ @use_kernel_forward_from_hub("RMSNorm")
148
+ class MiniMaxM2RMSNorm(nn.Module):
149
+ def __init__(self, hidden_size, eps=1e-6):
150
+ """
151
+ MiniMaxM2RMSNorm is equivalent to T5LayerNorm
152
+ """
153
+ super().__init__()
154
+ self.weight = nn.Parameter(torch.ones(hidden_size))
155
+ self.variance_epsilon = eps
156
+
157
+ def forward(self, hidden_states):
158
+ input_dtype = hidden_states.dtype
159
+ hidden_states = hidden_states.to(torch.float32)
160
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
161
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
162
+ return self.weight * hidden_states.to(input_dtype)
163
+
164
+ def extra_repr(self):
165
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
166
+
167
+
168
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
169
+ """
170
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
171
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
172
+ """
173
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
174
+ if n_rep == 1:
175
+ return hidden_states
176
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
177
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
178
+
179
+
180
+ def eager_attention_forward(
181
+ module: nn.Module,
182
+ query: torch.Tensor,
183
+ key: torch.Tensor,
184
+ value: torch.Tensor,
185
+ attention_mask: Optional[torch.Tensor],
186
+ scaling: float,
187
+ dropout: float = 0.0,
188
+ **kwargs: Unpack[TransformersKwargs],
189
+ ):
190
+ key_states = repeat_kv(key, module.num_key_value_groups)
191
+ value_states = repeat_kv(value, module.num_key_value_groups)
192
+
193
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
194
+ if attention_mask is not None:
195
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
196
+ attn_weights = attn_weights + causal_mask
197
+
198
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
199
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
200
+ attn_output = torch.matmul(attn_weights, value_states)
201
+ attn_output = attn_output.transpose(1, 2).contiguous()
202
+
203
+ return attn_output, attn_weights
204
+
205
+
206
+ def rotate_half(x):
207
+ """Rotates half the hidden dims of the input."""
208
+ x1 = x[..., : x.shape[-1] // 2]
209
+ x2 = x[..., x.shape[-1] // 2 :]
210
+ return torch.cat((-x2, x1), dim=-1)
211
+
212
+
213
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
214
+ """Applies Rotary Position Embedding to the query and key tensors.
215
+
216
+ Args:
217
+ q (`torch.Tensor`): The query tensor.
218
+ k (`torch.Tensor`): The key tensor.
219
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
220
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
221
+ position_ids (`torch.Tensor`, *optional*):
222
+ Deprecated and unused.
223
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
224
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
225
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
226
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
227
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
228
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
229
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
230
+ Returns:
231
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
232
+ """
233
+ cos = cos.unsqueeze(unsqueeze_dim)
234
+ sin = sin.unsqueeze(unsqueeze_dim)
235
+
236
+ # Keep half or full tensor for later concatenation
237
+ rotary_dim = cos.shape[-1]
238
+ q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
239
+ k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
240
+
241
+ # Apply rotary embeddings on the first half or full tensor
242
+ q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
243
+ k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
244
+
245
+ # Concatenate back to full shape
246
+ q_embed = torch.cat([q_embed, q_pass], dim=-1)
247
+ k_embed = torch.cat([k_embed, k_pass], dim=-1)
248
+ return q_embed, k_embed
249
+
250
+
251
+ class MiniMaxM2Attention(nn.Module):
252
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
253
+
254
+ def __init__(self, config: MiniMaxM2Config, layer_idx: int):
255
+ super().__init__()
256
+ self.config = config
257
+ self.layer_idx = layer_idx
258
+ self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
259
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
260
+ self.scaling = self.head_dim**-0.5
261
+ self.attention_dropout = config.attention_dropout
262
+ self.is_causal = True
263
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
264
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
265
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
266
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
267
+
268
+ self.use_qk_norm = config.use_qk_norm
269
+ if self.use_qk_norm:
270
+ self.q_norm = MiniMaxM2RMSNorm(self.head_dim * config.num_attention_heads, eps=config.rms_norm_eps)
271
+ self.k_norm = MiniMaxM2RMSNorm(self.head_dim * config.num_key_value_heads, eps=config.rms_norm_eps)
272
+
273
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
274
+ def forward(
275
+ self,
276
+ hidden_states: torch.Tensor,
277
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
278
+ attention_mask: Optional[torch.Tensor],
279
+ past_key_values: Optional[Cache] = None,
280
+ cache_position: Optional[torch.LongTensor] = None,
281
+ **kwargs: Unpack[FlashAttentionKwargs],
282
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
283
+ input_shape = hidden_states.shape[:-1]
284
+ hidden_shape = (*input_shape, -1, self.head_dim)
285
+
286
+ query_states = self.q_proj(hidden_states)
287
+ key_states = self.k_proj(hidden_states)
288
+ value_states = self.v_proj(hidden_states)
289
+
290
+ if self.use_qk_norm: # main diff from Llama
291
+ query_states = self.q_norm(query_states)
292
+ key_states = self.k_norm(key_states)
293
+
294
+ key_states = key_states.view(hidden_shape)
295
+ query_states = query_states.view(hidden_shape)
296
+ value_states = value_states.view(hidden_shape)
297
+
298
+ query_states = query_states.transpose(1, 2)
299
+ key_states = key_states.transpose(1, 2)
300
+ value_states = value_states.transpose(1, 2)
301
+
302
+ cos, sin = position_embeddings
303
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
304
+
305
+ if past_key_values is not None:
306
+ # sin and cos are specific to RoPE models; position_ids needed for the static cache
307
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
308
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
309
+
310
+ attention_interface: Callable = eager_attention_forward
311
+ if self.config._attn_implementation != "eager":
312
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
313
+
314
+ attn_output, attn_weights = attention_interface(
315
+ self,
316
+ query_states,
317
+ key_states,
318
+ value_states,
319
+ attention_mask,
320
+ dropout=0.0 if not self.training else self.attention_dropout,
321
+ scaling=self.scaling,
322
+ **kwargs,
323
+ )
324
+
325
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
326
+ attn_output = self.o_proj(attn_output)
327
+ return attn_output, attn_weights
328
+
329
+
330
+ class MiniMaxM2DecoderLayer(GradientCheckpointingLayer):
331
+ def __init__(self, config: MiniMaxM2Config, layer_idx: int):
332
+ super().__init__()
333
+ self.hidden_size = config.hidden_size
334
+
335
+ self.self_attn = MiniMaxM2Attention(config, layer_idx)
336
+
337
+ self.block_sparse_moe = MiniMaxM2SparseMoeBlock(config)
338
+ self.input_layernorm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
339
+ self.post_attention_layernorm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
340
+
341
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
342
+ def forward(
343
+ self,
344
+ hidden_states: torch.Tensor,
345
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
346
+ attention_mask: Optional[torch.Tensor] = None,
347
+ position_ids: Optional[torch.LongTensor] = None,
348
+ past_key_values: Optional[Cache] = None,
349
+ cache_position: Optional[torch.LongTensor] = None,
350
+ **kwargs: Unpack[TransformersKwargs],
351
+ ) -> torch.FloatTensor:
352
+ residual = hidden_states
353
+
354
+ hidden_states = self.input_layernorm(hidden_states)
355
+
356
+ # Self Attention
357
+ hidden_states, _ = self.self_attn(
358
+ hidden_states=hidden_states,
359
+ position_embeddings=position_embeddings,
360
+ attention_mask=attention_mask,
361
+ position_ids=position_ids,
362
+ past_key_values=past_key_values,
363
+ cache_position=cache_position,
364
+ **kwargs,
365
+ )
366
+ hidden_states = residual + hidden_states
367
+
368
+ # Fully Connected
369
+ residual = hidden_states
370
+ hidden_states = self.post_attention_layernorm(hidden_states)
371
+ hidden_states, _ = self.block_sparse_moe(hidden_states)
372
+ hidden_states = residual + hidden_states
373
+
374
+ return hidden_states
375
+
376
+
377
+ class MiniMaxM2RotaryEmbedding(nn.Module):
378
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
379
+
380
+ def __init__(self, config: MiniMaxM2Config, device=None):
381
+ super().__init__()
382
+ # BC: "rope_type" was originally "type"
383
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
384
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
385
+ else:
386
+ self.rope_type = "default"
387
+ self.max_seq_len_cached = config.max_position_embeddings
388
+ self.original_max_seq_len = config.max_position_embeddings
389
+
390
+ self.config = config
391
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
392
+
393
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
394
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
395
+ self.original_inv_freq = self.inv_freq
396
+
397
+ @torch.no_grad()
398
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
399
+ def forward(self, x, position_ids):
400
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
401
+ position_ids_expanded = position_ids[:, None, :].float()
402
+
403
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
404
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
405
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
406
+ emb = torch.cat((freqs, freqs), dim=-1)
407
+ cos = emb.cos() * self.attention_scaling
408
+ sin = emb.sin() * self.attention_scaling
409
+
410
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
411
+
412
+
413
+ # @auto_docstring
414
+ class MiniMaxM2PreTrainedModel(PreTrainedModel):
415
+ config: MiniMaxM2Config
416
+ base_model_prefix = "model"
417
+ supports_gradient_checkpointing = True
418
+ _no_split_modules = ["MiniMaxM2DecoderLayer"]
419
+ _skip_keys_device_placement = ["past_key_values"]
420
+ _supports_flash_attn = True
421
+ _supports_sdpa = True
422
+ _supports_flex_attn = True
423
+ _can_compile_fullgraph = False # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
424
+ _supports_attention_backend = True
425
+ _can_record_outputs = {
426
+ "router_logits": OutputRecorder(MiniMaxM2SparseMoeBlock, index=1),
427
+ "hidden_states": MiniMaxM2DecoderLayer,
428
+ "attentions": MiniMaxM2Attention,
429
+ }
430
+
431
+
432
+ # @auto_docstring
433
+ class MiniMaxM2Model(MiniMaxM2PreTrainedModel):
434
+ def __init__(self, config: MiniMaxM2Config):
435
+ super().__init__(config)
436
+ _sanitize_config(config)
437
+
438
+ self.padding_idx = config.pad_token_id
439
+ self.vocab_size = config.vocab_size
440
+
441
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
442
+ self.layers = nn.ModuleList(
443
+ [MiniMaxM2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
444
+ )
445
+ self.norm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
446
+ self.rotary_emb = MiniMaxM2RotaryEmbedding(config=config)
447
+ self.gradient_checkpointing = False
448
+
449
+ # Initialize weights and apply final processing
450
+ self.post_init()
451
+
452
+ # @check_model_inputs
453
+ # @auto_docstring
454
+ def forward(
455
+ self,
456
+ input_ids: Optional[torch.LongTensor] = None,
457
+ attention_mask: Optional[torch.Tensor] = None,
458
+ position_ids: Optional[torch.LongTensor] = None,
459
+ past_key_values: Optional[Cache] = None,
460
+ inputs_embeds: Optional[torch.FloatTensor] = None,
461
+ use_cache: Optional[bool] = None,
462
+ cache_position: Optional[torch.LongTensor] = None,
463
+ **kwargs: Unpack[TransformersKwargs],
464
+ ) -> MoeModelOutputWithPast:
465
+ if (input_ids is None) ^ (inputs_embeds is not None):
466
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
467
+
468
+ if use_cache and past_key_values is None:
469
+ past_key_values = DynamicCache(config=self.config)
470
+
471
+ if inputs_embeds is None:
472
+ inputs_embeds = self.embed_tokens(input_ids)
473
+
474
+ if cache_position is None:
475
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
476
+ cache_position = torch.arange(
477
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
478
+ )
479
+ if position_ids is None:
480
+ position_ids = cache_position.unsqueeze(0)
481
+
482
+ mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
483
+ causal_mask = mask_function(
484
+ config=self.config,
485
+ input_embeds=inputs_embeds,
486
+ attention_mask=attention_mask,
487
+ cache_position=cache_position,
488
+ past_key_values=past_key_values,
489
+ position_ids=position_ids,
490
+ )
491
+
492
+ hidden_states = inputs_embeds
493
+
494
+ # create position embeddings to be shared across the decoder layers
495
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
496
+
497
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
498
+ hidden_states = decoder_layer(
499
+ hidden_states,
500
+ position_embeddings=position_embeddings,
501
+ attention_mask=causal_mask,
502
+ position_ids=position_ids,
503
+ past_key_values=past_key_values,
504
+ use_cache=use_cache,
505
+ cache_position=cache_position,
506
+ **kwargs,
507
+ )
508
+
509
+ hidden_states = self.norm(hidden_states)
510
+
511
+ return MoeModelOutputWithPast( # only diff with Mistral is the output type, we need MoE
512
+ last_hidden_state=hidden_states,
513
+ past_key_values=past_key_values,
514
+ )
515
+
516
+
517
+ def load_balancing_loss_func(
518
+ gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
519
+ num_experts: Optional[int] = None,
520
+ top_k=2,
521
+ attention_mask: Optional[torch.Tensor] = None,
522
+ ) -> Union[torch.Tensor, int]:
523
+ r"""
524
+ Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
525
+
526
+ See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
527
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
528
+ experts is too unbalanced.
529
+
530
+ Args:
531
+ gate_logits:
532
+ Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
533
+ shape [batch_size X sequence_length, num_experts].
534
+ num_experts:
535
+ Number of experts
536
+ top_k:
537
+ The number of experts to route per-token, can be also interpreted as the `top-k` routing
538
+ parameter.
539
+ attention_mask (`torch.Tensor`, *optional*):
540
+ The attention_mask used in forward function
541
+ shape [batch_size X sequence_length] if not None.
542
+
543
+ Returns:
544
+ The auxiliary loss.
545
+ """
546
+ if gate_logits is None or not isinstance(gate_logits, tuple):
547
+ return 0
548
+
549
+ if isinstance(gate_logits, tuple):
550
+ compute_device = gate_logits[0].device
551
+ concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
552
+
553
+ routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
554
+
555
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
556
+
557
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
558
+
559
+ if attention_mask is None:
560
+ # Compute the percentage of tokens routed to each experts
561
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
562
+
563
+ # Compute the average probability of routing to these experts
564
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
565
+ else:
566
+ batch_size, sequence_length = attention_mask.shape
567
+ num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
568
+
569
+ # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
570
+ expert_attention_mask = (
571
+ attention_mask[None, :, :, None, None]
572
+ .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
573
+ .reshape(-1, top_k, num_experts)
574
+ .to(compute_device)
575
+ )
576
+
577
+ # Compute the percentage of tokens routed to each experts
578
+ tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
579
+ expert_attention_mask, dim=0
580
+ )
581
+
582
+ # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
583
+ router_per_expert_attention_mask = (
584
+ attention_mask[None, :, :, None]
585
+ .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
586
+ .reshape(-1, num_experts)
587
+ .to(compute_device)
588
+ )
589
+
590
+ # Compute the average probability of routing to these experts
591
+ router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
592
+ router_per_expert_attention_mask, dim=0
593
+ )
594
+
595
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
596
+ return overall_loss * num_experts
597
+
598
+
599
+ # @auto_docstring
600
+ class MiniMaxM2ForCausalLM(MiniMaxM2PreTrainedModel, GenerationMixin):
601
+ _tied_weights_keys = ["lm_head.weight"]
602
+ _tp_plan = {"lm_head": "colwise_rep"}
603
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
604
+
605
+ def __init__(self, config):
606
+ super().__init__(config)
607
+ _sanitize_config(config)
608
+
609
+ self.model = MiniMaxM2Model(config)
610
+ self.vocab_size = config.vocab_size
611
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
612
+ self.router_aux_loss_coef = config.router_aux_loss_coef
613
+ self.num_experts = config.num_local_experts
614
+ self.num_experts_per_tok = config.num_experts_per_tok
615
+
616
+ # Initialize weights and apply final processing
617
+ self.post_init()
618
+
619
+ @can_return_tuple
620
+ # @auto_docstring
621
+ def forward(
622
+ self,
623
+ input_ids: Optional[torch.LongTensor] = None,
624
+ attention_mask: Optional[torch.Tensor] = None,
625
+ position_ids: Optional[torch.LongTensor] = None,
626
+ past_key_values: Optional[Cache] = None,
627
+ inputs_embeds: Optional[torch.FloatTensor] = None,
628
+ labels: Optional[torch.LongTensor] = None,
629
+ use_cache: Optional[bool] = None,
630
+ output_router_logits: Optional[bool] = None,
631
+ cache_position: Optional[torch.LongTensor] = None,
632
+ logits_to_keep: Union[int, torch.Tensor] = 0,
633
+ **kwargs: Unpack[TransformersKwargs],
634
+ ) -> MoeCausalLMOutputWithPast:
635
+ r"""
636
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
637
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
638
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
639
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
640
+
641
+ Example:
642
+
643
+ ```python
644
+ >>> from transformers import AutoTokenizer, MiniMaxM2ForCausalLM
645
+
646
+ >>> model = MiniMaxM2ForCausalLM.from_pretrained("mistralai/MiniMaxM2-8x7B-v0.1")
647
+ >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/MiniMaxM2-8x7B-v0.1")
648
+
649
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
650
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
651
+
652
+ >>> # Generate
653
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
654
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
655
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
656
+ ```"""
657
+
658
+ output_router_logits = (
659
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
660
+ )
661
+
662
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
663
+ outputs: MoeModelOutputWithPast = self.model(
664
+ input_ids=input_ids,
665
+ attention_mask=attention_mask,
666
+ position_ids=position_ids,
667
+ past_key_values=past_key_values,
668
+ inputs_embeds=inputs_embeds,
669
+ use_cache=use_cache,
670
+ output_router_logits=output_router_logits,
671
+ cache_position=cache_position,
672
+ **kwargs,
673
+ )
674
+
675
+ hidden_states = outputs.last_hidden_state
676
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
677
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
678
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
679
+
680
+ loss = None
681
+ if labels is not None:
682
+ loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
683
+
684
+ aux_loss = None
685
+ if output_router_logits:
686
+ aux_loss = load_balancing_loss_func(
687
+ outputs.router_logits,
688
+ self.num_experts,
689
+ self.num_experts_per_tok,
690
+ attention_mask,
691
+ )
692
+ if labels is not None:
693
+ loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
694
+
695
+ return MoeCausalLMOutputWithPast(
696
+ loss=loss,
697
+ aux_loss=aux_loss,
698
+ logits=logits,
699
+ past_key_values=outputs.past_key_values,
700
+ hidden_states=outputs.hidden_states,
701
+ attentions=outputs.attentions,
702
+ router_logits=outputs.router_logits,
703
+ )
704
+
705
+
706
+ class MiniMaxM2ForSequenceClassification(GenericForSequenceClassification, MiniMaxM2PreTrainedModel):
707
+ pass
708
+
709
+
710
+ class MiniMaxM2ForTokenClassification(GenericForTokenClassification, MiniMaxM2PreTrainedModel):
711
+ pass
712
+
713
+
714
+ class MiniMaxM2ForQuestionAnswering(GenericForQuestionAnswering, MiniMaxM2PreTrainedModel):
715
+ pass
716
+
717
+
718
+ __all__ = [
719
+ "MiniMaxM2ForCausalLM",
720
+ "MiniMaxM2ForQuestionAnswering",
721
+ "MiniMaxM2Model",
722
+ "MiniMaxM2PreTrainedModel",
723
+ "MiniMaxM2ForSequenceClassification",
724
+ "MiniMaxM2ForTokenClassification",
725
+ ]
recipe.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ AWQModifier:
4
+ config_groups:
5
+ mlp_experts_projections:
6
+ targets: ['re:.*block_sparse_moe\.experts\.\d+\.(w1|w2|w3)$']
7
+ weights:
8
+ num_bits: 4
9
+ type: int
10
+ symmetric: true
11
+ group_size: 32
12
+ strategy: group
13
+ block_structure: null
14
+ dynamic: false
15
+ actorder: null
16
+ scale_dtype: null
17
+ zp_dtype: null
18
+ observer: memoryless_minmax
19
+ observer_kwargs: {}
20
+ input_activations: null
21
+ output_activations: null
22
+ format: null
23
+ targets: [Linear]
24
+ ignore: []
25
+ mappings:
26
+ - smooth_layer: re:.*post_attention_layernorm$
27
+ balance_layers: ['re:.*w1$', 're:.*w3$']
28
+ - smooth_layer: re:.*w3$
29
+ balance_layers: ['re:.*w2$']
30
+ duo_scaling: true
31
+ n_grid: 20
special_tokens_map.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<code_interpreter>",
4
+ "<commit_after>",
5
+ "<commit_before>",
6
+ "<commit_msg>",
7
+ "<empty_output>",
8
+ "<filename>",
9
+ "<fim_middle>",
10
+ "<fim_pad>",
11
+ "<fim_prefix>",
12
+ "<fim_suffix>",
13
+ "<function_call>",
14
+ "<gh_stars>",
15
+ "]<]speech[>[",
16
+ "]<]image[>[",
17
+ "]<]video[>[",
18
+ "]<]start of speech[>[",
19
+ "]<]end of speech[>[",
20
+ "]<]start of image[>[",
21
+ "]<]end of image[>[",
22
+ "]<]start of video[>[",
23
+ "]<]end of video[>[",
24
+ "]<]vision pad[>[",
25
+ "]~!b[",
26
+ "<issue_closed>",
27
+ "<issue_comment>",
28
+ "<issue_start>",
29
+ "<jupyter_code>",
30
+ "<jupyter_output>",
31
+ "<jupyter_start>",
32
+ "<jupyter_text>",
33
+ "<reponame>",
34
+ "[e~[",
35
+ "]!d~[",
36
+ "]!p~[",
37
+ "]~b]",
38
+ "<jupyter_error>",
39
+ "<add_file>",
40
+ "<delete_file>",
41
+ "<rename_file>",
42
+ "<edit_file>",
43
+ "<commit_message>",
44
+ "<empty_source_file>",
45
+ "<repo_struct>",
46
+ "<code_context>",
47
+ "<file_content>",
48
+ "<source_files>",
49
+ "<pr_start>",
50
+ "<review_comment>",
51
+ "<filepath>",
52
+ "<file_sep>"
53
+ ],
54
+ "bos_token": {
55
+ "content": "]~!b[",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false
60
+ },
61
+ "eos_token": {
62
+ "content": "[e~[",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false
67
+ },
68
+ "unk_token": {
69
+ "content": "]!d~[",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false
74
+ }
75
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7b90ed7f55d905175bc26771d6d7d33b40b46742f073675bc816fedaf482ea1
3
+ size 15522763
tokenizer_config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "200000": {
5
+ "content": "]!p~[",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "200001": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "200002": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "200003": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "200004": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "200005": {
45
+ "content": "<reponame>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "200006": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "200007": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "200008": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "200009": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "200010": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "200011": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "200012": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "200013": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "200014": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "200015": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "200016": {
133
+ "content": "<commit_before>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "200017": {
141
+ "content": "<commit_msg>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "200018": {
149
+ "content": "<commit_after>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "200019": {
157
+ "content": "]~b]",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "200020": {
165
+ "content": "[e~[",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "200021": {
173
+ "content": "]!d~[",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "200022": {
181
+ "content": "<function_call>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "200023": {
189
+ "content": "<code_interpreter>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "200024": {
197
+ "content": "]<]speech[>[",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "200025": {
205
+ "content": "]<]image[>[",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "200026": {
213
+ "content": "]<]video[>[",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "200027": {
221
+ "content": "]<]start of speech[>[",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "200028": {
229
+ "content": "]<]end of speech[>[",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "200029": {
237
+ "content": "]<]start of image[>[",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "200030": {
245
+ "content": "]<]end of image[>[",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "200031": {
253
+ "content": "]<]start of video[>[",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "200032": {
261
+ "content": "]<]end of video[>[",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "200033": {
269
+ "content": "]<]vision pad[>[",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "200034": {
277
+ "content": "]~!b[",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "200035": {
285
+ "content": "<jupyter_error>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "200036": {
293
+ "content": "<add_file>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "200037": {
301
+ "content": "<delete_file>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "200038": {
309
+ "content": "<rename_file>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "200039": {
317
+ "content": "<edit_file>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "200040": {
325
+ "content": "<commit_message>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "200041": {
333
+ "content": "<empty_source_file>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "200042": {
341
+ "content": "<repo_struct>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "200043": {
349
+ "content": "<code_context>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "200044": {
357
+ "content": "<file_content>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "200045": {
365
+ "content": "<source_files>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "200046": {
373
+ "content": "<pr_start>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "200047": {
381
+ "content": "<review_comment>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "200048": {
389
+ "content": "<filepath>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "200049": {
397
+ "content": "<file_sep>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "200050": {
405
+ "content": "<think>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": false
411
+ },
412
+ "200051": {
413
+ "content": "</think>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": false
419
+ },
420
+ "200052": {
421
+ "content": "<minimax:tool_call>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": false
427
+ },
428
+ "200053": {
429
+ "content": "</minimax:tool_call>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": false
435
+ }
436
+ },
437
+ "additional_special_tokens": [
438
+ "<code_interpreter>",
439
+ "<commit_after>",
440
+ "<commit_before>",
441
+ "<commit_msg>",
442
+ "<empty_output>",
443
+ "<filename>",
444
+ "<fim_middle>",
445
+ "<fim_pad>",
446
+ "<fim_prefix>",
447
+ "<fim_suffix>",
448
+ "<function_call>",
449
+ "<gh_stars>",
450
+ "]<]speech[>[",
451
+ "]<]image[>[",
452
+ "]<]video[>[",
453
+ "]<]start of speech[>[",
454
+ "]<]end of speech[>[",
455
+ "]<]start of image[>[",
456
+ "]<]end of image[>[",
457
+ "]<]start of video[>[",
458
+ "]<]end of video[>[",
459
+ "]<]vision pad[>[",
460
+ "]~!b[",
461
+ "<issue_closed>",
462
+ "<issue_comment>",
463
+ "<issue_start>",
464
+ "<jupyter_code>",
465
+ "<jupyter_output>",
466
+ "<jupyter_start>",
467
+ "<jupyter_text>",
468
+ "<reponame>",
469
+ "[e~[",
470
+ "]!d~[",
471
+ "]!p~[",
472
+ "]~b]",
473
+ "<jupyter_error>",
474
+ "<add_file>",
475
+ "<delete_file>",
476
+ "<rename_file>",
477
+ "<edit_file>",
478
+ "<commit_message>",
479
+ "<empty_source_file>",
480
+ "<repo_struct>",
481
+ "<code_context>",
482
+ "<file_content>",
483
+ "<source_files>",
484
+ "<pr_start>",
485
+ "<review_comment>",
486
+ "<filepath>",
487
+ "<file_sep>"
488
+ ],
489
+ "bos_token": "]~!b[",
490
+ "clean_up_tokenization_spaces": false,
491
+ "eos_token": "[e~[",
492
+ "extra_special_tokens": {},
493
+ "model_max_length": 40960000,
494
+ "tokenizer_class": "GPT2Tokenizer",
495
+ "unk_token": "]!d~["
496
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff