Kernels · sae

elephantmipt committed (verified)
Commit a262a48 · 1 Parent(s): 25069e6

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,3 @@
+ *.DS_Store
+ *__pycache__
+ *__MACOSX
LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright 2025 T-Tech
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
NOTICE ADDED
@@ -0,0 +1,16 @@
+ Flex SAE Kernels
+ Copyright (c) 2025 T-Tech
+
+ This project is distributed under the Apache License, Version 2.0. The following
+ third-party component is redistributed under its original terms:
+
+ - Portions of `torch-ext/flex_sae/topk_kernels.py` are adapted from the Facebook
+   Research project "memory" (https://github.com/facebookresearch/memory).
+   That source is licensed under the Creative Commons Attribution-NonCommercial
+   4.0 International License (CC BY-NC 4.0). Any use of the adapted code must
+   comply with the non-commercial requirements described at
+   https://creativecommons.org/licenses/by-nc/4.0/.
+
+ Where the Apache 2.0 license and CC BY-NC 4.0 differ, the more restrictive
+ requirements apply to the adapted code. All other files are provided under the
+ Apache License, Version 2.0.
README.md CHANGED
@@ -1,3 +1,196 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ tags:
+ - kernel
+ - sae
+ ---
+ # Flex SAE Kernels
+
+ [![ArXiv](https://img.shields.io/badge/arXiv-2505.24473-b31b1b.svg)](https://arxiv.org/abs/2505.24473)
+
+ Fused Triton implementations of the TopK and HierarchicalTopK sparse autoencoder (SAE) decoder losses described in *Train One Sparse Autoencoder Across Multiple Sparsity Budgets to Preserve Interpretability and Accuracy*.
+
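+ Concretely, writing $b$ for the decoder bias, $w_i$ for decoder rows, $v$ for the selected activations, and $x$ for the target (the notation of the reference implementations below), the two losses are
+
+ $$\mathcal{L}_{\text{TopK}} = \frac{1}{BD}\sum_{n=1}^{B}\Big\lVert b + \sum_{k=1}^{K} v_{n,k}\, w_{i_{n,k}} - x_n \Big\rVert_2^2, \qquad \mathcal{L}_{\text{HierTopK}} = \frac{1}{BKD}\sum_{n=1}^{B}\sum_{t=1}^{K}\Big\lVert b + \sum_{k=1}^{t} v_{n,k}\, w_{i_{n,k}} - x_n \Big\rVert_2^2.$$
+
+ The hierarchical loss penalizes every prefix of the Top-$K$ reconstruction, so one decoder stays accurate across all sparsity budgets $t \le K$.
+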
+ **This work has been accepted to [EMNLP 2025](https://2025.emnlp.org/).**
+
+ ## What is released?
+
+ - A fast TopK kernel for SAEs (a slightly modified version of the xformers kernel): `torch-ext/flex_sae/topk_kernels.py`
+ - Fast HierarchicalTopK kernels (see our [paper](https://arxiv.org/abs/2505.24473)): `torch-ext/flex_sae/hierarchical_kernels.py`
+
+
+ ## Quickstart
+
+ The kernels can be loaded from the Hub and share the following signature:
+ ```python
+ import torch
+ from kernels import get_kernel
+
+ flex = get_kernel('t-tech/flex-sae')
+
+ top_k_kernel = flex.triton_topk_sae_loss
+ hierarchical_top_k_kernel = flex.triton_hierarchical_sae_loss
+
+ # B -- batch size, K -- top-k, F -- dictionary size, D -- model hidden dim
+
+ loss: torch.Tensor = top_k_kernel(
+     indices,  # [B, K]
+     weight,   # [F, D]
+     vals,     # [B, K]
+     bias,     # [D]
+     target,   # [B, D]
+ )
+
+ loss: torch.Tensor = hierarchical_top_k_kernel(
+     indices,  # [B, K]
+     weight,   # [F, D]
+     vals,     # [B, K]
+     bias,     # [D]
+     target,   # [B, D]
+ )
+ ```
+
+ ## Overview
+ - `torch-ext/flex_sae/` contains the Triton kernels alongside torch reference implementations.
+ - `tests/` hosts CUDA-backed property tests that ensure numerical parity across dtypes and kernels.
+ - `build.toml` and `flake.nix` integrate the project with the [Hugging Face kernel-builder](https://github.com/huggingface/kernel-builder).
+
+ The Triton kernels target CUDA GPUs and focus on reducing the latency gap between TopK and HierarchicalTopK decoders while keeping memory usage flat.
+
+ ## Example
+
+ You can find example usage in [example.py](https://huggingface.co/t-tech/flex-sae/blob/main/example.py).
+ ```python
+ # /// script
+ # dependencies = [
+ #   "torch",
+ #   "numpy",
+ #   "kernels",
+ # ]
+ # ///
+
+ import torch
+ import numpy as np
+ from kernels import get_kernel
+
+ flex = get_kernel("t-tech/flex-sae")  # fast kernels from the Hub
+
+ @torch.compile(fullgraph=True)
+ def hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon_cum = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).cumsum(dim=1)
+     diff = recon_cum.to(torch.float32) - target.to(torch.float32).unsqueeze(1)
+     loss = diff.pow(2).mean()
+     return loss
+
+
+ B = 2048
+ K = 256
+ F = 1024 * 128
+ D = 1024
+ warmup = 5
+ dtype = torch.float32
+
+ vals = None
+ decoder = None
+ bias = None
+ target = None
+ indices = None
+
+
+ def init_parameters():
+     global vals, decoder, bias, target, indices
+     vals = torch.randn(B, K, dtype=dtype, device="cuda").abs().requires_grad_()
+     decoder = torch.randn(F, D, dtype=dtype, device="cuda", requires_grad=True)
+     bias = torch.randn(D, dtype=dtype, device="cuda", requires_grad=True)
+     target = torch.randn(B, D, dtype=dtype, device="cuda")
+     indices = torch.randint(0, F, (B, K), dtype=torch.long, device="cuda")
+
+
+ timing_kernel = []
+ timing_vanilla = []
+ torch.cuda.reset_peak_memory_stats()
+ loss_kernel_list = torch.zeros((100,))
+ loss_vanilla_list = torch.zeros((100,))
+
+
+ def zero_grad():
+     vals.grad = None
+     decoder.grad = None
+     bias.grad = None
+     torch.cuda.empty_cache()
+
+
+ for i in range(100 + warmup):
+     init_parameters()
+     start_kernel = torch.cuda.Event(enable_timing=True)
+     end_kernel = torch.cuda.Event(enable_timing=True)
+     start_vanilla = torch.cuda.Event(enable_timing=True)
+     end_vanilla = torch.cuda.Event(enable_timing=True)
+
+     start_kernel.record()
+     loss_kernel = flex.triton_hierarchical_sae_loss(indices, decoder, vals, bias, target)
+     loss_kernel.backward()
+     end_kernel.record()
+
+     zero_grad()
+     start_vanilla.record()
+     loss_vanilla = hierarchical_sae_loss(indices, decoder, vals, bias, target)
+     loss_vanilla.backward()
+     end_vanilla.record()
+     if i >= warmup:
+         torch.cuda.synchronize()
+         timing_kernel.append(start_kernel.elapsed_time(end_kernel))
+         timing_vanilla.append(start_vanilla.elapsed_time(end_vanilla))
+         loss_kernel_list[i - warmup] = loss_kernel.detach()
+         loss_vanilla_list[i - warmup] = loss_vanilla.detach()
+     zero_grad()
+
+ if torch.allclose(loss_kernel, loss_vanilla):
+     print("✅ Outputs are close! Everything is good! 🎉")
+ else:
+     print("❌ Outputs mismatch... ⚠️🤔")
+
+
+ print(f"🦎 Triton Kernel Time (Ours): {np.mean(timing_kernel):.4f} ± {np.std(timing_kernel):.4f} ms")
+ print(f"🔥 Torch Compile Kernel Time: {np.mean(timing_vanilla):.4f} ± {np.std(timing_vanilla):.4f} ms")
+ print(f"🚀 Speedup: {np.mean(timing_vanilla) / np.mean(timing_kernel):.2f}x")
+ ```
+
+ Run it with `uv run https://huggingface.co/t-tech/flex-sae/resolve/main/example.py`.
+
+ ## Performance
+ Benchmarks were collected on a workload with dictionary size $F = 65536$, embedding dimension $D = 2304$, and sparsity budgets $K \in \{32, 64, 128\}$. Latency is reported as time per training step (milliseconds) and memory as peak device usage (GiB).
+
+ | Decoder backend | K=32 (ms / GiB) | K=64 (ms / GiB) | K=128 (ms / GiB) |
+ | --- | --- | --- | --- |
+ | **Pure torch-compiled** | | | |
+ | TopK | 8.787 / 2.92 | 11.746 / 2.92 | 18.877 / 2.93 |
+ | HierarchicalTopK | 12.824 / 6.29 | 23.379 / 10.79 | 43.851 / 19.80 |
+ | **Triton kernels** | | | |
+ | TopK | 5.576 / 2.92 | 6.339 / 2.92 | 7.961 / 2.93 |
+ | HierarchicalTopK | **6.696 / 2.92** | **7.995 / 2.92** | **10.609 / 2.93** |
+
+ Across the evaluated sparsity budgets, the fused Triton HierarchicalTopK kernel matches the TopK kernels on memory use while remaining consistently faster than the reference torch implementation. The torch baseline's growing footprint comes from materializing the full `[B, K, D]` tensor of cumulative reconstructions, which the fused kernel never forms.
+
+ ## License & Attribution
+ - All files except `torch-ext/flex_sae/topk_kernels.py` are released under the [Apache License 2.0](LICENSE).
+ - `torch-ext/flex_sae/topk_kernels.py` includes code adapted from Facebook Research's [memory](https://github.com/facebookresearch/memory) project, originally published under the Creative Commons Attribution-NonCommercial 4.0 International License. That component therefore remains available for non-commercial use only; see [NOTICE](NOTICE) for details.
+
+ ## Citation
+ ```bibtex
+ @misc{balagansky2025trainsparseautoencodermultiple,
+       title={Train One Sparse Autoencoder Across Multiple Sparsity Budgets to Preserve Interpretability and Accuracy},
+       author={Nikita Balagansky and Yaroslav Aksenov and Daniil Laptev and Vadim Kurochkin and Gleb Gerasimov and Nikita Koryagin and Daniil Gavrilov},
+       year={2025},
+       eprint={2505.24473},
+       archivePrefix={arXiv},
+       primaryClass={cs.LG},
+       url={https://arxiv.org/abs/2505.24473},
+ }
+ ```
build.toml ADDED
@@ -0,0 +1,3 @@
+ [general]
+ name = "flex_sae"
+ universal = true
build/torch-universal/flex_sae/__init__.py ADDED
@@ -0,0 +1,18 @@
+ # TopK and HierarchicalTopK SAE decoder Triton kernels
+ # Copyright 2025 T-Tech
+
+
+ from .topk_kernels import triton_topk_sae_loss, topk_sae_loss
+ from .hierarchical_kernels import triton_hierarchical_sae_loss, hierarchical_sae_loss
+
+ __kernel_metadata__ = {
+     "license": "Apache-2.0 (with CC-BY-NC-4.0 component; see NOTICE)",
+ }
+
+ __all__ = [
+     "__kernel_metadata__",
+     "topk_sae_loss",
+     "triton_topk_sae_loss",
+     "hierarchical_sae_loss",
+     "triton_hierarchical_sae_loss",
+ ]
build/torch-universal/flex_sae/_ops.py ADDED
@@ -0,0 +1,8 @@
+ import torch
+ ops = torch.ops._flex_sae_20250924130857
+
+ def add_op_namespace_prefix(op_name: str):
+     """
+     Prefix op by namespace.
+     """
+     return f"_flex_sae_20250924130857::{op_name}"
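A minimal sketch of what this helper produces, assuming the package is importable as `flex_sae` (the op name `topk_sae_loss` here is hypothetical; the module only guarantees the namespace string):

```python
from flex_sae._ops import add_op_namespace_prefix

# fully qualified name for torch.ops lookup under the versioned namespace
qualified = add_op_namespace_prefix("topk_sae_loss")
assert qualified == "_flex_sae_20250924130857::topk_sae_loss"
```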
build/torch-universal/flex_sae/hierarchical_kernels.py ADDED
@@ -0,0 +1,323 @@
+ # HierarchicalTopK SAE decoder Triton kernels
+ # Copyright 2025 T-Tech
+
+
+ from typing import Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def hierarchical_sae_forward_kernel(
+     loss_per_batch_ptr,  # [B]
+     final_recon_ptr,     # [B, D]
+     indices_ptr,         # [B, K]
+     weight_ptr,          # [F, D]
+     bias_ptr,            # [D]
+     vals_ptr,            # [B, K]
+     target_ptr,          # [B, D]
+     B: tl.constexpr,
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+     LOOP_NUM_STAGES: tl.constexpr,
+     BLOCK_B: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((B % BLOCK_B) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+     tl.static_assert((BLOCK_B & (BLOCK_B - 1)) == 0, f"{BLOCK_B=} must be a power of 2")
+
+     pid_b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     batch_offsets = pid_b * BLOCK_B + tl.arange(0, BLOCK_B)
+     batch_offsets = batch_offsets.to(tl.int64)
+     tl.multiple_of(batch_offsets, BLOCK_B)
+
+     offset_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+     offset_d = offset_d.to(tl.int64)
+
+     tl.multiple_of(offset_d, BLOCK_D)
+     tl.max_contiguous(offset_d, BLOCK_D)
+
+     batch_d_offset = batch_offsets[:, None] * D + offset_d[None, :]
+
+     bias_tile = tl.load(bias_ptr + offset_d).to(tl.float32)
+
+     recon = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     recon += bias_tile[None, :]
+
+     target = tl.load(target_ptr + batch_d_offset).to(tl.float32)
+
+     loss_accum = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+
+     row_idx_ptr = indices_ptr + batch_offsets * K
+     row_val_ptr = vals_ptr + batch_offsets * K
+
+     idx = tl.load(row_idx_ptr).to(tl.int64)
+     val = tl.load(row_val_ptr).to(tl.float32)
+     val = val[:, None]
+     weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     for t in tl.range(0, K, num_stages=LOOP_NUM_STAGES):
+         recon += weight_tile * val
+         diff = recon - target
+         loss_accum += diff * diff
+
+         # software pipelining: prefetch the next decoder row while this one is used
+         if t + 1 < K:
+             idx_next = tl.load(row_idx_ptr + (t + 1)).to(tl.int64)
+             val_next = tl.load(row_val_ptr + (t + 1)).to(tl.float32)
+             weight_next = tl.load(weight_ptr + idx_next[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+             idx = idx_next
+             val = val_next[:, None]
+             weight_tile = weight_next
+
+     loss_tile = tl.sum(loss_accum, axis=1)
+     tl.atomic_add(
+         loss_per_batch_ptr + batch_offsets,
+         loss_tile,
+         sem="relaxed",
+     )
+     tl.store(
+         final_recon_ptr + batch_d_offset,
+         recon,
+     )
+
+
+ @triton.jit
+ def hierarchical_sae_backward_kernel(
+     weight_grad_ptr,   # [F, D]
+     vals_grad_ptr,     # [B, K]
+     bias_grad_ptr,     # [D]
+     final_recon_ptr,   # [B, D]
+     indices_ptr,       # [B, K]
+     weight_ptr,        # [F, D]
+     vals_ptr,          # [B, K]
+     target_ptr,        # [B, D]
+     B: tl.constexpr,
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+     LOOP_NUM_STAGES: tl.constexpr,
+     BLOCK_B: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((B % BLOCK_B) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+     tl.static_assert((BLOCK_B & (BLOCK_B - 1)) == 0, f"{BLOCK_B=} must be a power of 2")
+
+     pid_b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     batch_offsets = pid_b * BLOCK_B + tl.arange(0, BLOCK_B)
+     batch_offsets = batch_offsets.to(tl.int64)
+     tl.multiple_of(batch_offsets, BLOCK_B)
+
+     offset_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+     offset_d = offset_d.to(tl.int64)
+
+     tl.multiple_of(offset_d, BLOCK_D)
+     tl.max_contiguous(offset_d, BLOCK_D)
+
+     batch_d_offset = batch_offsets[:, None] * D + offset_d[None, :]
+
+     recon = tl.load(final_recon_ptr + batch_d_offset).to(tl.float32)
+     target = tl.load(target_ptr + batch_d_offset).to(tl.float32)
+
+     suffix = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     bias_accum = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     scale = tl.full((), 2.0 / (B * K * D), dtype=tl.float32)
+
+     row_idx_ptr = indices_ptr + batch_offsets * K
+     row_val_ptr = vals_ptr + batch_offsets * K
+     k_offsets = tl.arange(0, K)
+     val_grad_tile = tl.zeros([BLOCK_B, K], dtype=tl.float32)
+
+     step = K - 1
+     idx = tl.load(row_idx_ptr + step).to(tl.int64)
+     val = tl.load(row_val_ptr + step).to(tl.float32)
+     weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     # walk k from K - 1 down to 0, maintaining the suffix sum of upstream gradients
+     for _ in tl.range(0, K, num_stages=LOOP_NUM_STAGES):
+         curr_step = step
+
+         diff = recon - target
+         grad_curr = diff * scale
+         suffix += grad_curr
+         bias_accum += grad_curr
+
+         val_broadcast = val[:, None]
+         contrib = suffix * val_broadcast
+         tl.atomic_add(
+             weight_grad_ptr + idx[:, None] * D + offset_d[None, :],
+             contrib,
+             sem="relaxed",
+         )
+
+         dot_partial = tl.sum(weight_tile * suffix, axis=1)
+         mask_curr = k_offsets[None, :] == curr_step
+         val_grad_tile = tl.where(mask_curr, dot_partial[:, None], val_grad_tile)
+
+         # peel the current term off the cumulative reconstruction
+         recon -= weight_tile * val_broadcast
+
+         if curr_step > 0:
+             step = curr_step - 1
+             idx = tl.load(row_idx_ptr + step).to(tl.int64)
+             val = tl.load(row_val_ptr + step).to(tl.float32)
+             weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     bias_grad_tile = tl.sum(bias_accum, axis=0)
+     tl.atomic_add(
+         bias_grad_ptr + offset_d,
+         bias_grad_tile,
+         sem="relaxed",
+     )
+
+     row_val_grad_ptr = vals_grad_ptr + batch_offsets[:, None] * K + k_offsets[None, :]
+     tl.atomic_add(
+         row_val_grad_ptr,
+         val_grad_tile,
+         sem="relaxed",
+     )
+
+
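+ # For one batch row, with r_t = b + sum_{k<=t} v_k * w_{i_k} the cumulative
+ # reconstruction and g_t = 2/(B*K*D) * (r_t - x) the per-step gradient, the
+ # backward kernel above implements
+ #     dL/dw_{i_k} += v_k * sum_{t>=k} g_t
+ #     dL/dv_k      = <w_{i_k}, sum_{t>=k} g_t>
+ #     dL/db        = sum_t g_t
+ # which is why it walks k from K-1 down to 0: `suffix` accumulates the suffix
+ # sum of g_t, and subtracting v_k * w_{i_k} from `recon` recovers r_{k-1}
+ # without storing the intermediate reconstructions.
+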
+ def _hierarchical_sae_forward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     B, K = indices.shape
+     F, D = weight.shape
+
+     loss_per_batch = torch.zeros((B,), dtype=torch.float32, device=weight.device)
+     final_recon = torch.empty((B, D), dtype=torch.float32, device=weight.device)
+
+     def _forward_grid(meta):
+         return (
+             B // meta["BLOCK_B"],
+             D // meta["BLOCK_D"],
+         )
+
+     hierarchical_sae_forward_kernel[_forward_grid](
+         loss_per_batch,
+         final_recon,
+         indices,
+         weight,
+         bias,
+         vals,
+         target,
+         B=B,
+         D=D,
+         K=K,
+         BLOCK_D=64,
+         LOOP_NUM_STAGES=4,
+         BLOCK_B=1,
+         num_warps=2,
+         num_stages=2,
+     )
+     # equals diff.pow(2).mean() over the [B, K, D] cumulative-diff tensor
+     loss = loss_per_batch.sum() / (B * K * D)
+     return loss, final_recon
+
+
+ def _hierarchical_sae_backward(
+     indices: torch.Tensor,      # [B, K]
+     weight: torch.Tensor,       # [F, D]
+     vals: torch.Tensor,         # [B, K]
+     target: torch.Tensor,       # [B, D]
+     final_recon: torch.Tensor,  # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     device = weight.device
+     B, K = indices.shape
+     F, D = weight.shape
+
+     dW = torch.zeros((F, D), dtype=torch.float32, device=device)
+     dVals = torch.zeros((B, K), dtype=torch.float32, device=device)
+     db = torch.zeros((D,), dtype=torch.float32, device=device)
+
+     def _backward_grid(meta):
+         return (
+             B // meta["BLOCK_B"],
+             D // meta["BLOCK_D"],
+         )
+
+     hierarchical_sae_backward_kernel[_backward_grid](
+         dW,
+         dVals,
+         db,
+         final_recon,
+         indices,
+         weight,
+         vals,
+         target,
+         B=B,
+         D=D,
+         K=K,
+         BLOCK_D=32,
+         LOOP_NUM_STAGES=16,
+         BLOCK_B=16,
+         num_warps=8,
+         num_stages=8,
+     )
+
+     return dW, dVals, db
+
+
+ class HierarchicalSAELossFunction(torch.autograd.Function):
+     @staticmethod
+     @torch.amp.custom_fwd(device_type="cuda")
+     def forward(
+         ctx,
+         indices: torch.Tensor,  # [B, K]
+         weight: torch.Tensor,   # [F, D]
+         vals: torch.Tensor,     # [B, K]
+         bias: torch.Tensor,     # [D]
+         target: torch.Tensor,   # [B, D]
+     ):
+         loss, final_recon = _hierarchical_sae_forward(indices, weight, vals, bias, target)
+         ctx.save_for_backward(indices, weight, vals, target, final_recon)
+         return loss
+
+     @staticmethod
+     @torch.amp.custom_bwd(device_type="cuda")
+     def backward(ctx, grad):
+         indices, weight, vals, target, final_recon = ctx.saved_tensors
+         dW, dVals, db = _hierarchical_sae_backward(indices, weight, vals, target, final_recon)
+
+         if grad is not None:
+             dW.mul_(grad)
+             dVals.mul_(grad)
+             db.mul_(grad)
+
+         return None, dW, dVals, db, None
+
+
+ def triton_hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     return HierarchicalSAELossFunction.apply(indices, weight, vals, bias, target)
+
+
+ def hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon_cum = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).cumsum(dim=1)
+     diff = recon_cum.to(torch.float32) - target.to(torch.float32).unsqueeze(1)
+     loss = diff.pow(2).mean()
+     return loss
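As a quick sanity check on the two reference definitions (a standalone sketch, not part of the package, assuming `flex_sae` and its `triton` dependency are importable): for $K = 1$ the cumulative sum has a single step, so the hierarchical and TopK reference losses coincide. The reference functions are pure torch, so this runs on CPU.

```python
import torch
from flex_sae import hierarchical_sae_loss, topk_sae_loss

B, K, F, D = 4, 1, 32, 16  # K = 1: one cumulative step
indices = torch.randint(0, F, (B, K))
weight = torch.randn(F, D)
vals = torch.randn(B, K).abs()
bias = torch.randn(D)
target = torch.randn(B, D)

# both reduce to the mean squared error of the single-step reconstruction
assert torch.allclose(
    hierarchical_sae_loss(indices, weight, vals, bias, target),
    topk_sae_loss(indices, weight, vals, bias, target),
)
```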
build/torch-universal/flex_sae/topk_kernels.py ADDED
@@ -0,0 +1,237 @@
+ # TopK SAE decoder Triton kernels
+ # Copyright 2025 T-Tech
+ # This code is adapted from Facebook Research under the
+ # Creative Commons Attribution-NonCommercial 4.0 International License.
+ # Original code can be found at: https://github.com/facebookresearch/memory
+
+
+ from typing import Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def embedding_bag_forward_kernel(
+     out_ptr,      # [B, D]
+     indices_ptr,  # [B, K]
+     weight_ptr,   # [F, D]
+     vals_ptr,     # [B, K]
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+
+     b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+
+     out_value = tl.zeros([BLOCK_D], dtype=tl.float32)
+     for i in tl.range(K):
+         my_index = tl.load(indices_ptr + b * K + i).to(tl.int64)
+         my_scaling = tl.load(vals_ptr + b * K + i)
+         w_tile = tl.load(weight_ptr + my_index * D + off_d).to(tl.float32)
+         out_value += w_tile * my_scaling
+
+     tl.store(out_ptr + b * D + off_d, out_value)
+
+
+ def embedding_bag_forward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+ ) -> torch.Tensor:
+     B, K = indices.shape
+     D = weight.shape[1]
+
+     trt_out = torch.empty([B, D], dtype=weight.dtype, device=weight.device)
+
+     def _forward_grid(meta):
+         return (B, D // meta["BLOCK_D"])
+
+     embedding_bag_forward_kernel[_forward_grid](
+         trt_out,
+         indices,
+         weight,
+         vals,
+         D=D,
+         K=K,
+         BLOCK_D=64,
+         num_warps=1,
+         num_stages=1,
+     )
+     return trt_out
+
+
+ @triton.jit
+ def count_per_embedding_kernel(
+     count_per_emb_ptr,  # [F + 1]
+     indices_ptr,        # [B, K]
+     K: tl.constexpr,
+ ):
+     batch_id = tl.program_id(axis=0).to(tl.int64)
+     for t in tl.range(K):
+         embedding_id = tl.load(indices_ptr + batch_id * K + t)
+         tl.atomic_add(count_per_emb_ptr + embedding_id + 1, 1, sem="relaxed")
+
+
+ @triton.jit
+ def map_embeddings_and_outputs_kernel(
+     reverse_mapping_ptr,    # [B * K]
+     mapping_write_pos_ptr,  # [F]
+     indices_ptr,            # [B, K]
+     K: tl.constexpr,
+ ):
+     batch_id = tl.program_id(axis=0).to(tl.int64)
+     for t in tl.range(K):
+         embedding_id = tl.load(indices_ptr + batch_id * K + t)
+         write_pos = tl.atomic_add(mapping_write_pos_ptr + embedding_id, 1, sem="relaxed")
+         tl.store(reverse_mapping_ptr + write_pos, batch_id * K + t)
+
+
+ @triton.jit
+ def aggregate_gradient_for_embedding_kernel(
+     weight_grad_ptr,      # [F, D]
+     vals_grad_ptr,        # [B, K]
+     weight_ptr,           # [F, D]
+     emb_begin_pos_ptr,    # [F + 1]
+     reverse_mapping_ptr,  # [B * K]
+     vals_ptr,             # [B, K]
+     gradient_ptr,         # [B, D]
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+
+     e = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+
+     begin = tl.load(emb_begin_pos_ptr + e)
+     end = tl.load(emb_begin_pos_ptr + e + 1)
+
+     w_row_tile = tl.load(weight_ptr + e * D + off_d).to(tl.float32)
+     w_grad_tile = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+     for idx in tl.range(begin, end):
+         out_linear = tl.load(reverse_mapping_ptr + idx).to(tl.int64)
+         b = out_linear // K
+
+         psw = tl.load(vals_ptr + out_linear)
+         g_tile = tl.load(gradient_ptr + b * D + off_d).to(tl.float32)
+
+         w_grad_tile += psw * g_tile
+
+         psw_grad_partial = tl.sum(g_tile * w_row_tile)
+         tl.atomic_add(vals_grad_ptr + out_linear, psw_grad_partial, sem="relaxed")
+
+     tl.store(weight_grad_ptr + e * D + off_d, w_grad_tile)
+
+
+ def embedding_bag_backward(
+     indices: torch.Tensor,   # [B, K]
+     weight: torch.Tensor,    # [F, D]
+     vals: torch.Tensor,      # [B, K]
+     gradient: torch.Tensor,  # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     F, D = weight.shape
+     B, K = indices.shape
+
+     # CSR-style grouping: count how many (b, k) slots used each embedding ...
+     count_per_emb = torch.zeros((F + 1,), dtype=torch.uint32, device=indices.device)
+     count_per_embedding_kernel[(B,)](count_per_emb, indices, K=K, num_warps=1)
+
+     emb_begin_pos = count_per_emb.cumsum(0)  # [F + 1]
+
+     # ... then scatter the flat slot ids so each embedding's users are contiguous
+     reverse_mapping = torch.empty([B * K], dtype=torch.uint32, device=indices.device)
+     assert B * K <= 2 ** (reverse_mapping.dtype.itemsize * 8) - 1
+
+     map_embeddings_and_outputs_kernel[(B,)](
+         reverse_mapping_ptr=reverse_mapping,
+         mapping_write_pos_ptr=emb_begin_pos.clone(),
+         indices_ptr=indices,
+         K=K,
+         num_warps=1,
+     )
+
+     weight_grad = torch.empty_like(weight, dtype=torch.float32)  # [F, D]
+     vals_grad = torch.zeros_like(vals, dtype=torch.float32)      # [B, K]
+
+     def _forward_grid(meta):
+         return (F, D // meta["BLOCK_D"])
+
+     aggregate_gradient_for_embedding_kernel[_forward_grid](
+         weight_grad_ptr=weight_grad,
+         vals_grad_ptr=vals_grad,
+         weight_ptr=weight,
+         emb_begin_pos_ptr=emb_begin_pos,
+         reverse_mapping_ptr=reverse_mapping,
+         vals_ptr=vals,
+         gradient_ptr=gradient,
+         D=D,
+         K=K,
+         BLOCK_D=256,
+         num_warps=1,
+         num_stages=2,
+     )
+     return weight_grad, vals_grad
+
+
+ class xFormersEmbeddingBag(torch.autograd.Function):
+     @staticmethod
+     @torch.amp.custom_fwd(device_type="cuda")
+     def forward(
+         ctx,
+         indices: torch.Tensor,  # [B, K]
+         weight: torch.Tensor,   # [F, D]
+         vals: torch.Tensor,     # [B, K]
+     ) -> torch.Tensor:
+         ctx.save_for_backward(indices, weight, vals)
+         return embedding_bag_forward(indices, weight, vals)  # [B, D]
+
+     @staticmethod
+     @torch.amp.custom_bwd(device_type="cuda")
+     def backward(ctx, gradient):
+         indices, weight, vals = ctx.saved_tensors
+         weight_g, vals_g = embedding_bag_backward(
+             indices,
+             weight,
+             vals,
+             gradient,
+         )
+         return None, weight_g, vals_g
+
+
+ def triton_topk_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     recon = bias.to(torch.float32) + xFormersEmbeddingBag.apply(indices, weight, vals)
+     diff = recon.to(torch.float32) - target.to(torch.float32)
+     loss = diff.pow(2).mean()
+     return loss
+
+
+ def topk_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).sum(dim=1)
+     diff = recon.to(torch.float32) - target.to(torch.float32)
+     loss = diff.pow(2).mean()
+     return loss
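For intuition, a small pure-torch sketch (illustrative only, not the kernel code) of the bookkeeping that `count_per_embedding_kernel`, the cumulative sum, and `map_embeddings_and_outputs_kernel` build: per-embedding counts, begin positions, and a reverse mapping from each embedding to the flat `b * K + k` slots that selected it.

```python
import torch

B, K, F = 4, 3, 8
indices = torch.randint(0, F, (B, K))

# count_per_emb[e + 1] = number of (b, k) slots that selected embedding e
count_per_emb = torch.zeros(F + 1, dtype=torch.long)
count_per_emb.scatter_add_(0, indices.flatten() + 1, torch.ones(B * K, dtype=torch.long))

# begin position of each embedding's contiguous slot group
emb_begin_pos = count_per_emb.cumsum(0)  # [F + 1]

# flat slot ids b * K + k, grouped by embedding id (stable sort keeps slot order)
reverse_mapping = indices.flatten().argsort(stable=True)

# all slots that used embedding e sit in one contiguous slice
e = int(indices[0, 0])
slots = reverse_mapping[emb_begin_pos[e] : emb_begin_pos[e + 1]]
assert torch.all(indices.flatten()[slots] == e)
```

The aggregation kernel then launches one program per embedding row and walks exactly this slice, so each decoder row's gradient is accumulated with a plain store on `weight_grad`; atomics are only needed for `vals_grad`.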
example.py ADDED
@@ -0,0 +1,100 @@
+ # /// script
+ # dependencies = [
+ #   "torch",
+ #   "numpy",
+ #   "kernels",
+ # ]
+ # ///
+
+ import torch
+ import numpy as np
+ from kernels import get_kernel
+
+ flex = get_kernel("t-tech/flex-sae")  # fast kernels from the Hub
+
+ @torch.compile(fullgraph=True)
+ def hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,   # [F, D]
+     vals: torch.Tensor,     # [B, K]
+     bias: torch.Tensor,     # [D]
+     target: torch.Tensor,   # [B, D]
+ ) -> torch.Tensor:
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon_cum = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).cumsum(dim=1)
+     diff = recon_cum.to(torch.float32) - target.to(torch.float32).unsqueeze(1)
+     loss = diff.pow(2).mean()
+     return loss
+
+
+ B = 2048
+ K = 256
+ F = 1024 * 128
+ D = 1024
+ warmup = 5
+ dtype = torch.float32
+
+ vals = None
+ decoder = None
+ bias = None
+ target = None
+ indices = None
+
+
+ def init_parameters():
+     global vals, decoder, bias, target, indices
+     vals = torch.randn(B, K, dtype=dtype, device="cuda").abs().requires_grad_()
+     decoder = torch.randn(F, D, dtype=dtype, device="cuda", requires_grad=True)
+     bias = torch.randn(D, dtype=dtype, device="cuda", requires_grad=True)
+     target = torch.randn(B, D, dtype=dtype, device="cuda")
+     indices = torch.randint(0, F, (B, K), dtype=torch.long, device="cuda")
+
+
+ timing_kernel = []
+ timing_vanilla = []
+ torch.cuda.reset_peak_memory_stats()
+ loss_kernel_list = torch.zeros((100,))
+ loss_vanilla_list = torch.zeros((100,))
+
+
+ def zero_grad():
+     vals.grad = None
+     decoder.grad = None
+     bias.grad = None
+     torch.cuda.empty_cache()
+
+
+ for i in range(100 + warmup):
+     init_parameters()
+     start_kernel = torch.cuda.Event(enable_timing=True)
+     end_kernel = torch.cuda.Event(enable_timing=True)
+     start_vanilla = torch.cuda.Event(enable_timing=True)
+     end_vanilla = torch.cuda.Event(enable_timing=True)
+
+     start_kernel.record()
+     loss_kernel = flex.triton_hierarchical_sae_loss(indices, decoder, vals, bias, target)
+     loss_kernel.backward()
+     end_kernel.record()
+
+     zero_grad()
+     start_vanilla.record()
+     loss_vanilla = hierarchical_sae_loss(indices, decoder, vals, bias, target)
+     loss_vanilla.backward()
+     end_vanilla.record()
+     if i >= warmup:
+         torch.cuda.synchronize()
+         timing_kernel.append(start_kernel.elapsed_time(end_kernel))
+         timing_vanilla.append(start_vanilla.elapsed_time(end_vanilla))
+         loss_kernel_list[i - warmup] = loss_kernel.detach()
+         loss_vanilla_list[i - warmup] = loss_vanilla.detach()
+     zero_grad()
+
+ if torch.allclose(loss_kernel, loss_vanilla):
+     print("✅ Outputs are close! Everything is good! 🎉")
+ else:
+     print("❌ Outputs mismatch... ⚠️🤔")
+
+
+ print(f"🦎 Triton Kernel Time (Ours): {np.mean(timing_kernel):.4f} ± {np.std(timing_kernel):.4f} ms")
+ print(f"🔥 Torch Compile Kernel Time: {np.mean(timing_vanilla):.4f} ± {np.std(timing_vanilla):.4f} ms")
+ print(f"🚀 Speedup: {np.mean(timing_vanilla) / np.mean(timing_kernel):.2f}x")
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1747046372,
6
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-compat_2": {
19
+ "locked": {
20
+ "lastModified": 1747046372,
21
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
22
+ "owner": "edolstra",
23
+ "repo": "flake-compat",
24
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
25
+ "type": "github"
26
+ },
27
+ "original": {
28
+ "owner": "edolstra",
29
+ "repo": "flake-compat",
30
+ "type": "github"
31
+ }
32
+ },
33
+ "flake-utils": {
34
+ "inputs": {
35
+ "systems": "systems"
36
+ },
37
+ "locked": {
38
+ "lastModified": 1731533236,
39
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
40
+ "owner": "numtide",
41
+ "repo": "flake-utils",
42
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
43
+ "type": "github"
44
+ },
45
+ "original": {
46
+ "owner": "numtide",
47
+ "repo": "flake-utils",
48
+ "type": "github"
49
+ }
50
+ },
51
+ "flake-utils_2": {
52
+ "inputs": {
53
+ "systems": "systems_2"
54
+ },
55
+ "locked": {
56
+ "lastModified": 1731533236,
57
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
58
+ "owner": "numtide",
59
+ "repo": "flake-utils",
60
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
61
+ "type": "github"
62
+ },
63
+ "original": {
64
+ "owner": "numtide",
65
+ "repo": "flake-utils",
66
+ "type": "github"
67
+ }
68
+ },
69
+ "hf-nix": {
70
+ "inputs": {
71
+ "flake-compat": "flake-compat_2",
72
+ "flake-utils": "flake-utils_2",
73
+ "nixpkgs": "nixpkgs"
74
+ },
75
+ "locked": {
76
+ "lastModified": 1757675377,
77
+ "narHash": "sha256-JQKZOI1ZYO4faJnanuoTXziSmqzXe5rEFSGliWDWqWw=",
78
+ "owner": "huggingface",
79
+ "repo": "hf-nix",
80
+ "rev": "faf3354403a7381958d08e826c15fe30f6986a4f",
81
+ "type": "github"
82
+ },
83
+ "original": {
84
+ "owner": "huggingface",
85
+ "repo": "hf-nix",
86
+ "type": "github"
87
+ }
88
+ },
89
+ "kernel-builder": {
90
+ "inputs": {
91
+ "flake-compat": "flake-compat",
92
+ "flake-utils": "flake-utils",
93
+ "hf-nix": "hf-nix",
94
+ "nixpkgs": [
95
+ "kernel-builder",
96
+ "hf-nix",
97
+ "nixpkgs"
98
+ ]
99
+ },
100
+ "locked": {
101
+ "lastModified": 1758713083,
102
+ "narHash": "sha256-C7yob+hU6/IL7NDX0GVBxKKY3GPVNOwX9OU+LRCCVrk=",
103
+ "owner": "huggingface",
104
+ "repo": "kernel-builder",
105
+ "rev": "051fbc3dfe6afdbe01a6f15197b440d0333090cd",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "huggingface",
110
+ "repo": "kernel-builder",
111
+ "type": "github"
112
+ }
113
+ },
114
+ "nixpkgs": {
115
+ "locked": {
116
+ "lastModified": 1755963616,
117
+ "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
118
+ "owner": "nixos",
119
+ "repo": "nixpkgs",
120
+ "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
121
+ "type": "github"
122
+ },
123
+ "original": {
124
+ "owner": "nixos",
125
+ "ref": "nixos-unstable-small",
126
+ "repo": "nixpkgs",
127
+ "type": "github"
128
+ }
129
+ },
130
+ "root": {
131
+ "inputs": {
132
+ "kernel-builder": "kernel-builder"
133
+ }
134
+ },
135
+ "systems": {
136
+ "locked": {
137
+ "lastModified": 1681028828,
138
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
139
+ "owner": "nix-systems",
140
+ "repo": "default",
141
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
142
+ "type": "github"
143
+ },
144
+ "original": {
145
+ "owner": "nix-systems",
146
+ "repo": "default",
147
+ "type": "github"
148
+ }
149
+ },
150
+ "systems_2": {
151
+ "locked": {
152
+ "lastModified": 1681028828,
153
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
154
+ "owner": "nix-systems",
155
+ "repo": "default",
156
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
157
+ "type": "github"
158
+ },
159
+ "original": {
160
+ "owner": "nix-systems",
161
+ "repo": "default",
162
+ "type": "github"
163
+ }
164
+ }
165
+ },
166
+ "root": "root",
167
+ "version": 7
168
+ }
flake.nix ADDED
@@ -0,0 +1,18 @@
+ {
+   description = "Flake for TopK and HierarchicalTopK SAE Triton kernels";
+
+   inputs = {
+     kernel-builder.url = "github:huggingface/kernel-builder";
+   };
+
+   outputs =
+     {
+       self,
+       kernel-builder,
+     }:
+     kernel-builder.lib.genFlakeOutputs {
+       path = ./.;
+       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+       doGetKernelCheck = true;
+     };
+ }
tests/__init__.py ADDED
File without changes
tests/test_all_kernels.py ADDED
@@ -0,0 +1,66 @@
+ from typing import Callable
+ import pytest
+ import torch
+
+ pytest.importorskip("torch.cuda")
+ from .test_setup import DTYPES, DTYPE_TO_TOLS, PARAMS, SEED
+ from flex_sae import (
+     triton_hierarchical_sae_loss,
+     hierarchical_sae_loss,
+     triton_topk_sae_loss,
+     topk_sae_loss,
+ )
+
+
+ @pytest.fixture(autouse=True)
+ def _set_cuda_default_device():
+     torch.set_default_device("cuda")
+
+
+ def run_funcs(B, K, F, D, dtype, *, kernel_foo: Callable, ref_foo: Callable):
+     if dtype is torch.bfloat16 and not torch.cuda.is_bf16_supported():
+         pytest.skip("BF16 not supported on this GPU")
+
+     torch.manual_seed(SEED)
+
+     indices = torch.randint(0, F, (B, K), dtype=torch.long, device="cuda")
+
+     vals = torch.randn(B, K, dtype=dtype, device="cuda").abs().requires_grad_()
+     decoder = torch.randn(F, D, dtype=dtype, device="cuda", requires_grad=True)
+     bias = torch.randn(D, dtype=dtype, device="cuda", requires_grad=True)
+     target = torch.randn(B, D, dtype=dtype, device="cuda")
+
+     sv_ref = vals.clone().detach().requires_grad_()
+     dec_ref = decoder.clone().detach().requires_grad_()
+     bias_ref = bias.clone().detach().requires_grad_()
+
+     loss_f = kernel_foo(indices, decoder, vals, bias, target)
+     loss_r = ref_foo(indices, dec_ref, sv_ref, bias_ref, target)
+
+     torch.testing.assert_close(loss_f, loss_r, **DTYPE_TO_TOLS[dtype])
+
+     grad_out = torch.randn((), device="cuda", dtype=torch.float32)
+     loss_f.backward(grad_out)
+     loss_r.backward(grad_out.clone())
+
+     torch.testing.assert_close(vals.grad, sv_ref.grad, **DTYPE_TO_TOLS[dtype])
+     torch.testing.assert_close(decoder.grad, dec_ref.grad, **DTYPE_TO_TOLS[dtype])
+     torch.testing.assert_close(bias.grad, bias_ref.grad, **DTYPE_TO_TOLS[dtype])
+
+     assert indices.grad is None
+
+
+ @pytest.mark.parametrize("B, K, F, D", PARAMS)
+ @pytest.mark.parametrize("dtype", DTYPES)
+ def test_triton_hierarchical_sae_loss_and_grads(B, K, F, D, dtype):
+     run_funcs(B, K, F, D, dtype, kernel_foo=triton_hierarchical_sae_loss, ref_foo=hierarchical_sae_loss)
+     torch.cuda.empty_cache()
+
+
+ @pytest.mark.parametrize("B, K, F, D", PARAMS)
+ @pytest.mark.parametrize("dtype", DTYPES)
+ def test_topk_sae_loss_and_grads(B, K, F, D, dtype):
+     run_funcs(
+         B, K, F, D, dtype, kernel_foo=triton_topk_sae_loss, ref_foo=topk_sae_loss
+     )
+     torch.cuda.empty_cache()
tests/test_setup.py ADDED
@@ -0,0 +1,41 @@
+ import torch
+
+ SEED = 1234
+
+ PARAMS = [
+     (16, 16, 64, 512),
+     (16, 32, 96, 768),
+     (16, 64, 128, 1024),
+     (32, 16, 128, 512),
+     (32, 32, 160, 768),
+     (32, 64, 192, 1024),
+     (48, 32, 176, 1024),
+     (48, 64, 224, 1280),
+     (64, 16, 192, 768),
+     (64, 32, 224, 1024),
+     (64, 128, 256, 2048),
+     (80, 32, 240, 1280),
+     (80, 64, 256, 1536),
+     (96, 32, 256, 1536),
+     (96, 64, 288, 2048),
+     (96, 128, 320, 3072),
+     (112, 64, 320, 2048),
+     (112, 128, 352, 2560),
+     (128, 32, 256, 1024),
+     (128, 64, 320, 1536),
+     (128, 128, 384, 3072),
+     (160, 64, 320, 1536),
+     (160, 128, 384, 2560),
+     (192, 64, 384, 2048),
+     (192, 128, 448, 3072),
+     (192, 256, 512, 4096),
+ ]
+
+
+ DTYPE_TO_TOLS = {
+     torch.float32: {"atol": 1e-4, "rtol": 1e-3},
+     torch.bfloat16: {"atol": 1e-2, "rtol": 1e-2},
+     torch.float16: {"atol": 1e-3, "rtol": 1e-3},
+ }
+
+ DTYPES = list(DTYPE_TO_TOLS.keys())
torch-ext/flex_sae/__init__.py ADDED
@@ -0,0 +1,18 @@
Contents are identical to `build/torch-universal/flex_sae/__init__.py` above.
torch-ext/flex_sae/hierarchical_kernels.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ # HierarchicalTopK SAE decoder Triton kernels
+ # Copyright 2025 T-Tech
+
+
+ from typing import Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def hierarchical_sae_forward_kernel(
+     loss_per_batch_ptr,  # [B]
+     final_recon_ptr,  # [B, D]
+     indices_ptr,  # [B, K]
+     weight_ptr,  # [F, D]
+     bias_ptr,  # [D]
+     vals_ptr,  # [B, K]
+     target_ptr,  # [B, D]
+     B: tl.constexpr,
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+     LOOP_NUM_STAGES: tl.constexpr,
+     BLOCK_B: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((B % BLOCK_B) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+     tl.static_assert((BLOCK_B & (BLOCK_B - 1)) == 0, f"{BLOCK_B=} must be a power of 2")
+
+     pid_b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     batch_offsets = pid_b * BLOCK_B + tl.arange(0, BLOCK_B)
+     batch_offsets = batch_offsets.to(tl.int64)
+     tl.multiple_of(batch_offsets, BLOCK_B)
+
+     offset_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+     offset_d = offset_d.to(tl.int64)
+
+     tl.multiple_of(offset_d, BLOCK_D)
+     tl.max_contiguous(offset_d, BLOCK_D)
+
+     batch_d_offset = batch_offsets[:, None] * D + offset_d[None, :]
+
+     bias_tile = tl.load(bias_ptr + offset_d).to(tl.float32)
+
+     recon = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     recon += bias_tile[None, :]
+
+     target = tl.load(target_ptr + batch_d_offset).to(tl.float32)
+
+     loss_accum = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+
+     row_idx_ptr = indices_ptr + batch_offsets * K
+     row_val_ptr = vals_ptr + batch_offsets * K
+
+     # Software pipelining: level t + 1's index, value, and decoder row are
+     # prefetched while level t is being accumulated.
+     idx = tl.load(row_idx_ptr).to(tl.int64)
+     val = tl.load(row_val_ptr).to(tl.float32)
+     val = val[:, None]
+     weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     for t in tl.range(0, K, num_stages=LOOP_NUM_STAGES):
+         recon += weight_tile * val
+         diff = recon - target
+         loss_accum += diff * diff
+
+         if t + 1 < K:
+             idx_next = tl.load(row_idx_ptr + (t + 1)).to(tl.int64)
+             val_next = tl.load(row_val_ptr + (t + 1)).to(tl.float32)
+             weight_next = tl.load(weight_ptr + idx_next[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+             idx = idx_next
+             val = val_next[:, None]
+             weight_tile = weight_next
+
+     loss_tile = tl.sum(loss_accum, axis=1)
+     tl.atomic_add(
+         loss_per_batch_ptr + batch_offsets,
+         loss_tile,
+         sem="relaxed",
+     )
+     tl.store(
+         final_recon_ptr + batch_d_offset,
+         recon,
+     )
+
+
+ @triton.jit
+ def hierarchical_sae_backward_kernel(
+     weight_grad_ptr,  # [F, D]
+     vals_grad_ptr,  # [B, K]
+     bias_grad_ptr,  # [D]
+     final_recon_ptr,  # [B, D]
+     indices_ptr,  # [B, K]
+     weight_ptr,  # [F, D]
+     vals_ptr,  # [B, K]
+     target_ptr,  # [B, D]
+     B: tl.constexpr,
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+     LOOP_NUM_STAGES: tl.constexpr,
+     BLOCK_B: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((B % BLOCK_B) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+     tl.static_assert((BLOCK_B & (BLOCK_B - 1)) == 0, f"{BLOCK_B=} must be a power of 2")
+
+     pid_b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     batch_offsets = pid_b * BLOCK_B + tl.arange(0, BLOCK_B)
+     batch_offsets = batch_offsets.to(tl.int64)
+     tl.multiple_of(batch_offsets, BLOCK_B)
+
+     offset_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+     offset_d = offset_d.to(tl.int64)
+
+     tl.multiple_of(offset_d, BLOCK_D)
+     tl.max_contiguous(offset_d, BLOCK_D)
+
+     batch_d_offset = batch_offsets[:, None] * D + offset_d[None, :]
+
+     recon = tl.load(final_recon_ptr + batch_d_offset).to(tl.float32)
+     target = tl.load(target_ptr + batch_d_offset).to(tl.float32)
+
+     suffix = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     bias_accum = tl.zeros([BLOCK_B, BLOCK_D], dtype=tl.float32)
+     scale = tl.full((), 2.0 / (B * K * D), dtype=tl.float32)
+
+     row_idx_ptr = indices_ptr + batch_offsets * K
+     row_val_ptr = vals_ptr + batch_offsets * K
+     k_offsets = tl.arange(0, K)
+     val_grad_tile = tl.zeros([BLOCK_B, K], dtype=tl.float32)
+
+     # Sweep the levels from K - 1 down to 0, peeling one decoder contribution
+     # off the final reconstruction per step; `suffix` accumulates the loss
+     # gradient of every level >= the current one.
+     step = K - 1
+     idx = tl.load(row_idx_ptr + step).to(tl.int64)
+     val = tl.load(row_val_ptr + step).to(tl.float32)
+     weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     for _ in tl.range(0, K, num_stages=LOOP_NUM_STAGES):
+         curr_step = step
+
+         diff = recon - target
+         grad_curr = diff * scale
+         suffix += grad_curr
+         bias_accum += grad_curr
+
+         val_broadcast = val[:, None]
+         contrib = suffix * val_broadcast
+         tl.atomic_add(
+             weight_grad_ptr + idx[:, None] * D + offset_d[None, :],
+             contrib,
+             sem="relaxed",
+         )
+
+         dot_partial = tl.sum(weight_tile * suffix, axis=1)
+         mask_curr = k_offsets[None, :] == curr_step
+         val_grad_tile = tl.where(mask_curr, dot_partial[:, None], val_grad_tile)
+
+         recon -= weight_tile * val_broadcast
+
+         if curr_step > 0:
+             step = curr_step - 1
+             idx = tl.load(row_idx_ptr + step).to(tl.int64)
+             val = tl.load(row_val_ptr + step).to(tl.float32)
+             weight_tile = tl.load(weight_ptr + idx[:, None] * D + offset_d[None, :]).to(tl.float32)
+
+     bias_grad_tile = tl.sum(bias_accum, axis=0)
+     tl.atomic_add(
+         bias_grad_ptr + offset_d,
+         bias_grad_tile,
+         sem="relaxed",
+     )
+
+     row_val_grad_ptr = vals_grad_ptr + batch_offsets[:, None] * K + k_offsets[None, :]
+     tl.atomic_add(
+         row_val_grad_ptr,
+         val_grad_tile,
+         sem="relaxed",
+     )
+
+
+ def _hierarchical_sae_forward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     bias: torch.Tensor,  # [D]
+     target: torch.Tensor,  # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     B, K = indices.shape
+     F, D = weight.shape
+
+     loss_per_batch = torch.zeros((B,), dtype=torch.float32, device=weight.device)
+     final_recon = torch.empty((B, D), dtype=torch.float32, device=weight.device)
+
+     def _forward_grid(meta):
+         return (
+             B // meta["BLOCK_B"],
+             D // meta["BLOCK_D"],
+         )
+
+     hierarchical_sae_forward_kernel[_forward_grid](
+         loss_per_batch,
+         final_recon,
+         indices,
+         weight,
+         bias,
+         vals,
+         target,
+         B=B,
+         D=D,
+         K=K,
+         BLOCK_D=64,
+         LOOP_NUM_STAGES=4,
+         BLOCK_B=1,
+         num_warps=2,
+         num_stages=2,
+     )
+     loss = loss_per_batch.sum() / (B * K * D)
+     return loss, final_recon
+
+
+ def _hierarchical_sae_backward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     target: torch.Tensor,  # [B, D]
+     final_recon: torch.Tensor,  # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     device = weight.device
+     B, K = indices.shape
+     F, D = weight.shape
+
+     dW = torch.zeros((F, D), dtype=torch.float32, device=device)
+     dVals = torch.zeros((B, K), dtype=torch.float32, device=device)
+     db = torch.zeros((D,), dtype=torch.float32, device=device)
+
+     def _backward_grid(meta):
+         return (
+             B // meta["BLOCK_B"],
+             D // meta["BLOCK_D"],
+         )
+
+     hierarchical_sae_backward_kernel[_backward_grid](
+         dW,
+         dVals,
+         db,
+         final_recon,
+         indices,
+         weight,
+         vals,
+         target,
+         B=B,
+         D=D,
+         K=K,
+         BLOCK_D=32,
+         LOOP_NUM_STAGES=16,
+         BLOCK_B=16,
+         num_warps=8,
+         num_stages=8,
+     )
+
+     return dW, dVals, db
+
+
+ class HierarchicalSAELossFunction(torch.autograd.Function):
+     @staticmethod
+     @torch.amp.custom_fwd(device_type="cuda")
+     def forward(
+         ctx,
+         indices: torch.Tensor,  # [B, K]
+         weight: torch.Tensor,  # [F, D]
+         vals: torch.Tensor,  # [B, K]
+         bias: torch.Tensor,  # [D]
+         target: torch.Tensor,  # [B, D]
+     ):
+         loss, final_recon = _hierarchical_sae_forward(indices, weight, vals, bias, target)
+         ctx.save_for_backward(indices, weight, vals, target, final_recon)
+         return loss
+
+     @staticmethod
+     @torch.amp.custom_bwd(device_type="cuda")
+     def backward(ctx, grad):
+         indices, weight, vals, target, final_recon = ctx.saved_tensors
+         dW, dVals, db = _hierarchical_sae_backward(indices, weight, vals, target, final_recon)
+
+         if grad is not None:
+             dW.mul_(grad)
+             dVals.mul_(grad)
+             db.mul_(grad)
+
+         return None, dW, dVals, db, None
+
+
+ def triton_hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     bias: torch.Tensor,  # [D]
+     target: torch.Tensor,  # [B, D]
+ ) -> torch.Tensor:
+     return HierarchicalSAELossFunction.apply(indices, weight, vals, bias, target)
+
+
+ def hierarchical_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     bias: torch.Tensor,  # [D]
+     target: torch.Tensor,  # [B, D]
+ ) -> torch.Tensor:
+     # Pure-PyTorch reference: squared error over all K cumulative (prefix) reconstructions.
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon_cum = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).cumsum(dim=1)  # [B, K, D]
+     diff = recon_cum - target.to(torch.float32).unsqueeze(1)
+     loss = diff.pow(2).mean()
+     return loss
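
The reference `hierarchical_sae_loss` above pins down the objective the fused kernels implement: the squared error is taken against every prefix reconstruction `bias + sum_{t <= k} vals[:, t] * weight[indices[:, t]]` for `k = 0..K-1` and averaged over all `B * K * D` terms, which is why the backward kernel sweeps the levels in reverse (each decoder row receives the suffix of level gradients) and scales by `2 / (B * K * D)`. A quick equivalence check against the Triton path, as a sketch (illustrative sizes chosen to satisfy the block-size asserts; assumes a CUDA device and that the package imports as `flex_sae`):

    import torch
    from flex_sae import hierarchical_sae_loss, triton_hierarchical_sae_loss

    torch.manual_seed(0)
    B, K, F, D = 16, 16, 1024, 256
    weight = torch.randn(F, D, device="cuda")
    bias = torch.randn(D, device="cuda")
    indices = torch.randint(0, F, (B, K), device="cuda")
    vals = torch.randn(B, K, device="cuda")
    target = torch.randn(B, D, device="cuda")

    ref = hierarchical_sae_loss(indices, weight, vals, bias, target)
    fused = triton_hierarchical_sae_loss(indices, weight, vals, bias, target)
    torch.testing.assert_close(fused, ref, rtol=1e-4, atol=1e-4)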
torch-ext/flex_sae/topk_kernels.py ADDED
@@ -0,0 +1,237 @@
+ # TopK SAE decoder Triton kernels
+ # Copyright 2025 T-Tech
+ # This code is adapted from Facebook Research under the
+ # Creative Commons Attribution-NonCommercial 4.0 International License.
+ # Original code can be found at: https://github.com/facebookresearch/memory
+
+
+ from typing import Tuple
+
+ import torch
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def embedding_bag_forward_kernel(
+     out_ptr,  # [B, D]
+     indices_ptr,  # [B, K]
+     weight_ptr,  # [F, D]
+     vals_ptr,  # [B, K]
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+
+     b = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+
+     # out[b] = sum_k vals[b, k] * weight[indices[b, k]]
+     out_value = tl.zeros([BLOCK_D], dtype=tl.float32)
+     for i in tl.range(K):
+         my_index = tl.load(indices_ptr + b * K + i).to(tl.int64)
+         my_scaling = tl.load(vals_ptr + b * K + i)
+         w_tile = tl.load(weight_ptr + my_index * D + off_d).to(tl.float32)
+         out_value += w_tile * my_scaling
+
+     tl.store(out_ptr + b * D + off_d, out_value)
+
+
+ def embedding_bag_forward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+ ) -> torch.Tensor:
+     B, K = indices.shape
+     D = weight.shape[1]
+
+     trt_out = torch.empty([B, D], dtype=weight.dtype, device=weight.device)
+
+     def _forward_grid(meta):
+         return (B, D // meta["BLOCK_D"])
+
+     embedding_bag_forward_kernel[_forward_grid](
+         trt_out,
+         indices,
+         weight,
+         vals,
+         D=D,
+         K=K,
+         BLOCK_D=64,
+         num_warps=1,
+         num_stages=1,
+     )
+     return trt_out
+
+
+ @triton.jit
+ def count_per_embedding_kernel(
+     count_per_emb_ptr,  # [F + 1]
+     indices_ptr,  # [B, K]
+     K: tl.constexpr,
+ ):
+     # Histogram of hits per feature id, shifted by one slot so that a later
+     # cumulative sum yields the per-feature segment starts directly.
+     batch_id = tl.program_id(axis=0).to(tl.int64)
+     for t in tl.range(K):
+         embedding_id = tl.load(indices_ptr + batch_id * K + t)
+         tl.atomic_add(count_per_emb_ptr + embedding_id + 1, 1, sem="relaxed")
+
+
+ @triton.jit
+ def map_embeddings_and_outputs_kernel(
+     reverse_mapping_ptr,  # [B * K]
+     mapping_write_pos_ptr,  # [F]
+     indices_ptr,  # [B, K]
+     K: tl.constexpr,
+ ):
+     # Scatter each linear slot b * K + t into its feature's segment of
+     # reverse_mapping, advancing that feature's write cursor atomically.
+     batch_id = tl.program_id(axis=0).to(tl.int64)
+     for t in tl.range(K):
+         embedding_id = tl.load(indices_ptr + batch_id * K + t)
+         write_pos = tl.atomic_add(mapping_write_pos_ptr + embedding_id, 1, sem="relaxed")
+         tl.store(reverse_mapping_ptr + write_pos, batch_id * K + t)
+
+
+ @triton.jit
+ def aggregate_gradient_for_embedding_kernel(
+     weight_grad_ptr,  # [F, D]
+     vals_grad_ptr,  # [B, K]
+     weight_ptr,  # [F, D]
+     emb_begin_pos_ptr,  # [F + 1]
+     reverse_mapping_ptr,  # [B * K]
+     vals_ptr,  # [B, K]
+     gradient_ptr,  # [B, D]
+     D: tl.constexpr,
+     K: tl.constexpr,
+     BLOCK_D: tl.constexpr,
+ ):
+     tl.static_assert((D % BLOCK_D) == 0)
+     tl.static_assert((K & (K - 1)) == 0, f"{K=} must be a power of 2")
+     tl.static_assert((BLOCK_D & (BLOCK_D - 1)) == 0, f"{BLOCK_D=} must be a power of 2")
+
+     e = tl.program_id(axis=0).to(tl.int64)
+     pid_d = tl.program_id(axis=1).to(tl.int64)
+
+     off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)
+
+     begin = tl.load(emb_begin_pos_ptr + e)
+     end = tl.load(emb_begin_pos_ptr + e + 1)
+
+     w_row_tile = tl.load(weight_ptr + e * D + off_d).to(tl.float32)
+     w_grad_tile = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+     for idx in tl.range(begin, end):
+         out_linear = tl.load(reverse_mapping_ptr + idx).to(tl.int64)
+         b = out_linear // K
+
+         psw = tl.load(vals_ptr + out_linear)
+         g_tile = tl.load(gradient_ptr + b * D + off_d).to(tl.float32)
+
+         w_grad_tile += psw * g_tile
+
+         psw_grad_partial = tl.sum(g_tile * w_row_tile)
+         tl.atomic_add(vals_grad_ptr + out_linear, psw_grad_partial, sem="relaxed")
+
+     tl.store(weight_grad_ptr + e * D + off_d, w_grad_tile)
+
+
+ def embedding_bag_backward(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     gradient: torch.Tensor,  # [B, D]
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     F, D = weight.shape
+     B, K = indices.shape
+
+     # Counting sort by feature id: histogram, prefix-sum, then scatter the
+     # B * K slots into per-feature segments of reverse_mapping.
+     count_per_emb = torch.zeros((F + 1,), dtype=torch.uint32, device=indices.device)
+     count_per_embedding_kernel[(B,)](count_per_emb, indices, K=K, num_warps=1)
+
+     emb_begin_pos = count_per_emb.cumsum(0)  # [F + 1]
+
+     reverse_mapping = torch.empty([B * K], dtype=torch.uint32, device=indices.device)
+     assert B * K <= 2 ** (reverse_mapping.dtype.itemsize * 8) - 1
+
+     map_embeddings_and_outputs_kernel[(B,)](
+         reverse_mapping_ptr=reverse_mapping,
+         mapping_write_pos_ptr=emb_begin_pos.clone(),  # mutated in-kernel as write cursors
+         indices_ptr=indices,
+         K=K,
+         num_warps=1,
+     )
+
+     weight_grad = torch.empty_like(weight, dtype=torch.float32)  # [F, D]
+     vals_grad = torch.zeros_like(vals, dtype=torch.float32)  # [B, K]
+
+     def _backward_grid(meta):
+         return (F, D // meta["BLOCK_D"])
+
+     aggregate_gradient_for_embedding_kernel[_backward_grid](
+         weight_grad_ptr=weight_grad,
+         vals_grad_ptr=vals_grad,
+         weight_ptr=weight,
+         emb_begin_pos_ptr=emb_begin_pos,
+         reverse_mapping_ptr=reverse_mapping,
+         vals_ptr=vals,
+         gradient_ptr=gradient,
+         D=D,
+         K=K,
+         BLOCK_D=256,
+         num_warps=1,
+         num_stages=2,
+     )
+     return weight_grad, vals_grad
+
+
+ class xFormersEmbeddingBag(torch.autograd.Function):
+     @staticmethod
+     @torch.amp.custom_fwd(device_type="cuda")
+     def forward(
+         ctx,
+         indices: torch.Tensor,  # [B, K]
+         weight: torch.Tensor,  # [F, D]
+         vals: torch.Tensor,  # [B, K]
+     ) -> torch.Tensor:
+         ctx.save_for_backward(indices, weight, vals)
+         return embedding_bag_forward(indices, weight, vals)  # [B, D]
+
+     @staticmethod
+     @torch.amp.custom_bwd(device_type="cuda")
+     def backward(ctx, gradient):
+         indices, weight, vals = ctx.saved_tensors
+         weight_g, vals_g = embedding_bag_backward(
+             indices,
+             weight,
+             vals,
+             gradient,
+         )
+         return None, weight_g, vals_g
+
+
+ def triton_topk_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     bias: torch.Tensor,  # [D]
+     target: torch.Tensor,  # [B, D]
+ ) -> torch.Tensor:
+     recon = bias.to(torch.float32) + xFormersEmbeddingBag.apply(indices, weight, vals)
+     diff = recon.to(torch.float32) - target.to(torch.float32)
+     loss = diff.pow(2).mean()
+     return loss
+
+
+ def topk_sae_loss(
+     indices: torch.Tensor,  # [B, K]
+     weight: torch.Tensor,  # [F, D]
+     vals: torch.Tensor,  # [B, K]
+     bias: torch.Tensor,  # [D]
+     target: torch.Tensor,  # [B, D]
+ ) -> torch.Tensor:
+     # Pure-PyTorch reference implementation.
+     emb = weight[indices].to(torch.float32)  # [B, K, D]
+     recon = bias.to(torch.float32) + (emb * vals.unsqueeze(-1)).sum(dim=1)
+     diff = recon.to(torch.float32) - target.to(torch.float32)
+     loss = diff.pow(2).mean()
+     return loss
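
The TopK backward avoids atomics on `weight_grad` by bucketing the `B * K` (batch, slot) pairs by feature id before aggregation: `count_per_embedding_kernel` histograms hits per feature into a shifted `[F + 1]` buffer, its cumulative sum yields the segment starts `emb_begin_pos`, and `map_embeddings_and_outputs_kernel` scatters each linear slot `b * K + k` into its feature's segment of `reverse_mapping`, so `aggregate_gradient_for_embedding_kernel` can reduce one feature row per program with plain stores. A host-side recreation of that layout, as an illustrative sketch (not part of the upload):

    import torch

    # Toy problem: B=2, K=2, F=3 features.
    indices = torch.tensor([[2, 0], [2, 1]])
    flat = indices.reshape(-1)  # feature id for each linear slot b * K + k
    counts = torch.bincount(flat, minlength=3)  # hits per feature
    emb_begin_pos = torch.cat([torch.zeros(1, dtype=torch.long), counts.cumsum(0)])
    reverse_mapping = torch.argsort(flat, stable=True)  # slots grouped by feature id

    # Feature 2 owns reverse_mapping[emb_begin_pos[2]:emb_begin_pos[3]] == [0, 2],
    # i.e. slots (b=0, k=0) and (b=1, k=0); its gradient row sums vals at those
    # slots times the matching rows of the upstream [B, D] gradient.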