
Commit ff4ec6a

add docs
1 parent fb144b8 commit ff4ec6a

File tree

13 files changed (+272 additions, -18 deletions)

.github/workflows/docs.yml

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
name: Build and Deploy Documentation

on:
  # Runs on pushes to main branch
  push:
    branches: [ main ]
  # Allows manual trigger from Actions tab
  workflow_dispatch:

# Sets permissions for GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[docs]"

      - name: Build documentation
        run: mkdocs build

      - name: Setup Pages
        uses: actions/configure-pages@v5

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: ./site
          retention-days: 1

  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

docs/index.md

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# sparse-transformer-layers documentation

Welcome to the documentation for sparse-transformer-layers.

For basic information, please see the [repository Readme](https://github.com/mawright/sparse-transformer-layers).

This documentation features more detailed usage instructions for all of the Transformer layers in the library.

docs/msdeform_attn.md

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Sparse multi-scale deformable attention

## Overview

This implements a version of Multi-scale Deformable Attention (MSDeformAttention) adapted for sparse tensors.

---

::: blocks.ms_deform_attn.SparseMSDeformableAttentionBlock
    options:
      members:
        - forward
        - reset_parameters
      show_root_heading: true
      show_root_toc_entry: true
      show_root_full_path: false

---

::: layers.sparse_ms_deform_attn.layer.SparseMSDeformableAttention
    options:
      members:
        - forward
        - reset_parameters
      show_root_heading: true
      show_root_toc_entry: true
      show_root_full_path: false

---

## Utilities

::: layers.sparse_ms_deform_attn.utils
    options:
      members:
        - sparse_split_heads
        - multilevel_sparse_bilinear_grid_sample
      show_root_heading: false
      show_root_toc_entry: false
      show_root_full_path: false
      heading_level: 3
docs/neigh_attn.md

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Multi-level sparse neighborhood attention

## Overview

The multi-level sparse neighborhood attention operation lets query points attend to small neighborhoods of nonzero points around their spatial positions, one neighborhood for each feature level.
This is a potentially useful alternative or complement to multi-scale deformable attention, which may try to sample from zero points on sparse tensors. The neighborhood attention operation, in contrast, always attends to all nonzero points within the given neighborhood sizes.

The neighborhood attention implementation uses a custom autograd operator that checkpoints the key and value projections of the neighborhood points and manually computes the backward pass.
This checkpointing is essential for memory management, particularly for operations with many potential query points, such as within a DETR encoder or a DETR decoder with many object queries.
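As a rough sketch of the memory-saving idea, the snippet below uses PyTorch's generic `torch.utils.checkpoint` utility in place of the library's custom operator (which additionally hand-writes the backward pass); the tensor sizes, the function and variable names, and the 164-point neighborhood count (3² + 5² + 7² + 9² with the default sizes) are illustrative placeholders, not the library's API:

```python
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

def gather_and_project(values, neighbor_indices, k_proj, v_proj):
    # Gathers neighborhood features and projects them to keys/values.
    # Under checkpointing, the large gathered/projected tensors are not
    # stored for backward; they are recomputed when gradients are needed.
    gathered = values[neighbor_indices]  # [n_queries, n_neighbors, embed_dim]
    return k_proj(gathered), v_proj(gathered)

embed_dim = 256
k_proj = nn.Linear(embed_dim, embed_dim)
v_proj = nn.Linear(embed_dim, embed_dim)
values = torch.randn(10_000, embed_dim, requires_grad=True)  # nonzero points
neighbor_indices = torch.randint(0, 10_000, (500, 164))      # 164 = 3^2+5^2+7^2+9^2

keys, vals = checkpoint(
    gather_and_project, values, neighbor_indices, k_proj, v_proj,
    use_reentrant=False,
)
```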
---

::: blocks.neighborhood_attn.SparseNeighborhoodAttentionBlock
    options:
      members:
        - forward
        - reset_parameters
      show_root_heading: true
      show_root_toc_entry: true
      show_root_full_path: false

---

::: blocks.neighborhood_attn
    options:
      members:
        - get_multilevel_neighborhoods
      show_root_heading: false
      show_root_toc_entry: false

docs/self_attn.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
# Multi-level sparse self-attention

## Overview

The self-attention implementation is intended for use with `torch.sparse_coo_tensor` multi-level feature maps. It uses [`RoPEEncodingND`](https://mawright.github.io/nd-rotary-encodings/layer/#position_encoding_layer.rope_encoding_layer.RoPEEncodingND) from [nd-rotary-encodings](https://github.com/mawright/nd-rotary-encodings) to encode the positions and feature levels of all input points.
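As a minimal sketch of what such an input might look like, the snippet below builds a hybrid sparse COO tensor with the [batch, height, width, level, channel] layout documented for `stacked_feature_maps` elsewhere in this commit; the nonzero sites and sizes are random placeholders:

```python
import torch

batch_size, height, width, n_levels, embed_dim = 2, 64, 64, 4, 256
nnz = 1_000  # number of nonzero points across the whole batch

# COO indices over the sparse dims (batch, y, x, level): shape [4, nnz]
indices = torch.stack([
    torch.randint(0, batch_size, (nnz,)),
    torch.randint(0, height, (nnz,)),
    torch.randint(0, width, (nnz,)),
    torch.randint(0, n_levels, (nnz,)),
])
features = torch.randn(nnz, embed_dim)  # trailing feature dim stays dense

stacked_feature_maps = torch.sparse_coo_tensor(
    indices, features, size=(batch_size, height, width, n_levels, embed_dim)
).coalesce()
```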
---

::: blocks.self_attn.MultilevelSelfAttentionBlockWithRoPE
    options:
      members:
        - forward
        - reset_parameters
      show_root_heading: true
      show_root_toc_entry: true
      show_root_full_path: false

mkdocs.yml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
site_name: sparse-transformer-layers
theme:
  name: readthedocs
repo_url: https://github.com/mawright/sparse-transformer-layers

plugins:
  - search
  - mkdocstrings:
      handlers:
        python:
          options:
            show_source: false
          paths: [sparse_transformer_layers]

markdown_extensions:
  - toc:
      permalink: true

nav:
  - Home: index.md
  - Self-Attention: self_attn.md
  - Neighborhood Attention: neigh_attn.md
  - MSDeform Attention: msdeform_attn.md
sparse_transformer_layers/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from .blocks import (
    MultilevelSelfAttentionBlockWithRoPE,
    SparseMSDeformableAttentionBlock,
    SparseNeighborhoodAttentionBlock,
)
from .layers import BatchSparseIndexSubsetAttention, SparseMSDeformableAttention

__all__ = [
    "BatchSparseIndexSubsetAttention",
    "SparseMSDeformableAttention",
    "SparseMSDeformableAttentionBlock",
    "SparseNeighborhoodAttentionBlock",
    "MultilevelSelfAttentionBlockWithRoPE",
]
sparse_transformer_layers/blocks/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -1,3 +1,9 @@
+from .ms_deform_attn import SparseMSDeformableAttentionBlock
+from .neighborhood_attn import SparseNeighborhoodAttentionBlock
 from .self_attn import MultilevelSelfAttentionBlockWithRoPE
 
-__all__ = ["MultilevelSelfAttentionBlockWithRoPE"]
+__all__ = [
+    "MultilevelSelfAttentionBlockWithRoPE",
+    "SparseMSDeformableAttentionBlock",
+    "SparseNeighborhoodAttentionBlock",
+]

sparse_transformer_layers/blocks/ms_deform_attn.py

Lines changed: 52 additions & 0 deletions
@@ -7,6 +7,32 @@
 
 
 class SparseMSDeformableAttentionBlock(nn.Module):
+    """A standard transformer block using Sparse Multi-Scale Deformable Attention.
+
+    This module encapsulates the `SparseMSDeformableAttention` layer within a
+    typical transformer block structure. It includes a query input projection,
+    the attention mechanism itself, an output projection with dropout, a residual
+    connection, and layer normalization. The layer normalization can be applied
+    either before (pre-norm) or after (post-norm) the main block operations.
+
+    This block is designed to be a plug-and-play component in a larger transformer
+    architecture that operates on sparse, multi-scale feature maps, such as the
+    encoder or decoder of a Deformable DETR-like model.
+
+    The current version of this module only supports spatially-2D data.
+
+    Args:
+        embed_dim (int): The embedding dimension for the queries and features.
+        n_heads (int): The number of attention heads.
+        n_levels (int): The number of feature levels to sample from.
+        n_points (int): The number of sampling points per head per level.
+        dropout (float): Dropout probability for the output projection. Defaults to 0.0.
+        bias (bool): Whether to include bias terms in the input and output
+            projection layers. Defaults to False.
+        norm_first (bool): If True, applies layer normalization before the attention
+            and projection (pre-norm). If False, applies it after the residual
+            connection (post-norm). Defaults to True.
+    """
     def __init__(
         self,
         embed_dim: int,
@@ -48,6 +74,31 @@ def forward(
         background_embedding: Optional[Tensor] = None,
         query_level_indices: Optional[Tensor] = None,
     ) -> Tensor:
+        """Forward pass for the SparseMSDeformableAttentionBlock.
+
+        Args:
+            query (Tensor): Batch-flattened query tensor of shape [n_query, embed_dim].
+            query_spatial_positions (Tensor): Spatial positions of queries,
+                shape [n_queries, 2]. The positions must be floating-point
+                values scaled to the feature level in which each query resides.
+            query_batch_offsets (Tensor): Tensor of shape [batch_size+1] indicating
+                the start and end indices for each batch item in the flattened `query`.
+            stacked_feature_maps (Tensor): A sparse tensor containing feature maps
+                from all levels, with shape [batch, height, width, levels, embed_dim].
+                The last dimension is dense; the others are sparse.
+            level_spatial_shapes (Tensor): Spatial dimensions (height, width) of each
+                feature level, shape [n_levels, 2].
+            background_embedding (Optional[Tensor]): An embedding to use for sampling
+                points that fall in unspecified regions of the sparse feature maps.
+                Shape [batch, n_levels, embed_dim].
+            query_level_indices (Optional[Tensor]): The level index for each query,
+                shape [n_queries]. If None, queries are assumed to be at the largest
+                feature level.
+
+        Returns:
+            Tensor: The output tensor after the attention block, with the same shape
+                as the input `query`, [n_query, embed_dim].
+        """
         residual = query
         if self.norm_first:
             query = self.norm(query)

@@ -74,6 +125,7 @@ def forward(
         return x
 
     def reset_parameters(self):
+        """Resets the parameters of all submodules."""
         self.norm.reset_parameters()
         self.q_in_proj.reset_parameters()
         self.msdeform_attn.reset_parameters()
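Putting the documented constructor and forward signatures together, usage might look like the following sketch. Treat it as illustrative rather than a verified API call: the positional argument order follows the docstring above, the top-level import matches the package `__init__.py` shown in this commit, and all shapes and values are made-up placeholders.

```python
import torch
from sparse_transformer_layers import SparseMSDeformableAttentionBlock

block = SparseMSDeformableAttentionBlock(
    embed_dim=256, n_heads=8, n_levels=4, n_points=4, dropout=0.1, norm_first=True
)

# Placeholder sparse feature maps: [batch, height, width, levels, embed_dim],
# with the trailing embedding dimension dense.
nnz = 1_000
indices = torch.stack([
    torch.randint(0, 2, (nnz,)),    # batch
    torch.randint(0, 64, (nnz,)),   # y
    torch.randint(0, 64, (nnz,)),   # x
    torch.randint(0, 4, (nnz,)),    # level
])
stacked_feature_maps = torch.sparse_coo_tensor(
    indices, torch.randn(nnz, 256), size=(2, 64, 64, 4, 256)
).coalesce()

n_query = 100
query = torch.randn(n_query, 256)                      # [n_query, embed_dim]
query_spatial_positions = torch.rand(n_query, 2) * 64  # scaled to each query's level
query_batch_offsets = torch.tensor([0, 50, 100])       # [batch_size + 1]
level_spatial_shapes = torch.tensor(
    [[64, 64], [32, 32], [16, 16], [8, 8]]             # [n_levels, 2]
)

out = block(
    query,
    query_spatial_positions,
    query_batch_offsets,
    stacked_feature_maps,
    level_spatial_shapes,
)  # -> [n_query, 256]
```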

sparse_transformer_layers/blocks/neighborhood_attn.py

Lines changed: 15 additions & 15 deletions
@@ -303,6 +303,7 @@ def forward(
         return x
 
     def reset_parameters(self):
+        """Initializes/resets the weights of all submodules."""
         self.norm.reset_parameters()
         self.q_in_proj.reset_parameters()
         self.subset_attn.reset_parameters()
@@ -333,21 +334,20 @@ def get_multilevel_neighborhoods(
             Default: [3, 5, 7, 9].
 
     Returns:
-        Tuple[Tensor, Tensor, Tensor]: A tuple containing:
-            - multilevel_neighborhood_indices: Tensor of shape
-                [n_queries, sum(neighborhood_sizes^position_dim), position_dim]
-                containing the spatial indices of all neighborhood points for each
-                query across all levels.
-            - out_of_bounds_mask: Boolean tensor of shape
-                [n_queries, sum(neighborhood_sizes^position_dim)] that is True at
-                locations in multilevel_neighborhood_indices that are out of bounds;
-                i.e. negative or >= the spatial shape for that level.
-                If some of the computed neighborhood indices for a query are out of
-                bounds of the level's spatial shape, those indices will instead be
-                filled with mask values of -1.
-            - level_indices: Tensor of shape [sum(neighborhood_sizes^position_dim)]
-                mapping each neighborhood position to its corresponding resolution
-                level.
+        multilevel_neighborhood_indices (Tensor): Tensor of shape
+            [n_queries, sum(neighborhood_sizes^position_dim), position_dim]
+            containing the spatial indices of all neighborhood points for each
+            query across all levels.
+        out_of_bounds_mask (Tensor): Boolean tensor of shape
+            [n_queries, sum(neighborhood_sizes^position_dim)] that is True at
+            locations in multilevel_neighborhood_indices that are out of bounds,
+            i.e., negative or >= the spatial shape for that level. Any such
+            out-of-bounds indices are instead filled with mask values of -1.
+        level_indices (Tensor): Tensor of shape [sum(neighborhood_sizes^position_dim)]
+            mapping each neighborhood position to its corresponding resolution
+            level.
 
     Raises:
         ValueError: If input tensors don't have the expected shape or dimensions, or
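To make the documented shapes concrete: with the default 2D neighborhood sizes [3, 5, 7, 9], each query gets 3² + 5² + 7² + 9² = 164 neighborhood positions. The standalone sketch below (not the library function itself) builds the per-level relative offset grids and the matching level indices with those default sizes:

```python
import torch

def neighborhood_offsets(k: int) -> torch.Tensor:
    """Relative 2D offsets of a k x k neighborhood centered on a query."""
    r = torch.arange(k) - k // 2                 # e.g. k=3 -> [-1, 0, 1]
    dy, dx = torch.meshgrid(r, r, indexing="ij")
    return torch.stack([dy, dx], dim=-1).reshape(-1, 2)  # [k*k, 2]

neighborhood_sizes = [3, 5, 7, 9]                # the documented defaults
offsets = torch.cat([neighborhood_offsets(k) for k in neighborhood_sizes])
level_indices = torch.repeat_interleave(
    torch.arange(len(neighborhood_sizes)),
    torch.tensor([k * k for k in neighborhood_sizes]),
)

print(offsets.shape)        # torch.Size([164, 2]): 9 + 25 + 49 + 81 points
print(level_indices.shape)  # torch.Size([164]), matching the documented return
```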
