ecmwf
diff --git a/.github/workflows/inactivity-bot.yml b/.github/workflows/inactivity-bot.yml
Lines changed: 7 additions & 1 deletion
diff --git a/graphs/src/anemoi/graphs/nodes/attributes/area_weights.py b/graphs/src/anemoi/graphs/nodes/attributes/area_weights.py
Lines changed: 1 addition & 1 deletion
diff --git a/models/src/anemoi/models/layers/attention.py b/models/src/anemoi/models/layers/attention.py
Lines changed: 15 additions & 10 deletions
diff --git a/models/src/anemoi/models/layers/block.py b/models/src/anemoi/models/layers/block.py
Lines changed: 19 additions & 2 deletions
diff --git a/models/src/anemoi/models/layers/mapper.py b/models/src/anemoi/models/layers/mapper.py
Lines changed: 42 additions & 0 deletions
diff --git a/models/src/anemoi/models/layers/processor.py b/models/src/anemoi/models/layers/processor.py
Lines changed: 14 additions & 0 deletions
@@ -5,13 +5,19 @@ on:
     - cron: "0 23 * * *"  # every day at 23:00 UTC on the default (main) branch
   workflow_dispatch: # Allows manual trigger
 
+permissions:
+  actions: write
+  contents: write # only for delete-branch option
+  issues: write
+  pull-requests: write
+
 jobs:
   stale:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/stale@v9
         with:
-          repo-token: $
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
 
           # Issue settings
           days-before-issue-stale: 90 # (~3 months)
 
@@ -354,7 +354,7 @@ def compute_latitude_weight(self, latitudes: np.ndarray) -> np.ndarray:
 
 
 class IsolatitudeAreaWeights(BaseLatWeightedAttribute):
-    """Latitude-weighted area weights for rectilinear grids.
+    r"""Latitude-weighted area weights for rectilinear grids.
 
     Attributes
     ----------
 
@@ -57,6 +57,7 @@ def __init__(
         num_heads: int,
         embed_dim: int,
         layer_kernels: DotDict,
+        attn_channels: Optional[int] = None,
         qkv_bias: bool = False,
         qk_norm: bool = False,
         is_causal: bool = False,
@@ -81,7 +82,10 @@ def __init__(
         num_heads : int
             number of heads
         embed_dim : int
-            embedding dimension
+            Input and output embedding dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v projections. If None,
+            defaults to embed_dim.
         qkv_bias : bool, optional
             bias for queries, keys and values, by default False
         qk_norm : bool, optional
@@ -102,16 +106,17 @@ def __init__(
         """
         super().__init__()
 
-        assert (
-            embed_dim % num_heads == 0
-        ), f"Embedding dimension ({embed_dim}) must be divisible by number of heads ({num_heads})"
+        self.attn_channels = embed_dim if attn_channels is None else attn_channels
+        if self.attn_channels <= 0:
+            raise ValueError(f"attn_channels must be > 0, got {self.attn_channels}")
+        if self.attn_channels % num_heads != 0:
+            raise ValueError(f"attn_channels ({self.attn_channels}) must be divisible by number of heads ({num_heads})")
 
         self.attention_implementation = attention_implementation
         self.use_alibi_slopes = use_alibi_slopes
 
         self.num_heads = num_heads
-        self.embed_dim = embed_dim
-        self.head_dim = embed_dim // num_heads  # q k v
+        self.head_dim = self.attn_channels // num_heads  # q k v
         self.window_size = window_size
         self.dropout_p = dropout_p
         self.is_causal = is_causal
@@ -128,11 +133,11 @@ def __init__(
             self.alibi_slopes = None
 
         linear = layer_kernels.Linear
-        self.lin_q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
-        self.lin_k = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
-        self.lin_v = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.lin_q = nn.Linear(embed_dim, self.attn_channels, bias=qkv_bias)
+        self.lin_k = nn.Linear(embed_dim, self.attn_channels, bias=qkv_bias)
+        self.lin_v = nn.Linear(embed_dim, self.attn_channels, bias=qkv_bias)
 
-        self.projection = linear(embed_dim, embed_dim, bias=True)
+        self.projection = linear(self.attn_channels, embed_dim, bias=True)
 
         if self.qk_norm:
             self.q_norm = layer_kernels["QueryNorm"](self.head_dim)
 
@@ -112,6 +112,7 @@ def __init__(
         num_heads: int,
         window_size: Optional[int],
         layer_kernels: DotDict,
+        attn_channels: Optional[int] = None,
         dropout_p: float = 0.0,
         qk_norm: bool = False,
         attention_implementation: str = "flash_attention",
@@ -128,6 +129,7 @@ def __init__(
         self.attention = MultiHeadSelfAttention(
             num_heads=num_heads,
             embed_dim=num_channels,
+            attn_channels=attn_channels,
             window_size=window_size,
             qkv_bias=False,
             is_causal=False,
@@ -186,6 +188,7 @@ def __init__(
         num_heads: int,
         window_size: Optional[int],
         layer_kernels: DotDict,
+        attn_channels: Optional[int] = None,
         dropout_p: float = 0.0,
         qk_norm: bool = False,
         attention_implementation: str = "flash_attention",
@@ -197,6 +200,7 @@ def __init__(
         super().__init__(
             num_channels=num_channels,
             hidden_dim=hidden_dim,
+            attn_channels=attn_channels,
             num_heads=num_heads,
             window_size=window_size,
             layer_kernels=layer_kernels,
@@ -212,6 +216,7 @@ def __init__(
         self.attention = MultiHeadCrossAttention(
             num_heads=num_heads,
             embed_dim=num_channels,
+            attn_channels=attn_channels,
             window_size=window_size,
             qkv_bias=False,
             qk_norm=qk_norm,
@@ -462,6 +467,7 @@ def __init__(
         mlp_implementation: MLPImplementation = "mlp",
         update_src_nodes: bool = False,
         layer_kernels: DotDict,
+        attn_channels: Optional[int] = None,
         graph_attention_backend: str = "triton",
         edge_pre_mlp: bool = False,
         **kwargs,
@@ -474,6 +480,9 @@ def __init__(
             Number of input channels.
         out_channels : int
             Number of output channels.
+        attn_channels : int, optional
+            Internal attention width used for q/k/v and edge projections. If
+            None, defaults to out_channels.
         num_heads : int
             Number of heads
         edge_dim : int
@@ -496,7 +505,15 @@ def __init__(
 
         self.update_src_nodes = update_src_nodes
 
-        self.out_channels_conv = out_channels // num_heads
+        self.attn_channels = out_channels if attn_channels is None else attn_channels
+        if self.attn_channels <= 0:
+            raise ValueError(f"attn_channels must be > 0, got {self.attn_channels}")
+        if self.attn_channels % num_heads != 0:
+            raise ValueError(
+                f"attn_channels ({self.attn_channels}) must be divisible by num_heads ({num_heads}) in {self.__class__.__name__}."
+            )
+
+        self.out_channels_conv = self.attn_channels // num_heads
         self.num_heads = num_heads
         self.qk_norm = qk_norm
 
@@ -508,7 +525,7 @@ def __init__(
         self.lin_self = Linear(in_channels, num_heads * self.out_channels_conv, bias=bias)
         self.lin_edge = Linear(edge_dim, num_heads * self.out_channels_conv)  # , bias=False)
 
-        self.projection = Linear(out_channels, out_channels)
+        self.projection = Linear(self.attn_channels, out_channels)
 
         if self.qk_norm:
             self.q_norm = layer_kernels.QueryNorm(self.out_channels_conv)
 
@@ -150,6 +150,7 @@ def __init__(
         num_heads: int,
         mlp_hidden_ratio: float,
         edge_dim: int,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         mlp_implementation: MLPImplementation = "mlp",
         cpu_offload: bool = False,
@@ -179,6 +180,11 @@ def __init__(
             ratio of mlp hidden dimension to embedding dimension
         edge_dim : int
             Edge feature dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v and edge projections. If
+            None, defaults to the hidden dimension. This allows reducing the
+            number of channels used for the attention computation without
+            changing the width of the surrounding MLPs.
         qk_norm : bool, optional
             Whether to use query and key normalization, default False
         mlp_implementation: MLPImplementation
@@ -213,6 +219,7 @@ def __init__(
             in_channels=hidden_dim,
             hidden_dim=compute_mlp_hidden_dim(hidden_dim, mlp_hidden_ratio),
             out_channels=hidden_dim,
+            attn_channels=attn_channels,
             num_heads=num_heads,
             edge_dim=edge_dim,
             qk_norm=qk_norm,
@@ -507,6 +514,7 @@ def __init__(
         num_heads: int,
         mlp_hidden_ratio: float,
         edge_dim: int,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         mlp_implementation: MLPImplementation = "mlp",
         cpu_offload: bool = False,
@@ -534,6 +542,11 @@ def __init__(
             ratio of mlp hidden dimension to embedding dimension
         edge_dim : int
             Edge feature dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v and edge projections. If
+            None, defaults to the hidden dimension. This allows reducing the
+            number of channels used for the attention computation without
+            changing the width of the surrounding MLPs.
         qk_norm : bool, optional
             Whether to use query and key normalization, default False
         mlp_implementation: MLPImplementation
@@ -561,6 +574,7 @@ def __init__(
             mlp_hidden_ratio=mlp_hidden_ratio,
             edge_dim=edge_dim,
             mlp_implementation=mlp_implementation,
+            attn_channels=attn_channels,
             layer_kernels=layer_kernels,
             shard_strategy=shard_strategy,
             graph_attention_backend=graph_attention_backend,
@@ -629,6 +643,7 @@ def __init__(
         num_heads: int,
         mlp_hidden_ratio: float,
         edge_dim: int,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         mlp_implementation: MLPImplementation = "mlp",
         initialise_data_extractor_zero: bool = False,
@@ -659,6 +674,11 @@ def __init__(
             Ratio of mlp hidden dimension to embedding dimension
         edge_dim : int
             Edge feature dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v and edge projections. If
+            None, defaults to the hidden dimension. This allows reducing the
+            number of channels used for the attention computation without
+            changing the width of the surrounding MLPs.
         qk_norm : bool, optional
             Whether to use query and key normalization, default False
         mlp_implementation: MLPImplementation
@@ -689,6 +709,7 @@ def __init__(
             mlp_hidden_ratio=mlp_hidden_ratio,
             edge_dim=edge_dim,
             mlp_implementation=mlp_implementation,
+            attn_channels=attn_channels,
             layer_kernels=layer_kernels,
             shard_strategy=shard_strategy,
             graph_attention_backend=graph_attention_backend,
@@ -1108,6 +1129,7 @@ def __init__(
         num_chunks: int,
         num_heads: int,
         mlp_hidden_ratio: float,
+        attn_channels: Optional[int] = None,
         window_size: Optional[int] = None,
         dropout_p: float = 0.0,
         qk_norm: bool = False,
@@ -1133,6 +1155,11 @@ def __init__(
             Output channels of the destination node, by default None
         mlp_hidden_ratio: float
             Ratio of mlp hidden dimension to embedding dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v projections. If None,
+            defaults to the hidden dimension. This allows reducing the number
+            of channels used for the attention computation without changing
+            the width of the surrounding MLPs.
         qk_norm: bool, optional
             Normalize query and key, by default False
         dropout_p: float, optional
@@ -1167,6 +1194,7 @@ def __init__(
         self.proc = TransformerMapperBlock(
             num_channels=hidden_dim,
             hidden_dim=compute_mlp_hidden_dim(hidden_dim, mlp_hidden_ratio),
+            attn_channels=attn_channels,
             num_heads=num_heads,
             window_size=window_size,
             layer_kernels=self.layer_factory,
@@ -1256,6 +1284,7 @@ def __init__(
         num_chunks: int,
         num_heads: int,
         mlp_hidden_ratio: float,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         dropout_p: float = 0.0,
         mlp_implementation: MLPImplementation = "mlp",
@@ -1282,6 +1311,11 @@ def __init__(
             Output channels of the destination node, by default None
         mlp_hidden_ratio: float
             Ratio of mlp hidden dimension to embedding dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v projections. If None,
+            defaults to the hidden dimension. This allows reducing the number
+            of channels used for the attention computation without changing
+            the width of the surrounding MLPs.
         qk_norm: bool, optional
             Normalize query and key, by default False
         dropout_p: float, optional
@@ -1313,6 +1347,7 @@ def __init__(
             cpu_offload=cpu_offload,
             num_heads=num_heads,
             mlp_hidden_ratio=mlp_hidden_ratio,
+            attn_channels=attn_channels,
             window_size=window_size,
             dropout_p=dropout_p,
             qk_norm=qk_norm,
@@ -1384,6 +1419,7 @@ def __init__(
         num_chunks: int,
         num_heads: int,
         mlp_hidden_ratio: float,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         dropout_p: float = 0.0,
         mlp_implementation: MLPImplementation = "mlp",
@@ -1410,6 +1446,11 @@ def __init__(
             Output channels of the destination node, by default None
         mlp_hidden_ratio: float
             Ratio of mlp hidden dimension to embedding dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v projections. If None,
+            defaults to the hidden dimension. This allows reducing the number
+            of channels used for the attention computation without changing
+            the width of the surrounding MLPs.
         qk_norm: bool, optional
             Normalize query and key, by default False
         dropout_p: float, optional
@@ -1441,6 +1482,7 @@ def __init__(
             cpu_offload=cpu_offload,
             num_heads=num_heads,
             mlp_hidden_ratio=mlp_hidden_ratio,
+            attn_channels=attn_channels,
             window_size=window_size,
             dropout_p=dropout_p,
             qk_norm=qk_norm,
 
@@ -213,6 +213,7 @@ def __init__(
         num_chunks: int,
         num_heads: int,
         mlp_hidden_ratio: float,
+        attn_channels: Optional[int] = None,
         qk_norm=False,
         dropout_p: float = 0.0,
         attention_implementation: str = "flash_attention",
@@ -238,6 +239,11 @@ def __init__(
             Number of heads in transformer
         mlp_hidden_ratio: float
             Ratio of mlp hidden dimension to embedding dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v projections. If None,
+            defaults to num_channels. This allows reducing the number of
+            channels used for the attention computation without changing the
+            width of the surrounding MLPs.
         qk_norm: bool, optional
             Normalize query and key, by default False
         dropout_p: float, optional
@@ -275,6 +281,7 @@ def __init__(
             TransformerProcessorBlock,
             num_channels=num_channels,
             hidden_dim=compute_mlp_hidden_dim(num_channels, mlp_hidden_ratio),
+            attn_channels=attn_channels,
             num_heads=num_heads,
             qk_norm=qk_norm,
             window_size=window_size,
@@ -423,6 +430,7 @@ def __init__(
         num_heads: int,
         mlp_hidden_ratio: float,
         edge_dim: int,
+        attn_channels: Optional[int] = None,
         qk_norm: bool = False,
         mlp_implementation: MLPImplementation = "mlp",
         cpu_offload: bool = False,
@@ -447,6 +455,11 @@ def __init__(
             Ratio of mlp hidden dimension to embedding dimension
         edge_dim : int
             Edge feature dimension
+        attn_channels : int, optional
+            Internal attention width used for q/k/v and edge projections. If
+            None, defaults to num_channels. This allows reducing the number
+            of channels used for the attention computation without changing
+            the width of the surrounding MLPs.
         qk_norm: bool, optional
             Normalize query and key, by default False
         mlp_implementation: MLPImplementation
@@ -476,6 +489,7 @@ def __init__(
             in_channels=num_channels,
             hidden_dim=compute_mlp_hidden_dim(num_channels, mlp_hidden_ratio),
             out_channels=num_channels,
+            attn_channels=attn_channels,
             num_heads=num_heads,
             layer_kernels=self.layer_factory,
             qk_norm=qk_norm,