Skip to content

Commit 6406b9f

Browse files
zhangtemplarfacebook-github-bot
authored andcommitted
add smollm to torchtune (#2887)
Summary: add SmolLM2 family to torchtune. SmolLM2 is a family of compact language models available in three sizes: 135M, 360M, and 1.7B parameters. Their architecture is the same as LLaMA 3. Reviewed By: byzhang Differential Revision: D78495904 Privacy Context Container: L1305358
1 parent 10c31c0 commit 6406b9f

File tree

3 files changed

+98
-0
lines changed

3 files changed

+98
-0
lines changed

torchtune/models/smol/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from ._component_builders import smollm2

from ._model_builders import (
    smollm2_135m,
    smollm2_360m,
    smollm2_1_7b,
)

# Public API of the SmolLM2 model family: the generic component builder
# plus one pre-configured builder per released model size.
__all__ = [
    "smollm2",
    "smollm2_135m",
    "smollm2_360m",
    "smollm2_1_7b",
]
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
from torchtune.modules import (
7+
TransformerDecoder,
8+
)
9+
from torchtune.models.llama3_2._component_builders import llama3_2
10+
11+
"""
12+
Component builders for SmolLM 2. It is based on LLaMA architecture.
13+
14+
https://huggingface.co/HuggingFaceTB/SmolLM2-135M/
15+
16+
SmolLM2 is a family of compact language models available in three sizes: 135M, 360M,
17+
and 1.7B parameters. They are capable of solving a wide range of tasks while being
18+
lightweight enough to run on-device. More details in our paper: https://arxiv.org/abs/2502.02737v1
19+
"""
20+
21+
22+
def smollm2(
    num_layers: int,
    num_heads: int,
    num_kv_heads: int,
    embed_dim: int,
    intermediate_dim: int,
    max_seq_len: int = 8192,
    vocab_size: int = 49152,
    attn_dropout: float = 0.0,
    rope_base: int = 100000,
    norm_eps: float = 1e-5,
    scale_factor: int = 32,
    tie_word_embeddings: bool = True,
) -> TransformerDecoder:
    """
    Build a SmolLM2 model.

    SmolLM2 shares the Llama 3 architecture, so this builder simply forwards
    every argument to the ``llama3_2`` component builder.

    Args:
        num_layers (int): number of transformer decoder layers.
        num_heads (int): number of query attention heads.
        num_kv_heads (int): number of key/value heads; grouped-query attention
            is used when this is smaller than ``num_heads``.
        embed_dim (int): embedding dimension of the model.
        intermediate_dim (int): hidden dimension of the feed-forward layers.
        max_seq_len (int): maximum sequence length. Default: 8192.
        vocab_size (int): size of the tokenizer vocabulary. Default: 49152.
        attn_dropout (float): dropout probability applied to attention.
            Default: 0.0.
        rope_base (int): base frequency for rotary positional embeddings.
            Default: 100000.
        norm_eps (float): epsilon used in normalization layers. Default: 1e-5.
        scale_factor (int): RoPE scaling factor forwarded to ``llama3_2``.
            Default: 32.
        tie_word_embeddings (bool): whether the output projection shares
            weights with the token embedding. Default: True.

    Returns:
        TransformerDecoder: the instantiated SmolLM2 model.
    """
    return llama3_2(
        vocab_size=vocab_size,
        num_layers=num_layers,
        num_heads=num_heads,
        num_kv_heads=num_kv_heads,
        embed_dim=embed_dim,
        max_seq_len=max_seq_len,
        attn_dropout=attn_dropout,
        rope_base=rope_base,
        intermediate_dim=intermediate_dim,
        norm_eps=norm_eps,
        scale_factor=scale_factor,
        tie_word_embeddings=tie_word_embeddings,
    )
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
from torchtune.models.smol._component_builders import smollm2
8+
9+
from torchtune.modules import TransformerDecoder
10+
11+
"""
12+
https://huggingface.co/HuggingFaceTB/SmolLM2-135M/
13+
14+
SmolLM2 is a family of compact language models available in three sizes: 135M, 360M,
15+
and 1.7B parameters. They are capable of solving a wide range of tasks while being
16+
lightweight enough to run on-device. More details in our paper: https://arxiv.org/abs/2502.02737v1
17+
"""
18+
19+
20+
def smollm2_135m() -> TransformerDecoder:
    """Builder for the SmolLM2 135M model.

    Returns:
        TransformerDecoder: SmolLM2 135M configuration.
    """
    return smollm2(
        num_layers=30,
        num_heads=9,
        num_kv_heads=3,
        embed_dim=576,
        intermediate_dim=1536,
    )
22+
23+
24+
def smollm2_360m() -> TransformerDecoder:
    """Builder for the SmolLM2 360M model.

    Returns:
        TransformerDecoder: SmolLM2 360M configuration.
    """
    return smollm2(
        num_layers=32,
        num_heads=15,
        num_kv_heads=5,
        embed_dim=960,
        intermediate_dim=2560,
    )
26+
27+
28+
def smollm2_1_7b() -> TransformerDecoder:
    """Builder for the SmolLM2 1.7B model.

    Returns:
        TransformerDecoder: SmolLM2 1.7B configuration.
    """
    return smollm2(
        num_layers=24,
        num_heads=32,
        num_kv_heads=32,
        embed_dim=2048,
        intermediate_dim=8192,
        rope_base=130000,
    )

0 commit comments

Comments
 (0)