ServiceNow · tscholak · Nov 13, 2024 · Nov 9, 2024 · Nov 9, 2024 · Nov 10, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -57,12 +57,9 @@ jobs:
             ghcr.io/servicenow/fast-llm
           tags: |
             type=schedule
-            type=ref,event=branch
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=semver,pattern={{major}}
+            type=pep440,pattern={{version}}
             type=sha
-            type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
+            type=raw,value=latest,enable={{is_default_branch}}
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -78,7 +75,6 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: .
-          # push: ${{ github.event_name != 'pull_request' }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}

diff --git a/Dockerfile b/Dockerfile
@@ -7,28 +7,25 @@ RUN apt-get update \
     && rm -rf /var/lib/apt/lists/* \
     && git lfs install
 
-# Add a user for Fast-LLM with sudo privileges for runtime adjustments
-ARG FAST_LLM_USER_ID=1000
-RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
-    && echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
-
-USER fast_llm
+# Set the working directory
 WORKDIR /app
 
-# Environment settings for Python and PATH
-ENV PYTHONPATH=/app:/app/Megatron-LM \
-    PATH=$PATH:/home/fast_llm/.local/bin/
+# Environment settings for Python
+ENV PYTHONPATH=/app:/app/Megatron-LM
 
-# Copy the dependency files and install dependencies
-COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
-COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
+# Copy the dependency files and install dependencies globally
+COPY setup.py setup.cfg pyproject.toml ./
+COPY ./fast_llm/csrc/ fast_llm/csrc/
 RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
 
 # Copy the rest of the code
-COPY --chown=fast_llm ./Megatron-LM Megatron-LM
-COPY --chown=fast_llm ./examples examples
-COPY --chown=fast_llm ./tests tests
-COPY --chown=fast_llm ./tools tools
+COPY ./Megatron-LM Megatron-LM
+COPY ./examples examples
+COPY ./tests tests
+COPY ./tools tools
+
+# Copy the main source code
+COPY --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
 
-# Copy the main source code for Fast-LLM
-COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
+# Ensure the source code files are writable
+RUN chmod -R a+w /app
diff --git a/fast_llm/data/config.py b/fast_llm/data/config.py
@@ -106,7 +106,6 @@ def _validate(self):
 class TokenizerConfig(Config):
     """
     Configuration for the tokenizer.
-    Currently, the tokenizer is only needed for FIM.
     """
 
     format: str = Field(

diff --git a/fast_llm/data/gpt/memmap.py b/fast_llm/data/gpt/memmap.py
@@ -106,7 +106,7 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
         dtype = documents[0].dtype
         num_documents = len(documents)
         lengths = np.array([len(document) for document in documents], dtype=np.int32)
-        pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
+        pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
         prefix.parent.mkdir(parents=True, exist_ok=True)
         with prefix.with_suffix(".idx").open("wb") as stream:
             stream.write(cls._INDEX_HEADER)

diff --git a/fast_llm/data/tokenizer.py b/fast_llm/data/tokenizer.py
@@ -6,7 +6,7 @@
 
 class Tokenizer:
     """
-    A Huggingface (transformers) tokenizer.
+    A wrapper around a Huggingface (transformers) tokenizer.
     """
 
     def __init__(self, config: TokenizerConfig):

diff --git a/fast_llm/tools/cli.py b/fast_llm/tools/cli.py
@@ -15,13 +15,15 @@ def fast_llm(args=None):
     # (Pre-)configure logging
     configure_logging()
     parser = argparse.ArgumentParser(add_help=False)
-    parser.add_argument("subcommand", choices=["train", "convert"])
+    parser.add_argument("subcommand", choices=["train", "convert", "prepare_dataset"])
     parsed, unparsed = parser.parse_known_args(args)
     try:
         if parsed.subcommand == "train":
             from fast_llm.tools.train import CliTrainingConfig as Runnable
         elif parsed.subcommand == "convert":
             from fast_llm.tools.convert import ConversionConfig as Runnable
+        elif parsed.subcommand == "prepare_dataset":
+            from fast_llm.tools.prepare_dataset import PrepareDatasetConfig as Runnable
         else:
             raise RuntimeError("Unknown subcommand")
         Runnable.parse_and_run(unparsed)