Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
7304119
fix GPTMemmapDataset
tscholak Nov 9, 2024
47d453b
fix GPTMemmapDataset
tscholak Nov 9, 2024
bef3a72
add prepare-dataset command
tscholak Nov 10, 2024
0ffc75c
add prepare-dataset command
tscholak Nov 10, 2024
fda6386
add prepare-dataset command
tscholak Nov 10, 2024
acae7d9
add prepare-dataset command
tscholak Nov 10, 2024
eb7da59
add prepare-dataset command
tscholak Nov 10, 2024
b5ed2f0
add prepare-dataset command
tscholak Nov 10, 2024
c8f746a
only push latest tag for commits to main
tscholak Nov 10, 2024
e0f813c
use older generics syntax
tscholak Nov 10, 2024
b88c9d3
remove user and install Fast-LLM globally
tscholak Nov 10, 2024
4df12d9
simplify Dockerfile
tscholak Nov 11, 2024
3737bc0
improvements
tscholak Nov 11, 2024
4b6b195
add docstring
tscholak Nov 11, 2024
52a6f0b
use full imports
tscholak Nov 11, 2024
55b0b88
use full imports
tscholak Nov 11, 2024
1f975d2
use full imports
tscholak Nov 11, 2024
b665e91
don't load tokenizer during validation
tscholak Nov 11, 2024
af1439e
Merge remote-tracking branch 'origin/main' into tscholak/prepare-dataset
tscholak Nov 11, 2024
e51677f
simplify
tscholak Nov 12, 2024
1f447bb
simplify
tscholak Nov 12, 2024
fb50c13
address comments
tscholak Nov 12, 2024
33067c8
address comments
tscholak Nov 12, 2024
dbc221c
address comments
tscholak Nov 12, 2024
a2ae051
address comments
tscholak Nov 12, 2024
81162b3
fixes
jlamypoirier Nov 12, 2024
a134a52
fix
jlamypoirier Nov 12, 2024
fbb011a
No venv
jlamypoirier Nov 12, 2024
4827f49
Faster tests
jlamypoirier Nov 12, 2024
f8c328f
use dtype
tscholak Nov 13, 2024
ded3027
remove unused venv package
tscholak Nov 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,9 @@ jobs:
ghcr.io/servicenow/fast-llm
tags: |
type=schedule
type=ref,event=branch
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=pep440,pattern={{version}}
type=sha
type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
type=raw,value=latest,enable={{is_default_branch}}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
Expand All @@ -78,7 +75,6 @@ jobs:
uses: docker/build-push-action@v6
with:
context: .
# push: ${{ github.event_name != 'pull_request' }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
Expand Down
33 changes: 15 additions & 18 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,25 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/* \
&& git lfs install

# Add a user for Fast-LLM with sudo privileges for runtime adjustments
ARG FAST_LLM_USER_ID=1000
RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
&& echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers

USER fast_llm
# Set the working directory
WORKDIR /app

# Environment settings for Python and PATH
ENV PYTHONPATH=/app:/app/Megatron-LM \
PATH=$PATH:/home/fast_llm/.local/bin/
# Environment settings for Python
ENV PYTHONPATH=/app:/app/Megatron-LM

# Copy the dependency files and install dependencies
COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
# Copy the dependency files and install dependencies globally
COPY setup.py setup.cfg pyproject.toml ./
COPY ./fast_llm/csrc/ fast_llm/csrc/
RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"

# Copy the rest of the code
COPY --chown=fast_llm ./Megatron-LM Megatron-LM
COPY --chown=fast_llm ./examples examples
COPY --chown=fast_llm ./tests tests
COPY --chown=fast_llm ./tools tools
COPY ./Megatron-LM Megatron-LM
COPY ./examples examples
COPY ./tests tests
COPY ./tools tools

# Copy the main source code
COPY --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/

# Copy the main source code for Fast-LLM
COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
# Ensure the source code files are writable
RUN chmod -R a+w /app
1 change: 0 additions & 1 deletion fast_llm/data/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def _validate(self):
class TokenizerConfig(Config):
"""
Configuration for the tokenizer.
Currently, the tokenizer is only needed for FIM.
"""

format: str = Field(
Expand Down
2 changes: 1 addition & 1 deletion fast_llm/data/gpt/memmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
dtype = documents[0].dtype
num_documents = len(documents)
lengths = np.array([len(document) for document in documents], dtype=np.int32)
pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
prefix.parent.mkdir(parents=True, exist_ok=True)
with prefix.with_suffix(".idx").open("wb") as stream:
stream.write(cls._INDEX_HEADER)
Expand Down
2 changes: 1 addition & 1 deletion fast_llm/data/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class Tokenizer:
"""
A Huggingface (transformers) tokenizer.
A wrapper around a Huggingface (transformers) tokenizer.
"""

def __init__(self, config: TokenizerConfig):
Expand Down
4 changes: 3 additions & 1 deletion fast_llm/tools/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ def fast_llm(args=None):
# (Pre-)configure logging
configure_logging()
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("subcommand", choices=["train", "convert"])
parser.add_argument("subcommand", choices=["train", "convert", "prepare_dataset"])
parsed, unparsed = parser.parse_known_args(args)
try:
if parsed.subcommand == "train":
from fast_llm.tools.train import CliTrainingConfig as Runnable
elif parsed.subcommand == "convert":
from fast_llm.tools.convert import ConversionConfig as Runnable
elif parsed.subcommand == "prepare_dataset":
from fast_llm.tools.prepare_dataset import PrepareDatasetConfig as Runnable
else:
raise RuntimeError("Unknown subcommand")
Runnable.parse_and_run(unparsed)
Expand Down
Loading