From 96dd72ac9af6efd26cbb2e682ac8efb8e0d637ff Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 9 Jul 2021 11:53:09 -0500
Subject: [PATCH 01/10] Add boston housing example

---
 boston_housing/.gitignore                     |   6 +
 boston_housing/README.md                      | 216 ++++++++++++++++++
 boston_housing/mlcube/.mlcube.yaml            |  29 +++
 boston_housing/mlcube/platforms/docker.yaml   |  15 ++
 .../mlcube/workspace/parameters.yaml          |   1 +
 boston_housing/project/01_download_dataset.py |  34 +++
 .../project/02_preprocess_dataset.py          |  39 ++++
 boston_housing/project/03_train.py            |  46 ++++
 boston_housing/project/Dockerfile             |  13 ++
 boston_housing/project/mlcube.py              |  73 ++++++
 boston_housing/project/requirements.txt       |   5 +
 boston_housing/project/run_and_time.sh        |  12 +
 12 files changed, 489 insertions(+)
 create mode 100644 boston_housing/.gitignore
 create mode 100644 boston_housing/README.md
 create mode 100644 boston_housing/mlcube/.mlcube.yaml
 create mode 100644 boston_housing/mlcube/platforms/docker.yaml
 create mode 100644 boston_housing/mlcube/workspace/parameters.yaml
 create mode 100644 boston_housing/project/01_download_dataset.py
 create mode 100644 boston_housing/project/02_preprocess_dataset.py
 create mode 100644 boston_housing/project/03_train.py
 create mode 100644 boston_housing/project/Dockerfile
 create mode 100644 boston_housing/project/mlcube.py
 create mode 100644 boston_housing/project/requirements.txt
 create mode 100644 boston_housing/project/run_and_time.sh

diff --git a/boston_housing/.gitignore b/boston_housing/.gitignore
new file mode 100644
index 0000000..200f99b
--- /dev/null
+++ b/boston_housing/.gitignore
@@ -0,0 +1,6 @@
+project/raw_dataset.txt
+project/processed_dataset.csv
+mlcube/workspace/data
+mlcube/run
+mlcube/tasks
+mlcube/mlcube.yaml
\ No newline at end of file
diff --git a/boston_housing/README.md b/boston_housing/README.md
new file mode 100644
index 0000000..d7ee9c3
--- /dev/null
+++ b/boston_housing/README.md
@@ -0,0 +1,216 @@
+# Packing an existing projecto into MLCUbe

In this tutorial we're going to use the [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html). We'll take an existing implementation, create the needed files to pack it into MLCube and execute all tasks.


## Original project code

At first we have only 4 files, one for package dependencies and 3 scripts for each task: download data, preprocess data and train.

```
├── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    └── requirements.txt
```

The most important thing that we need to remember about these scripts is their input parameters:

* 01_download_dataset.py

**--data_dir** : Dataset download path, inside this folder path a new file called raw_dataset.txt will be created.

* 02_preprocess_dataset.py

**--data_dir** : Folder path containing raw dataset file, when finished a new file called processed_dataset.csv will be created.

* 03_train.py

**--dataset_file_path** : Processed dataset file path. Note: this is the full path to the csv file.
**--n_estimators** : Number of boosting stages to perform. In this case we're using a gradient boosting regressor.


## MLCube structure

We'll need some files for MLCube; first we'll need to create a folder called **mlcube** in the same path as the project folder.
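If you were laying this structure out by hand, the skeleton could be created with something like the following sketch (the file names are the ones used throughout this tutorial):

```
mkdir -p mlcube/platforms mlcube/workspace
touch mlcube/.mlcube.yaml mlcube/platforms/docker.yaml mlcube/workspace/parameters.yaml
```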
We'll need to create the following structure (for this tutorial the files are already in place, but some of them are empty for you to define their content):

```
├── mlcube
│   ├── .mlcube.yaml
│   ├── platforms
│   │   └── docker.yaml
│   └── workspace
│       └── parameters.yaml
└── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    └── requirements.txt
```

In the following steps we'll describe each file.

## Define tasks execution scripts

In general, we'll have a script for each task, and there are different ways to describe their execution from a main handler file. In this tutorial we'll use a function from the Python subprocess module:

* subprocess.Popen()

When we don't have input parameters for a Python script (or maybe just one) we can describe the execution of that script from Python code as follows:

```Python
import subprocess
# Set the full command as a variable
command = "python my_task.py --single_parameter input"
# Split the command, this will give us the list:
# ['python', 'my_task.py', '--single_parameter', 'input']
splitted_command = command.split()
# Execute the command as a new process
process = subprocess.Popen(splitted_command, cwd=".")
# Wait for the process to finish
process.wait()
```

### MLCube File: mlcube/workspace/parameters.yaml

When we have a script with multiple input parameters, it is hard to store the full command to execute it in a single variable. In this case we can create a shell script describing all the arguments and even add some extra functionality. This will be useful since we can define the input parameters as environment variables.

We can use the **mlcube/workspace/parameters.yaml** file to describe all the input parameters we'll use (this file is already provided, please take a look and study its content). The idea is to describe all the parameters in this file and then use this single file as an input for the task. Then we can read the content of the parameters file in Python and set all the parameters as environment variables. Finally, with the environment variables set, we can execute a shell script with our implementation.

The way we execute all these steps in Python is described below.

```Python
import os
import subprocess
import yaml
# Read the file and store the parameters in a variable
with open(parameters_file, 'r') as stream:
    parameters = yaml.safe_load(stream)
# Get the system's environment
env = os.environ.copy()
# We can add a single new environment variable as follows
env.update({
    'NEW_ENV_VARIABLE': "my_new_env_variable",
})
# Add all the parameters we got from the parameters file
env.update(parameters)
# Execute the shell script with the updated environment
process = subprocess.Popen("./run_and_time.sh", cwd=".", env=env)
# Wait for the process to finish
process.wait()
```

### Shell script

In this tutorial we already have a shell script containing the steps to run the train task, the file is: **project/run_and_time.sh**, please take a look and study its content.

### MLCube handler Python file

At this point we know how to execute the task scripts from Python code; now we can create a file that contains the definition of how to run each task.

This file will be located in **project/mlcube.py**; this is the main file that will serve as the entrypoint to run all tasks.

This file is already provided, please take a look and study its content.
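Before reading the full file, here is a condensed sketch of the pattern it uses (one typer command per task, each one shelling out to the corresponding task script; the complete version in **project/mlcube.py** covers all three tasks):

```Python
import subprocess
import typer

app = typer.Typer()

@app.command("download_data")
def download_data(data_dir: str = typer.Option(..., '--data_dir')):
    # Each MLCube task maps to one CLI command that runs the matching script
    command = f"python 01_download_dataset.py --data_dir {data_dir}"
    process = subprocess.Popen(command.split(), cwd=".")
    process.wait()

if __name__ == '__main__':
    app()
```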
## Dockerize the project

We'll create a Dockerfile with the needed steps to run the project; at the end we'll need to define the execution of the **mlcube.py** file as the entrypoint. This file will be located in **project/Dockerfile**.

This file is already provided, please take a look and study its content.

When creating the docker image, we'll need to run the docker build command inside the project folder. The command that we'll use is:

`docker build . -t mlcommons/boston_housing:0.0.1 -f Dockerfile`

Keep in mind the tag that we just used.

At this point our solution folder structure should look like this:

```
├── mlcube
│   ├── .mlcube.yaml
│   ├── platforms
│   │   └── docker.yaml
│   └── workspace
│       └── parameters.yaml
└── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    ├── Dockerfile
    ├── mlcube.py
    ├── requirements.txt
    └── run_and_time.sh
```


### Define MLCube files

Inside the mlcube folder we'll need to define the following files.

### mlcube/platforms/docker.yaml

This file contains the description of the platform that we'll use to run MLCube; in this case it is Docker. In the container definition we'll have the following subfields:

* command: Main command to run, in this case it is docker
* run_args: In this field we'll define all the arguments to run the docker container, e.g. --rm, --gpus, etc.
* image: Image to use, in this case we'll need to use the same image tag from the docker build command.

This file is already provided, please take a look and study its content.

### MLCube task definition file

The file located in **mlcube/.mlcube.yaml** contains the definition of all the tasks and their parameters.

This file is already provided, please take a look and study its content.

With this file we have finished the packing of the project into MLCube! Now we can setup the project and run all the tasks.


### Project setup
```Python
# Create Python environment
virtualenv -p python3 ./env && source ./env/bin/activate

# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`)
git clone https://github.com/mlcommons/mlcube && cd ./mlcube
cd ./mlcube && python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd ..
cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../..
python3 -m pip install tornado

# Fetch the boston housing example from GitHub
git clone https://github.com/mlcommons/mlcube_examples && cd ./mlcube_examples
git fetch origin pull/27/head:feature/boston_housing && git checkout feature/boston_housing
cd ./boston_housing/project

# Build MLCube docker image.
docker build . -t mlcommons/boston_housing:0.0.1 -f Dockerfile

# Show tasks implemented in this MLCube.
cd ../mlcube && mlcube describe
```

### Dataset

The [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html) will be downloaded and processed. Sizes of the dataset in each step:

| Dataset Step                   | MLCube Task       | Format   | Size   |
|--------------------------------|-------------------|----------|--------|
| Download (Compressed dataset)  | download_data     | txt file | ~52 KB |
| Preprocess (Processed dataset) | preprocess_data   | csv file | ~40 KB |
| Total                          | (After all tasks) | All      | ~92 KB |

### Tasks execution
```
# Download Boston housing dataset.
Default path = /workspace/data +# To override it, use --data_dir=DATA_DIR +mlcube run --task download_data --platform docker + +# Preprocess Boston housing dataset, this will convert raw .txt data to .csv format +# It will use the DATA_DIR path defined in the previous step +mlcube run --task preprocess_data --platform docker + +# Run training. +# Parameters to override: --dataset_file_path=DATASET_FILE_PATH --parameters_file=PATH_TO_TRAINING_PARAMS +mlcube run --task train --platform docker +``` \ No newline at end of file diff --git a/boston_housing/mlcube/.mlcube.yaml b/boston_housing/mlcube/.mlcube.yaml new file mode 100644 index 0000000..c2724e5 --- /dev/null +++ b/boston_housing/mlcube/.mlcube.yaml @@ -0,0 +1,29 @@ +name: MLCommons Boston Housing +author: MLCommons Best Practices Working Group + +tasks: + # Download boston housing dataset + download_data: + parameters: + # Directory where dataset will be saved. + - {name: data_dir, type: directory, io: output} + tasks: + download_data: {data_dir: $WORKSPACE/data} + # Preprocess dataset + preprocess_data: + parameters: + # Same directory location where dataset was downloaded + - {name: data_dir, type: directory, io: output} + tasks: + preprocess_data: {data_dir: $WORKSPACE/data} + # Train gradient boosting regressor model + train: + parameters: + # Processed dataset file + - {name: dataset_file_path, type: file, io: input} + # Yaml file with training parameters. + - {name: parameters_file, type: file, io: input} + tasks: + train: + dataset_file_path: $WORKSPACE/data/processed_dataset.csv + parameters_file: $WORKSPACE/parameters.yaml \ No newline at end of file diff --git a/boston_housing/mlcube/platforms/docker.yaml b/boston_housing/mlcube/platforms/docker.yaml new file mode 100644 index 0000000..f61fd6f --- /dev/null +++ b/boston_housing/mlcube/platforms/docker.yaml @@ -0,0 +1,15 @@ +schema_type: mlcube_platform +schema_version: 0.1.0 + +platform: + name: "docker" + version: ">=18.01" + +container: + command: docker + run_args: >- + --rm --net=host --uts=host --ipc=host + --ulimit stack=67108864 --ulimit memlock=-1 + --privileged=true --security-opt seccomp=unconfined + -v /dev/shm:/dev/shm + image: mlcommons/boston_housing:0.0.1 \ No newline at end of file diff --git a/boston_housing/mlcube/workspace/parameters.yaml b/boston_housing/mlcube/workspace/parameters.yaml new file mode 100644 index 0000000..4d9e545 --- /dev/null +++ b/boston_housing/mlcube/workspace/parameters.yaml @@ -0,0 +1 @@ +N_ESTIMATORS: "500" \ No newline at end of file diff --git a/boston_housing/project/01_download_dataset.py b/boston_housing/project/01_download_dataset.py new file mode 100644 index 0000000..de66bf9 --- /dev/null +++ b/boston_housing/project/01_download_dataset.py @@ -0,0 +1,34 @@ +"""Download the raw Boston Housing Dataset""" +import os +import argparse +import requests + +DATASET_URL = "http://lib.stat.cmu.edu/datasets/boston" + + +def download_dataset(data_dir): + """Download dataset and store it in a given path. 
+ Args: + data_dir (str): Dataset download path.""" + + request = requests.get(DATASET_URL) + file_name = "raw_dataset.txt" + file_path = os.path.join(data_dir, file_name) + with open(file_path,'wb') as f: + f.write(request.content) + print(f"\nRaw dataset saved at: {file_path}") + + +def main(): + + parser = argparse.ArgumentParser(description='Download dataset') + parser.add_argument('--data_dir', required=True, + help='Dataset download path') + args = parser.parse_args() + + data_dir = args.data_dir + download_dataset(data_dir) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/boston_housing/project/02_preprocess_dataset.py b/boston_housing/project/02_preprocess_dataset.py new file mode 100644 index 0000000..6306509 --- /dev/null +++ b/boston_housing/project/02_preprocess_dataset.py @@ -0,0 +1,39 @@ +"""Preprocess the dataset and save in CSV format""" +import os +import argparse +import pandas as pd + +def process_data(data_dir): + """Process raw dataset and save it in CSV format. + Args: + data_dir (str): Folder path containing dataset.""" + + col_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "PRICE"] + raw_file = os.path.join(data_dir, "raw_dataset.txt") + print(f"\nProcessing raw file: {raw_file}") + + df = pd.read_csv(raw_file, skiprows=22, header=None, delim_whitespace=True) + df_even=df[df.index%2==0].reset_index(drop=True) + df_odd=df[df.index%2==1].iloc[: , :3].reset_index(drop=True) + df_odd.columns = [11,12,13] + dataset = df_even.join(df_odd) + dataset.columns = col_names + + output_file = os.path.join(data_dir, "processed_dataset.csv") + dataset.to_csv(output_file, index=False) + print(f"Processed dataset saved at: {output_file}") + + +def main(): + + parser = argparse.ArgumentParser(description='Preprocess dataset') + parser.add_argument('--data_dir', required=True, + help='Folder containing dataset file') + args = parser.parse_args() + + data_dir = args.data_dir + process_data(data_dir) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/boston_housing/project/03_train.py b/boston_housing/project/03_train.py new file mode 100644 index 0000000..3609f43 --- /dev/null +++ b/boston_housing/project/03_train.py @@ -0,0 +1,46 @@ +"""Train gradient boosting regressor on Boston housing dataset""" +import os +import argparse +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +from sklearn.ensemble import GradientBoostingRegressor + + +def train(dataset_file_path, n_estimators): + df = pd.read_csv(dataset_file_path) + + data = df.drop(['PRICE'], axis=1) + target = df[['PRICE']] + X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size = 0.25) + + clf = GradientBoostingRegressor(n_estimators=n_estimators, verbose = 1) + clf.fit(X_train, Y_train.values.ravel()) + + train_predicted = clf.predict(X_train) + train_expected = Y_train + train_rmse = mean_squared_error(train_predicted, train_expected, squared=False) + + test_predicted = clf.predict(X_test) + test_expected = Y_test + test_rmse = mean_squared_error(test_predicted, test_expected, squared=False) + + print(f"\nTRAIN RMSE:\t{train_rmse}") + print(f"TEST RMSE:\t{test_rmse}") + +def main(): + + parser = argparse.ArgumentParser(description='Train model') + parser.add_argument('--dataset_file_path', required=True, + help='Processed dataset file path') + parser.add_argument('--n_estimators', type=int, default=100, + 
help='number of boosting stages to perform')
+    args = parser.parse_args()
+
+    dataset_file_path = args.dataset_file_path
+    n_estimators = args.n_estimators
+    train(dataset_file_path, n_estimators)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/boston_housing/project/Dockerfile b/boston_housing/project/Dockerfile
new file mode 100644
index 0000000..cf6f369
--- /dev/null
+++ b/boston_housing/project/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.6-slim-buster
+
+#RUN sudo apt update && sudo apt search vim && sudo apt install vim
+
+WORKDIR /workspace/boston_housing
+
+COPY . .
+
+RUN chmod +x ./run_and_time.sh
+
+RUN pip install --no-cache --disable-pip-version-check --default-timeout=1000 -U -r requirements.txt
+
+ENTRYPOINT ["python", "/workspace/boston_housing/mlcube.py"]
\ No newline at end of file
diff --git a/boston_housing/project/mlcube.py b/boston_housing/project/mlcube.py
new file mode 100644
index 0000000..590ad23
--- /dev/null
+++ b/boston_housing/project/mlcube.py
@@ -0,0 +1,73 @@
+"""MLCube handler file"""
+import os
+import yaml
+import typer
+import shutil
+import subprocess
+from pathlib import Path
+
+
+app = typer.Typer()
+
+class DownloadDataTask(object):
+    """Download task class.
+    Builds the command for the download script with the given
+    data directory, then executes it as a subprocess."""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        command = f"python 01_download_dataset.py --data_dir {data_dir}"
+        splitted_command = command.split()
+        process = subprocess.Popen(splitted_command, cwd=".")
+        process.wait()
+
+class PreprocessDataTask(object):
+    """Preprocess dataset task class.
+    Builds the command for the preprocess script with the given
+    data directory, then executes it as a subprocess."""
+    @staticmethod
+    def run(data_dir: str) -> None:
+
+        command = f"python 02_preprocess_dataset.py --data_dir {data_dir}"
+        splitted_command = command.split()
+        process = subprocess.Popen(splitted_command, cwd=".")
+        process.wait()
+
+class TrainTask(object):
+    """Train task class.
+    It defines the environment variables:
+        DATASET_FILE_PATH: Processed dataset file path
+        All other parameters are defined in the parameters_file
+    Then executes the benchmark shell script."""
+    @staticmethod
+    def run(dataset_file_path: str, parameters_file: str) -> None:
+        with open(parameters_file, 'r') as stream:
+            parameters = yaml.safe_load(stream)
+
+        env = os.environ.copy()
+        env.update({
+            'DATASET_FILE_PATH': dataset_file_path,
+        })
+
+        env.update(parameters)
+
+        process = subprocess.Popen("./run_and_time.sh", cwd=".", env=env)
+        process.wait()
+
+@app.command("download_data")
+def download_data(data_dir: str = typer.Option(..., '--data_dir')):
+    DownloadDataTask.run(data_dir)
+
+@app.command("preprocess_data")
+def preprocess_data(data_dir: str = typer.Option(..., '--data_dir')):
+    PreprocessDataTask.run(data_dir)
+
+@app.command("train")
+def train(dataset_file_path: str = typer.Option(..., '--dataset_file_path'),
+          parameters_file: str = typer.Option(..., '--parameters_file')):
+    TrainTask.run(dataset_file_path, parameters_file)
+
+if __name__ == '__main__':
+    app()
\ No newline at end of file
diff --git a/boston_housing/project/requirements.txt b/boston_housing/project/requirements.txt
new file mode 100644
index 0000000..2db03ef
--- /dev/null
+++ b/boston_housing/project/requirements.txt
@@ -0,0 +1,5 @@
+requests==2.24.0
+pandas==1.1.3
+scikit-learn==0.23.2
+typer==0.3.2
+PyYAML==5.4.1
\ No newline at end of file
diff --git
a/boston_housing/project/run_and_time.sh b/boston_housing/project/run_and_time.sh
new file mode 100644
index 0000000..762e758
--- /dev/null
+++ b/boston_housing/project/run_and_time.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+
+: ${DATASET_FILE_PATH:=${1:-"./processed_dataset.csv"}}
+: ${N_ESTIMATORS:=${2:-"100"}}
+
+ARGS="--dataset_file_path=$DATASET_FILE_PATH"
+ARGS+=" --n_estimators $N_ESTIMATORS"
+
+# Execute command and time it
+time python 03_train.py ${ARGS}
\ No newline at end of file

From b11325df110306fdb103cda869182710d4859cb7 Mon Sep 17 00:00:00 2001
From: davidjurado
Date: Fri, 16 Jul 2021 11:41:32 -0500
Subject: [PATCH 02/10] Update README.md

---
 boston_housing/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/boston_housing/README.md b/boston_housing/README.md
index d7ee9c3..abd7296 100644
--- a/boston_housing/README.md
+++ b/boston_housing/README.md
@@ -1,4 +1,4 @@
-# Packing an existing projecto into MLCUbe
+# Packing an existing project into MLCube

In this tutorial we're going to use the [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html). We'll take an existing implementation, create the needed files to pack it into MLCube and execute all tasks.
@@ -213,4 +213,4 @@ mlcube run --task preprocess_data --platform docker
# Run training.
# Parameters to override: --dataset_file_path=DATASET_FILE_PATH --parameters_file=PATH_TO_TRAINING_PARAMS
mlcube run --task train --platform docker
-```
\ No newline at end of file
+```

From d74420117a531c284a9c995d1c50b9daa65058fd Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Mon, 19 Jul 2021 17:45:46 -0500
Subject: [PATCH 03/10] Update to config 2.0

---
 boston_housing/.gitignore                   |  3 +-
 boston_housing/README.md                    | 55 +++++++++-------
 boston_housing/mlcube/.mlcube.yaml          | 29 ---------
 boston_housing/mlcube/mlcube.yaml           | 34 ++++++++++
 boston_housing/mlcube/mlcube_cli.py         | 70 +++++++++++++++++++++
 boston_housing/mlcube/platforms/docker.yaml | 15 -----
 6 files changed, 136 insertions(+), 70 deletions(-)
 delete mode 100644 boston_housing/mlcube/.mlcube.yaml
 create mode 100644 boston_housing/mlcube/mlcube.yaml
 create mode 100644 boston_housing/mlcube/mlcube_cli.py
 delete mode 100644 boston_housing/mlcube/platforms/docker.yaml

diff --git a/boston_housing/.gitignore b/boston_housing/.gitignore
index 200f99b..bb2e484 100644
--- a/boston_housing/.gitignore
+++ b/boston_housing/.gitignore
@@ -2,5 +2,4 @@ project/raw_dataset.txt
project/processed_dataset.csv
mlcube/workspace/data
mlcube/run
-mlcube/tasks
-mlcube/mlcube.yaml
\ No newline at end of file
+mlcube/tasks
\ No newline at end of file
diff --git a/boston_housing/README.md b/boston_housing/README.md
index abd7296..086bd0f 100644
--- a/boston_housing/README.md
+++ b/boston_housing/README.md
@@ -33,13 +33,12 @@ The most important thing that we need to remember about these scripts are the in

## MLCube structure

-We'll need some files for MLCube; first we'll need to create a folder called **mlcube** in the same path as the project folder. We'll need to create the following structure (for this tutorial the files are already in place, but some of them are empty for you to define their content):
+We'll need a couple of files for MLCube; first we'll need to create a folder called **mlcube** in the same path as the project folder.
We'll need to create the following structure (for this tutorial the files are already in place):

```
├── mlcube
-│   ├── .mlcube.yaml
-│   ├── platforms
-│   │   └── docker.yaml
+│   ├── mlcube.yaml
+│   ├── mlcube_cli.py
│   └── workspace
│       └── parameters.yaml
└── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    └── requirements.txt
```

In this tutorial we already have a shell script containing the steps to run the train task, the file is: **project/run_and_time.sh**, please take a look and study its content.

-### MLCube handler Python file
+### MLCube Python CLI file
+
+The **mlcube/mlcube_cli.py** file simulates MLCube CLI. It is temporary stored here, and is part of MLCube library. The only command avaibale to execute is `run`, and the possible arguments are:
+
+    --mlcube TEXT     Path to MLCube directory, default is current.
+    --platform TEXT   Platform to run MLCube, default is docker/podman.
+    --task TEXT       MLCube task name to run, default is `main`.
+    --workspace TEXT  Workspace path, default is `workspace` within MLCube folder
+
+Example:
+
+```
+python mlcube_cli.py run --mlcube ./ --task train --platform docker
+```
+
+### MLCube Python entrypoint file

At this point we know how to execute the task scripts from Python code; now we can create a file that contains the definition of how to run each task.

At this point our solution folder structure should look like this:

```
├── mlcube
-│   ├── .mlcube.yaml
-│   ├── platforms
-│   │   └── docker.yaml
+│   ├── mlcube.yaml
+│   ├── mlcube_cli.py
│   └── workspace
│       └── parameters.yaml
└── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    ├── Dockerfile
    ├── mlcube.py
    ├── requirements.txt
    └── run_and_time.sh
```

### MLCube task definition file

-The file located in **mlcube/.mlcube.yaml** contains the definition of all the tasks and their parameters.
+The file located in **mlcube/mlcube.yaml** contains the definition of all the tasks and their parameters.

With this file we have finished the packing of the project into MLCube! Now we can setup the project and run all the tasks.

### Project setup
```Python
# Create Python environment
virtualenv -p python3 ./env && source ./env/bin/activate
-
-# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`)
-git clone https://github.com/mlcommons/mlcube && cd ./mlcube
-cd ./mlcube && python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd ..
-cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../..
-python3 -m pip install tornado
+# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`)
+git clone https://github.com/sergey-serebryakov/mlbox.git && cd mlbox && git checkout feature/configV2
+cd ./runners/mlcube_docker && export PYTHONPATH=$(pwd)
+cd ../../ && pip install -r mlcube/requirements.txt && pip install omegaconf && cd ../

# Fetch the boston housing example from GitHub
git clone https://github.com/mlcommons/mlcube_examples && cd ./mlcube_examples
git fetch origin pull/27/head:feature/boston_housing && git checkout feature/boston_housing
-cd ./boston_housing/project
-
-# Build MLCube docker image.
-docker build . -t mlcommons/boston_housing:0.0.1 -f Dockerfile
-
-# Show tasks implemented in this MLCube.
-cd ../mlcube && mlcube describe
+cd ./boston_housing/mlcube
```

### Dataset

The [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html) will be downloaded and processed. Sizes of the dataset in each step:

| Dataset Step                   | MLCube Task       | Format   | Size   |
|--------------------------------|-------------------|----------|--------|
| Download (Compressed dataset)  | download_data     | txt file | ~52 KB |
| Preprocess (Processed dataset) | preprocess_data   | csv file | ~40 KB |
| Total                          | (After all tasks) | All      | ~92 KB |

### Tasks execution
```
# Download Boston housing dataset.
Default path = /workspace/data # To override it, use --data_dir=DATA_DIR -mlcube run --task download_data --platform docker +python mlcube_cli.py run --task download_data # Preprocess Boston housing dataset, this will convert raw .txt data to .csv format # It will use the DATA_DIR path defined in the previous step -mlcube run --task preprocess_data --platform docker +python mlcube_cli.py run --task preprocess_data # Run training. # Parameters to override: --dataset_file_path=DATASET_FILE_PATH --parameters_file=PATH_TO_TRAINING_PARAMS -mlcube run --task train --platform docker +python mlcube_cli.py run --task train ``` diff --git a/boston_housing/mlcube/.mlcube.yaml b/boston_housing/mlcube/.mlcube.yaml deleted file mode 100644 index c2724e5..0000000 --- a/boston_housing/mlcube/.mlcube.yaml +++ /dev/null @@ -1,29 +0,0 @@ -name: MLCommons Boston Housing -author: MLCommons Best Practices Working Group - -tasks: - # Download boston housing dataset - download_data: - parameters: - # Directory where dataset will be saved. - - {name: data_dir, type: directory, io: output} - tasks: - download_data: {data_dir: $WORKSPACE/data} - # Preprocess dataset - preprocess_data: - parameters: - # Same directory location where dataset was downloaded - - {name: data_dir, type: directory, io: output} - tasks: - preprocess_data: {data_dir: $WORKSPACE/data} - # Train gradient boosting regressor model - train: - parameters: - # Processed dataset file - - {name: dataset_file_path, type: file, io: input} - # Yaml file with training parameters. - - {name: parameters_file, type: file, io: input} - tasks: - train: - dataset_file_path: $WORKSPACE/data/processed_dataset.csv - parameters_file: $WORKSPACE/parameters.yaml \ No newline at end of file diff --git a/boston_housing/mlcube/mlcube.yaml b/boston_housing/mlcube/mlcube.yaml new file mode 100644 index 0000000..8c6843f --- /dev/null +++ b/boston_housing/mlcube/mlcube.yaml @@ -0,0 +1,34 @@ +name: MLCommons Boston Housing +description: MLCommons Boston Housing example +authors: + - {name: "MLCommons Best Practices Working Group"} + +platform: + accelerator_count: 0 + +container: + # Image name. + image: mlcommons/boston_housing:0.0.1 + # Docker build context relative to $MLCUBE_ROOT. Default is `build`. + build_context: "../project" + # Docker file name within docker build context, default is `Dockerfile`. + build_file: "Dockerfile" + +tasks: + download_data: + # Download boston housing dataset + io: + # Directory where dataset will be saved. + - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + preprocess_data: + # Preprocess dataset + io: + # Same directory location where dataset was downloaded + - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + train: + # Train gradient boosting regressor model + io: + # Processed dataset file + - {name: dataset_file_path, type: file, io: input, default: $WORKSPACE/data/processed_dataset.csv} + # Yaml file with training parameters. 
+ - {name: parameters_file, type: file, io: input, default: $WORKSPACE/parameters.yaml} \ No newline at end of file diff --git a/boston_housing/mlcube/mlcube_cli.py b/boston_housing/mlcube/mlcube_cli.py new file mode 100644 index 0000000..5df6a4b --- /dev/null +++ b/boston_housing/mlcube/mlcube_cli.py @@ -0,0 +1,70 @@ +""" +This requires the MLCube 2.0 configuration +""" +import os +import yaml +import click +import typing +from mlcube_docker.docker_run import DockerRun + + +def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict: + """Returns dictionary containing MLCube configuration""" + # Load mlcube config data + try: + with open(mlcube_config_path) as stream: + mlcube_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) + except IOError as exc: + # If file doesn't exist throw the exception: + # OSError: {PATH_TO}/mnist/mlcube.yaml: No such file or directory + raise IOError("%s: %s" % (mlcube_config_path, exc.strerror)) + + # Load user config data if file exists + if os.path.isfile(user_config_path): + with open(user_config_path) as stream: + user_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) + else: + return mlcube_config_data + + # Merge config data + tmp = mlcube_config_data['container'] + mlcube_config_data['container'] = user_config_data['container'] + mlcube_config_data['container'].update(tmp) + return mlcube_config_data + + +@click.group(name='mlcube') +def cli(): + pass + + +@cli.command(name='run', help='Run MLCube ML task.', + context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.option('--mlcube', required=False, type=str, help='Path to MLCube directory, default is current.') +@click.option('--platform', required=False, type=str, help='Platform to run MLCube, default is docker/podman.') +@click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.') +@click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within ' + 'MLCube folder') +def run(mlcube: str, platform: str, task: str, workspace: str): + mlcube_root = os.path.abspath(mlcube or os.getcwd()) + if os.path.isfile(mlcube_root): + mlcube_root = os.path.dirname(mlcube_root) + + platform = platform or 'docker' + if platform != 'docker': + raise ValueError(f"Only `docker` platform is supported") + + task = task or 'main' + workspace = workspace or os.path.join(mlcube_root, 'workspace') + + mlcube_config_data = load_config( + os.path.join(str(mlcube_root), 'mlcube.yaml'), + os.path.join(os.path.expanduser("~"), '.mlcube.yaml') + ) + + docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task) + docker_runner.run() + + +if __name__ == "__main__": + cli() \ No newline at end of file diff --git a/boston_housing/mlcube/platforms/docker.yaml b/boston_housing/mlcube/platforms/docker.yaml deleted file mode 100644 index f61fd6f..0000000 --- a/boston_housing/mlcube/platforms/docker.yaml +++ /dev/null @@ -1,15 +0,0 @@ -schema_type: mlcube_platform -schema_version: 0.1.0 - -platform: - name: "docker" - version: ">=18.01" - -container: - command: docker - run_args: >- - --rm --net=host --uts=host --ipc=host - --ulimit stack=67108864 --ulimit memlock=-1 - --privileged=true --security-opt seccomp=unconfined - -v /dev/shm:/dev/shm - image: mlcommons/boston_housing:0.0.1 \ No newline at end of file From 9251d7ba0edacb338aa00566dca684a634cb289f Mon Sep 17 00:00:00 2001 From: David Jurado Date: Thu, 5 Aug 2021 19:10:14 -0500 Subject: [PATCH 
04/10] Add support for overriding parameters at command line

---
 boston_housing/mlcube/mlcube_cli.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/boston_housing/mlcube/mlcube_cli.py b/boston_housing/mlcube/mlcube_cli.py
index 5df6a4b..4583fc8 100644
--- a/boston_housing/mlcube/mlcube_cli.py
+++ b/boston_housing/mlcube/mlcube_cli.py
@@ -33,6 +33,21 @@ def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict:
     return mlcube_config_data


+def override_extra_parameters(ctx, mlcube_config_data, task):
+    """Get extra parameters from context and override them on mlcube config data dict"""
+    for input_param in ctx.args:
+        input_key, input_value = input_param.split("=")
+        input_key = input_key.replace("--", "")
+        # Replace main container parameters
+        for key in mlcube_config_data["container"]:
+            if key==input_key:
+                mlcube_config_data["container"][key]=input_value
+        # Replace io paths in current task
+        for io in mlcube_config_data["tasks"][task]["io"]:
+            if io["name"]==input_key:
+                io["default"]=input_value
+
+
 @click.group(name='mlcube')
 def cli():
     pass
@@ -45,7 +60,8 @@ def cli():
 @click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.')
 @click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within '
                                                             'MLCube folder')
-def run(mlcube: str, platform: str, task: str, workspace: str):
+@click.pass_context
+def run(ctx, mlcube: str, platform: str, task: str, workspace: str):
     mlcube_root = os.path.abspath(mlcube or os.getcwd())
     if os.path.isfile(mlcube_root):
         mlcube_root = os.path.dirname(mlcube_root)
@@ -61,7 +77,7 @@ def run(mlcube: str, platform: str, task: str, workspace: str):
         os.path.join(str(mlcube_root), 'mlcube.yaml'),
         os.path.join(os.path.expanduser("~"), '.mlcube.yaml')
     )
-
+    override_extra_parameters(ctx, mlcube_config_data, task)
     docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task)
     docker_runner.run()


From b83e6148c5daaec2f138ece549a8ba5bfb0d92af Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Mon, 6 Sep 2021 11:06:14 -0500
Subject: [PATCH 05/10] Update config file to v2.0

---
 boston_housing/README.md            | 30 +++++-----
 boston_housing/mlcube/mlcube.yaml   | 16 +++---
 boston_housing/mlcube/mlcube_cli.py | 86 -----------------------
 3 files changed, 21 insertions(+), 111 deletions(-)
 delete mode 100644 boston_housing/mlcube/mlcube_cli.py

diff --git a/boston_housing/README.md b/boston_housing/README.md
index 086bd0f..77494a3 100644
--- a/boston_housing/README.md
+++ b/boston_housing/README.md
@@ -2,12 +2,11 @@
 In this tutorial we're going to use the [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html). We'll take an existing implementation, create the needed files to pack it into MLCube and execute all tasks.
-
 ## Original project code

 At first we have only 4 files, one for package dependencies and 3 scripts for each task: download data, preprocess data and train.

-```
+```bash
 ├── project
     ├── 01_download_dataset.py
     ├── 02_preprocess_dataset.py
     ├── 03_train.py
     └── requirements.txt
@@ -30,12 +29,11 @@ The most important thing that we need to remember about these scripts are the in
 **--dataset_file_path** : Processed dataset file path. Note: this is the full path to the csv file.
 **--n_estimators** : Number of boosting stages to perform. In this case we're using a gradient boosting regressor.
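(For orientation: with the file names used in this tutorial, running the three original scripts directly, outside of MLCube, would look roughly like this; `--n_estimators 500` is just an example value.)

```bash
python 01_download_dataset.py --data_dir ./data
python 02_preprocess_dataset.py --data_dir ./data
python 03_train.py --dataset_file_path ./data/processed_dataset.csv --n_estimators 500
```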
- ## MLCube scructure We'll need a couple of files for MLCube, first we'll need to create a folder called **mlcube** in the same path from as project folder. We'll need to create the following structure (for this tutorial the files are already in place) -``` +```bash ├── mlcube │   ├── mlcube.yaml │   ├── mlcube_cli.py @@ -103,7 +101,6 @@ process.wait() In this tutorial we already have a shell script containing the steps to run the train task, the file is: **project/run_and_time.sh**, please take a look and study its content. - ### MLCube Python CLI file The **mlcube/mlcube_cli.py** file simulates MLCube CLI. It is temporary stored here, and is part of MLCube library. The only command avaibale to execute is `run`, and the possible arguments are: @@ -115,11 +112,10 @@ The **mlcube/mlcube_cli.py** file simulates MLCube CLI. It is temporary stored h Example: -``` +```bash python mlcube_cli.py run --mlcube ./ --task train --platform docker ``` - ### MLCube Python entrypoint file At this point we know how to execute the tasks sripts from Python code, now we can create a file that contains the definition on how to run each task. @@ -142,7 +138,7 @@ Keep in mind the tag that we just described. At this point our solution folder structure should look like this: -``` +```bash ├── mlcube │   ├── mlcube.yaml │   ├── mlcube_cli.py @@ -158,7 +154,6 @@ At this point our solution folder structure should look like this: └── run_and_time.sh ``` - ### Define MLCube files Inside the mlcube folder we'll need to define the following files. @@ -181,15 +176,17 @@ This file is already provided, please take a look and study its content. With this file we have finished the packing of the project into MLCube! Now we can setup the project and run all the tasks. - ### Project setup -```Python + +```bash # Create Python environment virtualenv -p python3 ./env && source ./env/bin/activate -# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/sergey-serebryakov/mlbox.git && cd mlbox && git checkout feature/configV2 -cd ./runners/mlcube_docker && export PYTHONPATH=$(pwd) -cd ../../ && pip install -r mlcube/requirements.txt && pip install omegaconf && cd ../ + +# Install MLCube and MLCube docker runner from GitHub repository +# (normally, users will just run `pip install mlcube mlcube_docker`) +git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube +python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. +cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. # Fetch the boston housing example from GitHub git clone https://github.com/mlcommons/mlcube_examples && cd ./mlcube_examples @@ -208,7 +205,8 @@ The [Boston Housing Dataset](https://www.cs.toronto.edu/~delve/data/boston/bosto | Total | (After all tasks) | All | ~92 KB | ### Tasks execution -``` + +```bash # Download Boston housing dataset. Default path = /workspace/data # To override it, use --data_dir=DATA_DIR python mlcube_cli.py run --task download_data diff --git a/boston_housing/mlcube/mlcube.yaml b/boston_housing/mlcube/mlcube.yaml index 8c6843f..df0421a 100644 --- a/boston_housing/mlcube/mlcube.yaml +++ b/boston_housing/mlcube/mlcube.yaml @@ -6,7 +6,7 @@ authors: platform: accelerator_count: 0 -container: +docker: # Image name. image: mlcommons/boston_housing:0.0.1 # Docker build context relative to $MLCUBE_ROOT. 
Default is `build`. @@ -17,18 +17,16 @@ container: tasks: download_data: # Download boston housing dataset - io: + parameters: # Directory where dataset will be saved. - - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + outputs: {data_dir: data/} preprocess_data: # Preprocess dataset - io: + parameters: # Same directory location where dataset was downloaded - - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + inputs: {data_dir: data/} train: # Train gradient boosting regressor model - io: + parameters: # Processed dataset file - - {name: dataset_file_path, type: file, io: input, default: $WORKSPACE/data/processed_dataset.csv} - # Yaml file with training parameters. - - {name: parameters_file, type: file, io: input, default: $WORKSPACE/parameters.yaml} \ No newline at end of file + inputs: {dataset_file_path: data/processed_dataset.csv, parameters_file: parameters.yaml} \ No newline at end of file diff --git a/boston_housing/mlcube/mlcube_cli.py b/boston_housing/mlcube/mlcube_cli.py deleted file mode 100644 index 4583fc8..0000000 --- a/boston_housing/mlcube/mlcube_cli.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -This requires the MLCube 2.0 configuration -""" -import os -import yaml -import click -import typing -from mlcube_docker.docker_run import DockerRun - - -def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict: - """Returns dictionary containing MLCube configuration""" - # Load mlcube config data - try: - with open(mlcube_config_path) as stream: - mlcube_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) - except IOError as exc: - # If file doesn't exist throw the exception: - # OSError: {PATH_TO}/mnist/mlcube.yaml: No such file or directory - raise IOError("%s: %s" % (mlcube_config_path, exc.strerror)) - - # Load user config data if file exists - if os.path.isfile(user_config_path): - with open(user_config_path) as stream: - user_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) - else: - return mlcube_config_data - - # Merge config data - tmp = mlcube_config_data['container'] - mlcube_config_data['container'] = user_config_data['container'] - mlcube_config_data['container'].update(tmp) - return mlcube_config_data - - -def override_extra_parameters(ctx, mlcube_config_data, task): - """Get extra paramters from context and override them on mlcube config data dict""" - for input_param in ctx.args: - input_key, input_value = input_param.split("=") - input_key = input_key.replace("--", "") - # Replace main container parameters - for key in mlcube_config_data["container"]: - if key==input_key: - mlcube_config_data["container"][key]=input_value - # Replace io paths in current task - for io in mlcube_config_data["tasks"][task]["io"]: - if io["name"]==input_key: - io["default"]=input_value - - -@click.group(name='mlcube') -def cli(): - pass - - -@cli.command(name='run', help='Run MLCube ML task.', - context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) -@click.option('--mlcube', required=False, type=str, help='Path to MLCube directory, default is current.') -@click.option('--platform', required=False, type=str, help='Platform to run MLCube, default is docker/podman.') -@click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.') -@click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within ' - 'MLCube folder') -@click.pass_context -def run(ctx, mlcube: str, platform: str, task: str, 
workspace: str): - mlcube_root = os.path.abspath(mlcube or os.getcwd()) - if os.path.isfile(mlcube_root): - mlcube_root = os.path.dirname(mlcube_root) - - platform = platform or 'docker' - if platform != 'docker': - raise ValueError(f"Only `docker` platform is supported") - - task = task or 'main' - workspace = workspace or os.path.join(mlcube_root, 'workspace') - - mlcube_config_data = load_config( - os.path.join(str(mlcube_root), 'mlcube.yaml'), - os.path.join(os.path.expanduser("~"), '.mlcube.yaml') - ) - override_extra_parameters(ctx, mlcube_config_data, task) - docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task) - docker_runner.run() - - -if __name__ == "__main__": - cli() \ No newline at end of file From 4210f50676da55f2156131e0653c4dc90096e8d6 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Mon, 6 Sep 2021 11:14:33 -0500 Subject: [PATCH 06/10] Fix dockerfile --- boston_housing/project/Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/boston_housing/project/Dockerfile b/boston_housing/project/Dockerfile index cf6f369..e0c2d9f 100644 --- a/boston_housing/project/Dockerfile +++ b/boston_housing/project/Dockerfile @@ -4,10 +4,12 @@ FROM python:3.6-slim-buster WORKDIR /workspace/boston_housing +COPY requirements.txt requirements.txt + +RUN pip install --no-cache --disable-pip-version-check --default-timeout=1000 -U -r requirements.txt + COPY . . RUN chmod +x ./run_and_time.sh -RUN pip install --no-cache --disable-pip-version-check --default-timeout=1000 -U -r requirements.txt - ENTRYPOINT ["python", "/workspace/boston_housing/mlcube.py"] \ No newline at end of file From 9a6a7393d09954b09a37752904926815678a900d Mon Sep 17 00:00:00 2001 From: David Jurado Date: Mon, 6 Sep 2021 16:21:27 -0500 Subject: [PATCH 07/10] Update Readme --- boston_housing/README.md | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/boston_housing/README.md b/boston_housing/README.md index 77494a3..d2ee5c4 100644 --- a/boston_housing/README.md +++ b/boston_housing/README.md @@ -36,7 +36,6 @@ We'll need a couple of files for MLCube, first we'll need to create a folder cal ```bash ├── mlcube │   ├── mlcube.yaml -│   ├── mlcube_cli.py │   └── workspace │   └── parameters.yaml └── project @@ -101,19 +100,30 @@ process.wait() In this tutorial we already have a shell script containing the steps to run the train task, the file is: **project/run_and_time.sh**, please take a look and study its content. -### MLCube Python CLI file +### MLCube Command -The **mlcube/mlcube_cli.py** file simulates MLCube CLI. It is temporary stored here, and is part of MLCube library. The only command avaibale to execute is `run`, and the possible arguments are: +We are targeting pull-type installation, so MLCube images should be available on docker hub. If not, try this: - --mlcube TEXT Path to MLCube directory, default is current. - --platform TEXT Platform to run MLCube, default is docker/podman. - --task TEXT MLCube task name to run, default is `main`. - --workspace TEXT Workspace path, default is `workspace` within MLCube folder +```bash +mlcube run ... 
-Pdocker.build_strategy=auto
+```
+
+Parameters defined in mlcube.yaml can be overridden using the syntax `param=value`, for example:
+
+```bash
+mlcube run --task=download_data data_dir=absolute_path_to_custom_dir
+```
+
+Also, users can override the workspace directory by using:
+
+```bash
+mlcube run --task=download_data --workspace=absolute_path_to_custom_dir
+```
+
+Note: Sometimes overriding the workspace path can fail for some tasks, because the input parameter parameters_file must be specified explicitly. To solve this, use:
+
+```bash
+mlcube run --task=train --workspace=absolute_path_to_custom_dir parameters_file=$(pwd)/workspace/parameters.yaml
+```

At this point our solution folder structure should look like this:

```bash
├── mlcube
│   ├── mlcube.yaml
-│   ├── mlcube_cli.py
│   └── workspace
│       └── parameters.yaml
└── project
    ├── 01_download_dataset.py
    ├── 02_preprocess_dataset.py
    ├── 03_train.py
    ├── Dockerfile
    ├── mlcube.py
    ├── requirements.txt
    └── run_and_time.sh
```

### Tasks execution

```bash
# Download Boston housing dataset. Default path = /workspace/data
# To override it, use --data_dir=DATA_DIR
-python mlcube_cli.py run --task download_data
+mlcube run --task download_data

# Preprocess Boston housing dataset, this will convert raw .txt data to .csv format
# It will use the DATA_DIR path defined in the previous step
-python mlcube_cli.py run --task preprocess_data
+mlcube run --task preprocess_data

# Run training.
# Parameters to override: --dataset_file_path=DATASET_FILE_PATH --parameters_file=PATH_TO_TRAINING_PARAMS
-python mlcube_cli.py run --task train
+mlcube run --task train
```

From cecbc54bc110caec4e39c3f471f1b30f63837a61 Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Tue, 7 Sep 2021 16:37:52 -0500
Subject: [PATCH 08/10] Fix Readme

---
 boston_housing/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/boston_housing/README.md b/boston_housing/README.md
index d2ee5c4..bf85e30 100644
--- a/boston_housing/README.md
+++ b/boston_housing/README.md
@@ -217,7 +217,7 @@
 ```bash
 # Download Boston housing dataset. Default path = /workspace/data
-# To override it, use --data_dir=DATA_DIR
+# To override it, use data_dir=DATA_DIR
 mlcube run --task download_data

 # Preprocess Boston housing dataset, this will convert raw .txt data to .csv format
 # It will use the DATA_DIR path defined in the previous step
 mlcube run --task preprocess_data

 # Run training.
-# Parameters to override: --dataset_file_path=DATASET_FILE_PATH --parameters_file=PATH_TO_TRAINING_PARAMS
+# Parameters to override: dataset_file_path=DATASET_FILE_PATH parameters_file=PATH_TO_TRAINING_PARAMS
 mlcube run --task train
 ```

From 933bfea565f5fc6719197011b9923066935571a8 Mon Sep 17 00:00:00 2001
From: Sergey Serebryakov
Date: Thu, 30 Sep 2021 13:12:38 -0700
Subject: [PATCH 09/10] Bug fix: MLCube example never recognized it was running for the 1st time. (#35)

The greeting message printed by this MLCube can differ depending on whether it runs for the first time or not. It identifies this by checking if a chat file exists. The MLCube was implemented so that it checked for file existence after opening this file, so the function that checks whether the file exists was always returning true.
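In other words (a minimal sketch of the failure mode, not the project's exact code):

```Python
import os

def greeting(chat_file: str) -> str:
    # First-time users should get a different message.
    return "Nice to meet you." if not os.path.exists(chat_file) else "Nice to see you again."

# Buggy order: open(..., 'a') creates the file first, so the existence
# check inside greeting() never reports "first time".
with open("chat.txt", "a") as stream:
    stream.write(greeting("chat.txt"))

# Fixed order: evaluate the greeting before the file is created.
message = greeting("chat.txt")
with open("chat.txt", "a") as stream:
    stream.write(message)
```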
--- hello_world/hello_world.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hello_world/hello_world.py b/hello_world/hello_world.py index 3e8c807..7d68b81 100644 --- a/hello_world/hello_world.py +++ b/hello_world/hello_world.py @@ -1,4 +1,3 @@ -import os import typer from datetime import datetime @@ -17,11 +16,12 @@ def get_greeting_message(chat_file: str) -> str: @app.command() def hello(name_file: str = typer.Option(..., '--name'), chat_file: str = typer.Option(..., '--chat')): + greeting_message: str = get_greeting_message(chat_file) with open(chat_file, 'a') as chat_stream: chat_stream.write('[{}] Hi, {}! {}\n'.format( datetime.now(), get_name(name_file), - get_greeting_message(chat_file) + greeting_message )) From d0f0baa1af7953fdc9df3350ec61e876bccbff77 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Wed, 20 Oct 2021 19:00:04 -0500 Subject: [PATCH 10/10] Update MLCube installation command in Readme --- boston_housing/README.md | 12 ++++-------- mnist_openfl/pytorch/README.md | 10 ++-------- mnist_openfl/tensorflow/README.md | 10 ++-------- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/boston_housing/README.md b/boston_housing/README.md index bf85e30..2e982be 100644 --- a/boston_housing/README.md +++ b/boston_housing/README.md @@ -187,15 +187,11 @@ With this file we have finished the packing of the project into MLCube! Now we c ### Project setup +## Project setup + ```bash -# Create Python environment -virtualenv -p python3 ./env && source ./env/bin/activate - -# Install MLCube and MLCube docker runner from GitHub repository -# (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube -python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. -cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. +# Create Python environment and install MLCube Docker runner +virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker # Fetch the boston housing example from GitHub git clone https://github.com/mlcommons/mlcube_examples && cd ./mlcube_examples diff --git a/mnist_openfl/pytorch/README.md b/mnist_openfl/pytorch/README.md index bbdf172..b45c94b 100644 --- a/mnist_openfl/pytorch/README.md +++ b/mnist_openfl/pytorch/README.md @@ -3,14 +3,8 @@ ## Project setup ```bash -# Create Python environment -virtualenv -p python3 ./env && source ./env/bin/activate - -# Install MLCube and MLCube docker runner from GitHub repository -# (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube -python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. -cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. 
+# Create Python environment and install MLCube Docker runner +virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker ``` ## Clone MLCube examples and go to mnist_openfl directory diff --git a/mnist_openfl/tensorflow/README.md b/mnist_openfl/tensorflow/README.md index d758aaf..cb0fb3e 100644 --- a/mnist_openfl/tensorflow/README.md +++ b/mnist_openfl/tensorflow/README.md @@ -3,14 +3,8 @@ ## Project setup ```bash -# Create Python environment -virtualenv -p python3 ./env && source ./env/bin/activate - -# Install MLCube and MLCube docker runner from GitHub repository -# (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube -python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. -cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. +# Create Python environment and install MLCube Docker runner +virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker ``` ## Clone MLCube examples and go to mnist_openfl directory
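For reference, the clone step named by the heading above mirrors the Boston Housing example's setup; a sketch (the exact subdirectory is taken from the paths in this patch):

```bash
# Pick the framework directory you need (pytorch or tensorflow)
git clone https://github.com/mlcommons/mlcube_examples && cd ./mlcube_examples/mnist_openfl/tensorflow
```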