
Commit bc15998

modified wandb related files
1 parent 64540a7 commit bc15998

File tree: 3 files changed (+182 -12 lines)

- README.md (+32 -12)
- pipeline/train_wandb.py (+99)
- scripts/jl_exp_wandb.sh (+51)

Diff for: README.md

+32 -12
````diff
@@ -21,11 +21,12 @@ This project shows how to realize MLOps in Git/GitHub. In order to achieve this
 4. Run `dvc add [ADDED FILE OR DIRECTORY]` to track your data with DVC
 5. Run `dvc remote add -d gdrive_storage gdrive://[ID of specific folder in gdrive]` to add Google Drive as the remote data storage
 6. Run `dvc push`, then a URL for authentication is provided. Copy and paste it into the browser, and authenticate
-7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and add it as a [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name `GDRIVE_CREDENTIALS`
+7. Copy the content of `.dvc/tmp/gdrive-user-credentials.json` and add it as a [GitHub Secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) with the name `GDRIVE_CREDENTIAL`
 8. Run `git add . && git commit -m "initial commit" && git push origin main` to keep the initial setup
 9. Write your own pipeline under the `pipeline` directory. Code for basic image classification in TensorFlow is provided initially.
 10. Run the following `dvc stage add` for the training stage
 ```bash
+# if you want to use Iterative Studio / DVCLive for tracking training progress
 $ dvc stage add -n train \
     -p train.train_size,train.batch_size,train.epoch,train.lr \
     -d pipeline/modeling.py -d pipeline/train.py -d data \
@@ -35,25 +36,44 @@ $ dvc stage add -n train \
     --plots-no-cache dvclive/scalars/eval/sparse_categorical_accuracy.tsv \
     -o outputs/model \
     python pipeline/train.py outputs/model
+
+# if you want to use W&B for tracking training progress
+$ dvc stage add -n train \
+    -p train.train_size,train.batch_size,train.epoch,train.lr \
+    -d pipeline/modeling.py -d pipeline/train.py -d data \
+    -o outputs/model \
+    python pipeline/train.py outputs/model
 ```
-10. Run the following `dvc stage add` for the evaluate stage
+11. Run the following `dvc stage add` for the evaluate stage
 ```bash
+# if you want to use Iterative Studio / DVCLive for tracking training progress
 $ dvc stage add -n evaluate \
     -p evaluate.test,evaluate.batch_size \
     -d pipeline/evaluate.py -d data/test -d outputs/model \
     -M outputs/metrics.json \
     python pipeline/evaluate.py outputs/model
+
+# if you want to use W&B for tracking training progress
+$ dvc stage add -n evaluate \
+    -p evaluate.test,evaluate.batch_size \
+    -d pipeline/evaluate.py -d data/test -d outputs/model \
+    python pipeline/evaluate.py outputs/model
 ```
-11. Update `params.yaml` as you need.
-12. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main`
-13. Run `dvc repro` to run the pipeline initially
-14. Run `dvc add outputs/model.tar.gz` to add a compressed version of the model
-15. Run `dvc push outputs/model.tar.gz`
-16. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore the unnecessary directory
-17. Run `git add . && git commit -m "add initial pipeline run" && git push origin main`
-18. Add the access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secrets as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL`
-19. Add a GitHub access token to GitHub Secrets as `GH_ACCESS_TOKEN`
-20. Create a PR and write `#train` as a comment (you have to be the owner of the repo)
+12. Update `params.yaml` as you need.
+13. Run `git add . && git commit -m "add initial pipeline setup" && git push origin main`
+14. Run `dvc repro` to run the pipeline initially
+15. Run `dvc add outputs/model.tar.gz` to add a compressed version of the model
+16. Run `dvc push outputs/model.tar.gz`
+17. Run `echo "/pipeline/__pycache__" >> .gitignore` to ignore the unnecessary directory
+18. Run `git add . && git commit -m "add initial pipeline run" && git push origin main`
+19. Add the access token and user email of [JarvisLabs.ai](https://jarvislabs.ai/) to GitHub Secrets as `JARVISLABS_ACCESS_TOKEN` and `JARVISLABS_USER_EMAIL`
+20. Add a GitHub access token to GitHub Secrets as `GH_ACCESS_TOKEN`
+21. Create a PR and write `#train` as a comment (you have to be the owner of the repo)
+
+### W&B Integration Setup
+
+1. Add the W&B project name to GitHub Secrets as `WANDB_PROJECT`
+2. Add the W&B API key to GitHub Secrets as `WANDB_API_KEY`
 
 ### HuggingFace Integration Setup
 
````

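The `-p` flags in the `dvc stage add` commands above and the lookups in `pipeline/train_wandb.py` below together define the keys that `params.yaml` (step 12) must provide: `train.train`, `train.test`, `train.train_size`, `train.batch_size`, `train.epoch`, `train.lr`, `evaluate.test`, and `evaluate.batch_size`. A minimal sketch that writes such a file is shown here; the key names come from this commit, while the values are placeholders to replace with your own settings:

```python
# Sketch only: generate a starter params.yaml with the keys referenced above.
# The values below are illustrative placeholders, not the project's real settings.
import yaml

params = {
    "train": {
        "train": "train",      # subdirectory of data/ holding the training *.tfrecord files
        "test": "test",        # subdirectory of data/ holding the test *.tfrecord files
        "train_size": 1000,    # number of training examples (placeholder)
        "batch_size": 32,
        "epoch": 5,
        "lr": 0.001,
    },
    "evaluate": {
        "test": "test",
        "batch_size": 32,
    },
}

with open("params.yaml", "w") as f:
    yaml.safe_dump(params, f, sort_keys=False)
```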
Diff for: pipeline/train_wandb.py

+99
```python
import os
import sys
import glob
import yaml
import json
import random
import tarfile
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.applications import resnet50

import modeling

import wandb
from wandb.keras import WandbCallback

if len(sys.argv) != 2:
    sys.stderr.write("Arguments error. Usage:\n")
    sys.stderr.write("\tpython train_wandb.py output-dir\n")
    sys.exit(1)

params = yaml.safe_load(open("params.yaml"))["train"]
print(params)

train = 'data'/Path(params['train'])
test = 'data'/Path(params['test'])
output = Path(sys.argv[1])

_image_feature_description = {
    'image': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64),
}

def _parse_image_function(example_proto):
    features = tf.io.parse_single_example(example_proto, _image_feature_description)
    image = tf.io.decode_png(features['image'], channels=3)  # tf.io.decode_raw(features['image'], tf.uint8)
    image = tf.image.resize(image, [224, 224])
    image = resnet50.preprocess_input(image)

    label = tf.cast(features['label'], tf.int32)

    return image, label

def _read_dataset(epochs, batch_size, channel):
    # build a tf.data pipeline from the TFRecord shards under the given directory
    filenames = glob.glob(str(channel/'*.tfrecord'))
    dataset = tf.data.TFRecordDataset(filenames)

    dataset = dataset.map(_parse_image_function, num_parallel_calls=4)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    dataset = dataset.repeat(epochs)
    dataset = dataset.shuffle(buffer_size=10 * batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

    return dataset

def make_tarfile(output_filename, source_dir):
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))

def run_train():
    # W&B settings are injected through environment variables (GitHub Secrets / CI)
    project_name = os.environ["WANDB_PROJECT"]
    wandb_key = os.environ["WANDB_API_KEY"]
    wandb_run_name = os.environ["WANDB_RUN_NAME"]

    wandb.login(anonymous="never", key=wandb_key)
    _ = wandb.init(project=project_name,
                   config=params,
                   name=wandb_run_name)

    train_size = params['train_size']
    train_step_size = train_size // params['batch_size']

    train_ds = _read_dataset(params['epoch'], params['batch_size'], train)
    test_ds = _read_dataset(params['epoch'], params['batch_size'], test)

    wandbCallback = WandbCallback(training_data=train_ds,
                                  log_weights=True, log_gradients=True)

    m = modeling._build_keras_model()
    m = modeling._compile(m, float(params['lr']))

    m.fit(train_ds,
          epochs=params['epoch'],
          steps_per_epoch=train_step_size,
          validation_data=test_ds,
          callbacks=[wandbCallback])

    m.save(output,
           save_format='tf',
           signatures=modeling._get_signature(m))

    make_tarfile(f'{output}.tar.gz', output)

run_train()
```

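`train_wandb.py` imports a local `modeling` module that is not part of this diff and relies on three helpers: `_build_keras_model`, `_compile(model, lr)`, and `_get_signature(model)`. Below is a hypothetical sketch of what that interface could look like, assuming a ResNet50 backbone to match the 224×224 RGB inputs and `resnet50.preprocess_input` used above, and sparse categorical loss/metrics to match the integer labels (and the `sparse_categorical_accuracy` plot named in the README). The number of classes, head layers, and signature details are assumptions, not the repository's actual implementation:

```python
# Hypothetical sketch of the modeling interface assumed by train_wandb.py.
import tensorflow as tf
from tensorflow.keras.applications import ResNet50

NUM_CLASSES = 3  # assumption: set to the real number of labels


def _build_keras_model():
    # ResNet50 backbone matching the 224x224 RGB inputs prepared in _parse_image_function
    base = ResNet50(include_top=False, weights="imagenet",
                    input_shape=(224, 224, 3), pooling="avg")
    outputs = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(base.output)
    return tf.keras.Model(inputs=base.input, outputs=outputs)


def _compile(model, lr):
    # sparse loss/metric: labels arrive as plain int32 class ids
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss="sparse_categorical_crossentropy",
                  metrics=["sparse_categorical_accuracy"])
    return model


def _get_signature(model):
    # a single serving signature over the model's forward pass
    @tf.function(input_signature=[tf.TensorSpec([None, 224, 224, 3], tf.float32, name="image")])
    def serving_fn(image):
        return {"probabilities": model(image)}
    return {"serving_default": serving_fn}
```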
Diff for: scripts/jl_exp_wandb.sh

+51
```sh
#!/bin/sh

# install gh cli
apt install sudo
curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | sudo dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | sudo tee /etc/apt/sources.list.d/github-cli.list > /dev/null
sudo apt update
sudo apt install gh

# grant gh access
export GH_TOKEN='$GH_ACCESS_TOKEN'
git config --global user.name "chansung"
git config --global user.email "[email protected]"

# set W&B specific keys
export WANDB_PROJECT='$WANDB_PROJECT'
export WANDB_API_KEY='$WANDB_API_KEY'

# move to the repo
git clone https://github.com/codingpot/git-mlops.git

# install dependencies
cd git-mlops
gh auth setup-git
git checkout $CUR_BRANCH
pip install -r requirements.txt
pip install git+https://github.com/jarvislabsai/jlclient.git

# set Gdrive credential
mkdir .dvc/tmp
echo '$GDRIVE_CREDENTIAL' > .dvc/tmp/gdrive-user-credentials.json

# pull data
dvc pull

export WANDB_RUN_NAME=$CUR_BRANCH
dvc repro

exp_result=$(dvc exp show --only-changed --md)
wandb_url="https://wandb.ai/codingpot/git-mlops"
gh pr comment $CUR_PR_ID --body "[Visit W&B Log Page for this Pull Request]($wandb_url)"

git reset --hard

echo ${exp_ids[$idx]}
echo ${exp_names[$idx]}
dvc add outputs/model.tar.gz
dvc push outputs/model.tar.gz

VM_ID=$(tail -n 2 /home/.jarviscloud/jarvisconfig | head -n 1)
python clouds/jarvislabs.py vm destroy $CLOUD_AT $CLOUD_ID $VM_ID
```
