diff --git a/.github/stale.yml b/.github/stale.yml index 9871c8ec21a73..9157b33c61936 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -1,11 +1,11 @@ # Configuration for probot-stale - https://github.com/probot/stale # Number of days of inactivity before an Issue or Pull Request becomes stale -daysUntilStale: 365 +daysUntilStale: 10000 # Number of days of inactivity before an Issue or Pull Request with the stale label is closed. # Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale. -daysUntilClose: 730 +daysUntilClose: false # Only issues or pull requests with all of these labels are check if stale. Defaults to `[]` (disabled) onlyLabels: [] @@ -25,7 +25,7 @@ exemptMilestones: false exemptAssignees: false # Label to use when marking as stale -staleLabel: stale +staleLabel: Needs-Author-Feedback # Comment to post when marking as stale. Set to `false` to disable markComment: > @@ -34,8 +34,8 @@ markComment: > for your contributions. # Comment to post when removing the stale label. -unmarkComment: > - Thank you for interacting with this issue! Removing the stale label! +#unmarkComment: > +# Thank you for interacting with this issue! Removing the stale label! # Comment to post when closing a stale Issue or Pull Request. closeComment: > diff --git a/.github/workflows/jira.pr.yml b/.github/workflows/jira.pr.yml deleted file mode 100644 index 45d1a3e03aaae..0000000000000 --- a/.github/workflows/jira.pr.yml +++ /dev/null @@ -1,20 +0,0 @@ -on: - pull_request: - branches: - - master - -name: Jira Ticket Creation for PRs - -jobs: - create_ticket: - name: Create Jira ticket over API - runs-on: ubuntu-latest - steps: - - name: Call REST API - shell: bash - run: > - curl - -u ${{ secrets.JIRA_USER_EMAIL }}:${{ secrets.JIRA_API_TOKEN }} - -H 'Content-Type: application/json' - --data-raw '{"fields":{"project":{"key":"ML"},"summary":"Review \"${{ github.event.pull_request.title }}\" by ${{ github.event.pull_request.user.login }} [${{ github.repository }} #${{ github.event.number }}] ","description":"${{ github.event.pull_request._links.html.href }}","customfield_10008":"ML-9504","assignee":{"name":"eng-ml-platform-team"},"issuetype":{"name":"Story"},"components":[{"name":"ML Platform"}]}}' - ${{ secrets.JIRA_BASE_URL }}/rest/api/2/issue/ diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml new file mode 100644 index 0000000000000..4c58c4746c748 --- /dev/null +++ b/.github/workflows/master.yml @@ -0,0 +1,27 @@ +name: MLflow tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + python-small: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.6 + uses: actions/setup-python@v1 + with: + python-version: 3.6 + - name: Install dependencies + run: | + export GITHUB_WORKFLOW=1 + INSTALL_SMALL_PYTHON_DEPS=true source ./travis/install-common-deps.sh + - name: Run tests + run: | + export GITHUB_WORKFLOW=1 + export PATH="$HOME/miniconda/bin:$PATH" + source activate test-environment + ./travis/run-small-python-tests.sh diff --git a/.travis.yml b/.travis.yml index 1be8011cf35a5..9238e1d54eb59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -84,7 +84,7 @@ matrix: - pip install -r travis/small-requirements.txt - pip install -e . script: - - pytest --verbose --ignore=tests/h2o --ignore=tests/keras --ignore=tests/pytorch --ignore=tests/pyfunc --ignore=tests/sagemaker --ignore=tests/sklearn --ignore=tests/spark --ignore=tests/tensorflow --ignore=tests/keras_autolog --ignore=tests/tensorflow_autolog --ignore tests/azureml --ignore tests/onnx --ignore tests/projects --ignore=tests/xgboost --ignore=tests/lightgbm --ignore=tests/spark_autologging tests + - pytest --verbose --ignore=tests/h2o --ignore=tests/keras --ignore=tests/pytorch --ignore=tests/pyfunc --ignore=tests/sagemaker --ignore=tests/sklearn --ignore=tests/spark --ignore=tests/tensorflow --ignore=tests/keras_autolog --ignore=tests/tensorflow_autolog --ignore tests/azureml --ignore tests/onnx --ignore tests/projects --ignore=tests/xgboost --ignore=tests/lightgbm --ignore=tests/spark_autologging --ignore=tests/spacy tests - language: python python: 3.6 name: "Docs (rsthtml, javadocs)" diff --git a/docs/source/conf.py b/docs/source/conf.py index b3444487c05dc..9b8f9e4df57bd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -34,6 +34,7 @@ # ones. extensions = [ 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'sphinx_click.ext', ] diff --git a/docs/source/model-registry.rst b/docs/source/model-registry.rst index f7c32efcd64fc..e213ba7bdaac2 100644 --- a/docs/source/model-registry.rst +++ b/docs/source/model-registry.rst @@ -95,6 +95,7 @@ There are three programmatic ways to add a model to the registry. First, you can from sklearn.ensemble import RandomForestRegressor import mlflow + import mlflow.sklearn with mlflow.start_run(run_name="YOUR_RUN_NAME") as run: params = {"n_estimators": 5, "random_state": 42} @@ -103,7 +104,7 @@ There are three programmatic ways to add a model to the registry. First, you can # Log parameters and metrics using the MLflow APIs mlflow.log_params(params) mlflow.log_param("param_1", randint(0, 100)) - mlflow.log_metrics({"metric_1": random(), "metric_2", random() + 1}) + mlflow.log_metrics({"metric_1": random(), "metric_2": random() + 1}) # Log the sklearn model and register as version 1 mlflow.sklearn.log_model( diff --git a/docs/source/models.rst b/docs/source/models.rst index c67a889b65cfb..4cad04160c724 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -561,7 +561,10 @@ Example requests: }' # record-oriented (fine for vector rows, loses ordering for JSON records) - curl http://127.0.0.1:5000/invocations -H 'Content-Type: application/json; format=pandas-records' -d '[[1, 2, 3], [4, 5, 6]]' + curl http://127.0.0.1:5000/invocations -H 'Content-Type: application/json; format=pandas-records' -d '[ + {"a": 1,"b": 2,"c": 3}, + {"a": 4,"b": 5,"c": 6} + ]' For more information about serializing pandas DataFrames, see diff --git a/docs/source/search-syntax.rst b/docs/source/search-syntax.rst index 567a34db9c999..0295252582ab5 100644 --- a/docs/source/search-syntax.rst +++ b/docs/source/search-syntax.rst @@ -185,6 +185,19 @@ To search all known experiments for any MLflow runs created using the Inception all_experiments = [exp.experiment_id for exp in MlflowClient().list_experiments()] runs = MlflowClient().search_runs(experiment_ids=all_experiments, filter_string="params.model = 'Inception'", run_view_type=ViewType.ALL) +R +^^^^^^ +The R API is similar to the Python API. + +.. code-block:: r + + library(mlflow) + mlflow_search_runs( + filter = "metrics.rmse < 0.9 and tags.production = 'true'", + experiment_ids = as.character(1:2), + order_by = "params.lr DESC" + ) + Java ^^^^ The Java API is similar to Python API. diff --git a/docs/source/tracking.rst b/docs/source/tracking.rst index 15375901c3848..d6f852d583a41 100644 --- a/docs/source/tracking.rst +++ b/docs/source/tracking.rst @@ -238,9 +238,9 @@ Autologging captures the following information: +------------------+--------------------------------------------------------+--------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ | Framework | Metrics | Parameters | Tags | Artifacts | +------------------+--------------------------------------------------------+--------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| Keras | Training loss; validation loss; user-specified metrics | ``fit()`` parameters; optimizer name; learning rate; epsilon | Model summary | Model summary on training start; `MLflow Model `_ (Keras model) on training end | +| Keras | Training loss; validation loss; user-specified metrics | ``fit()`` parameters; optimizer name; learning rate; epsilon | -- | Model summary on training start; `MLflow Model `_ (Keras model) on training end | +------------------+--------------------------------------------------------+--------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ -| ``tf.keras`` | Training loss; validation loss; user-specified metrics | ``fit()`` parameters; optimizer name; learning rate; epsilon | Model summary | Model summary on training start; `MLflow Model `_ (Keras model), TensorBoard logs on training end | +| ``tf.keras`` | Training loss; validation loss; user-specified metrics | ``fit()`` parameters; optimizer name; learning rate; epsilon | -- | Model summary on training start; `MLflow Model `_ (Keras model), TensorBoard logs on training end | +------------------+--------------------------------------------------------+--------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ | ``tf.estimator`` | TensorBoard metrics | steps, max_steps | -- | `MLflow Model `_ (TF saved model) on call to ``tf.estimator.export_saved_model`` | +------------------+--------------------------------------------------------+--------------------------------------------------------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/docs/source/tutorials-and-examples/index.rst b/docs/source/tutorials-and-examples/index.rst index 8d807e95b5445..7e5867db9bda5 100644 --- a/docs/source/tutorials-and-examples/index.rst +++ b/docs/source/tutorials-and-examples/index.rst @@ -33,6 +33,8 @@ Below, you can find a number of tutorials and examples for various MLflow use ca - `Glmnet (R) `_ + - `SpaCy `_ + - scikit-learn + `Diabetes example `_ diff --git a/docs/theme/mlflow/static/css/theme.css b/docs/theme/mlflow/static/css/theme.css index e54080c55a02d..f5eb1d889a226 100644 --- a/docs/theme/mlflow/static/css/theme.css +++ b/docs/theme/mlflow/static/css/theme.css @@ -5291,7 +5291,7 @@ a .rst-content code { .rst-content .viewcode-link, .rst-content .viewcode-back { display: inline-block; - color: #27AE60; + color: #2980B9; font-size: 80%; padding-left: 24px } @@ -5340,3 +5340,11 @@ a .rst-content code { src: local("Inconsolata Bold"), local("Inconsolata-Bold"), url(../fonts/Inconsolata-Bold.ttf) format("truetype") } /*# sourceMappingURL=theme.css.map */ + +div.viewcode-block:target { + background-color: #d3f1f3; + border-top: 1px solid #44c1cb; + border-bottom: 1px solid #44c1cb; + margin: 0px -12px; + padding: 0px 12px; +} diff --git a/examples/flower_classifier/README.rst b/examples/flower_classifier/README.rst index f68f7f7fb0111..d11787ae2c293 100644 --- a/examples/flower_classifier/README.rst +++ b/examples/flower_classifier/README.rst @@ -98,14 +98,14 @@ run_id ``101``. .. code-block:: bash # score the deployed model - python score_images_rest.py --model-uri runs:/101/model --port 54321 http://127.0.0.1 --data-path /path/to/images/for/scoring + python score_images_rest.py --host http://127.0.0.1 --port 54321 /path/to/images/for/scoring - To test batch scoring in Spark, run score_images_spark.py to score the model in Spark like this: .. code-block:: bash - python score_images_spark.py --model-uri runs:/101/model --data-path /path/to/images/for/scoring + python score_images_spark.py --model-uri runs:/101/model /path/to/images/for/scoring diff --git a/examples/flower_classifier/conda.yaml b/examples/flower_classifier/conda.yaml index 82ba15e23929a..41897ca24bf01 100644 --- a/examples/flower_classifier/conda.yaml +++ b/examples/flower_classifier/conda.yaml @@ -4,10 +4,11 @@ channels: - anaconda dependencies: - python==3.7 - - pandas - - scikit-learn - - tensorflow-mkl - - keras + - pandas==1.0.3 + - scikit-learn==0.22.1 + - tensorflow==1.13.1 + - keras==2.3.1 + - pillow==7.0.0 + - pip==20.0.2 - pip: - - mlflow>=1.0 - - pillow + - mlflow>=1.6 diff --git a/examples/flower_classifier/image_pyfunc.py b/examples/flower_classifier/image_pyfunc.py index 35e2b256652cb..da694f8108698 100644 --- a/examples/flower_classifier/image_pyfunc.py +++ b/examples/flower_classifier/image_pyfunc.py @@ -10,6 +10,7 @@ import pandas as pd import PIL from PIL import Image +import pip import yaml import tensorflow as tf @@ -128,6 +129,7 @@ def log_model(keras_model, artifact_path, image_dims, domain): keras_version=keras.__version__, tf_name=tf.__name__, # can have optional -gpu suffix tf_version=tf.__version__, + pip_version=pip.__version__, pillow_version=PIL.__version__)) mlflow.pyfunc.log_model(artifact_path=artifact_path, @@ -165,6 +167,8 @@ def _load_pyfunc(path): - python=={python_version} - keras=={keras_version} - {tf_name}=={tf_version} + - pip=={pip_version} + - pillow=={pillow_version} - pip: - - pillow=={pillow_version} + - mlflow>=1.6 """ diff --git a/examples/flower_classifier/score_images_rest.py b/examples/flower_classifier/score_images_rest.py index 0c6da2653ed65..a57827d3e15a6 100644 --- a/examples/flower_classifier/score_images_rest.py +++ b/examples/flower_classifier/score_images_rest.py @@ -14,12 +14,12 @@ from mlflow.utils import cli_args -def score_model(path, uri, port): +def score_model(path, host, port): """ Score images on the local path with MLflow model deployed at given uri and port. :param path: Path to a single image file or a directory of images. - :param uri: URI the model is deployed at + :param host: host the model is deployed at :param port: Port the model is deployed at. :return: Server response. """ @@ -36,7 +36,7 @@ def read_image(x): data = pd.DataFrame(data=[base64.encodebytes(read_image(x)) for x in filenames], columns=["image"]).to_json(orient="split") - response = requests.post(url='{uri}:{port}/invocations'.format(uri=uri, port=port), + response = requests.post(url='{host}:{port}/invocations'.format(host=host, port=port), data=data, headers={"Content-Type": "application/json; format=pandas-split"}) @@ -50,14 +50,14 @@ def read_image(x): @click.command(help="Score images.") @click.option("--port", type=click.INT, default=80, help="Port at which the model is deployed.") -@cli_args.MODEL_URI -@click.argument("--data-path", "-d") -def run(data_path, model_uri, port): +@cli_args.HOST +@click.argument("data-path") +def run(data_path, host, port): """ Score images with MLflow deployed deployed at given uri and port and print out the response to standard out. """ - print(score_model(data_path, model_uri, port).text) + print(score_model(data_path, host, port).text) if __name__ == '__main__': diff --git a/examples/flower_classifier/score_images_spark.py b/examples/flower_classifier/score_images_spark.py index 2f24bd24d155d..932f1b2dca2d0 100644 --- a/examples/flower_classifier/score_images_spark.py +++ b/examples/flower_classifier/score_images_spark.py @@ -69,8 +69,8 @@ def score_model(spark, data_path, model_uri): @click.command(help="Score images.") @cli_args.MODEL_URI -@click.argument("--data-path", "-d") -def run(model_uri, data_path): +@click.argument("data-path") +def run(data_path, model_uri): with pyspark.sql.SparkSession.builder \ .config(key="spark.python.worker.reuse", value=True) \ .config(key="spark.ui.enabled", value=False) \ diff --git a/examples/spacy/MLproject b/examples/spacy/MLproject new file mode 100644 index 0000000000000..0a266a1e9c8b4 --- /dev/null +++ b/examples/spacy/MLproject @@ -0,0 +1,7 @@ +name: spacy_ner_example + +conda_env: conda.yaml + +entry_points: + main: + command: "python train.py" diff --git a/examples/spacy/conda.yaml b/examples/spacy/conda.yaml new file mode 100644 index 0000000000000..afc718db17d8b --- /dev/null +++ b/examples/spacy/conda.yaml @@ -0,0 +1,10 @@ +name: spacy-example +channels: + - defaults + - anaconda +dependencies: + - python==3.6 + - spacy=2.2.3 + - pip: + - mlflow>=1.0 + diff --git a/examples/spacy/train.py b/examples/spacy/train.py new file mode 100644 index 0000000000000..7eb7c6016df9b --- /dev/null +++ b/examples/spacy/train.py @@ -0,0 +1,65 @@ +from __future__ import print_function + +import random + +import spacy +from spacy.util import minibatch, compounding + +import mlflow.spacy + +# training data +TRAIN_DATA = [ + ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), + ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}), +] + +if __name__ == "__main__": + # Adaptation of spaCy example: https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py + + # create blank model and add ner to the pipeline + nlp = spacy.blank("en") + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner, last=True) + + # add labels + for _, annotations in TRAIN_DATA: + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + + params = { + 'n_iter':100, + 'drop': 0.5 + } + mlflow.log_params(params) + + nlp.begin_training() + for itn in range(params['n_iter']): + random.shuffle(TRAIN_DATA) + losses = {} + # batch up the examples using spaCy's minibatch + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update( + texts, # batch of texts + annotations, # batch of annotations + drop=params['drop'], # dropout - make it harder to memorise data + losses=losses, + ) + print("Losses", losses) + mlflow.log_metrics(losses) + + # Log the spaCy model using mlflow + mlflow.spacy.log_model(spacy_model=nlp, artifact_path='model') + model_uri = "runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, + artifact_path='model') + + print("Model saved in run %s" % mlflow.active_run().info.run_uuid) + + # Load the model using mlflow and use it to predict data + nlp2 = mlflow.spacy.load_model(model_uri=model_uri) + for text, _ in TRAIN_DATA: + doc = nlp2(text) + print("Entities", [(ent.text, ent.label_) for ent in doc.ents]) + print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc]) diff --git a/mlflow/azureml/__init__.py b/mlflow/azureml/__init__.py index 05fd35a17902a..c40f765d983fa 100644 --- a/mlflow/azureml/__init__.py +++ b/mlflow/azureml/__init__.py @@ -85,35 +85,38 @@ def build_image(model_uri, workspace, image_name=None, model_name=None, - An ``azureml.core.image.ContainerImage`` object containing metadata for the new image. - An ``azureml.core.model.Model`` object containing metadata for the new model. - >>> import mlflow.azureml - >>> from azureml.core import Workspace - >>> from azureml.core.webservice import AciWebservice, Webservice - >>> - >>> # Load or create an Azure ML Workspace - >>> workspace_name = "" - >>> subscription_id = "" - >>> resource_group = "" - >>> location = "" - >>> azure_workspace = Workspace.create(name=workspace_name, - >>> subscription_id=subscription_id, - >>> resource_group=resource_group, - >>> location=location, - >>> create_resource_group=True, - >>> exist_okay=True) - >>> - >>> # Build an Azure ML Container Image for an MLflow model - >>> azure_image, azure_model = mlflow.azureml.build_image( - >>> model_uri="", - >>> workspace=azure_workspace, - >>> synchronous=True) - >>> # If your image build failed, you can access build logs at the following URI: - >>> print("Access the following URI for build logs: {}".format(azure_image.image_build_log_uri)) - >>> - >>> # Deploy the image to Azure Container Instances (ACI) for real-time serving - >>> webservice_deployment_config = AciWebservice.deploy_configuration() - >>> webservice = Webservice.deploy_from_image( - >>> image=azure_image, workspace=azure_workspace, name="") - >>> webservice.wait_for_deployment() + .. code-block:: python + :caption: Example + + import mlflow.azureml + from azureml.core import Workspace + from azureml.core.webservice import AciWebservice, Webservice + + # Load or create an Azure ML Workspace + workspace_name = "" + subscription_id = "" + resource_group = "" + location = "" + azure_workspace = Workspace.create(name=workspace_name, + subscription_id=subscription_id, + resource_group=resource_group, + location=location, + create_resource_group=True, + exist_okay=True) + + # Build an Azure ML Container Image for an MLflow model + azure_image, azure_model = mlflow.azureml.build_image( + model_uri="", + workspace=azure_workspace, + synchronous=True) + # If your image build failed, you can access build logs at the following URI: + print("Access the following URI for build logs: {}".format(azure_image.image_build_log_uri)) + + # Deploy the image to Azure Container Instances (ACI) for real-time serving + webservice_deployment_config = AciWebservice.deploy_configuration() + webservice = Webservice.deploy_from_image( + image=azure_image, workspace=azure_workspace, name="") + webservice.wait_for_deployment() """ # The Azure ML SDK is only compatible with Python 3. However, the `mlflow.azureml` module should # still be accessible for import from Python 2. Therefore, we will only import from the SDK diff --git a/mlflow/gluon.py b/mlflow/gluon.py index 153ff9456a788..eb670527649d3 100644 --- a/mlflow/gluon.py +++ b/mlflow/gluon.py @@ -43,9 +43,12 @@ def load_model(model_uri, ctx): :return: A Gluon model instance. - >>> # Load persisted model as a Gluon model, make inferences against an NDArray - >>> model = mlflow.gluon.load_model("runs:/" + gluon_random_data_run.info.run_id + "/model") - >>> model(nd.array(np.random.rand(1000, 1, 32))) + .. code-block:: python + :caption: Example + + # Load persisted model as a Gluon model, make inferences against an NDArray + model = mlflow.gluon.load_model("runs:/" + gluon_random_data_run.info.run_id + "/model") + model(nd.array(np.random.rand(1000, 1, 32))) """ local_model_path = _download_artifact_from_uri(artifact_uri=model_uri) @@ -109,25 +112,28 @@ def save_model(gluon_model, path, mlflow_model=Model(), conda_env=None): ] } - >>> from mxnet.gluon import Trainer - >>> from mxnet.gluon.contrib import estimator - >>> from mxnet.gluon.loss import SoftmaxCrossEntropyLoss - >>> from mxnet.gluon.nn import HybridSequential - >>> from mxnet.metric import Accuracy - >>> import mlflow - >>> # Build, compile, and train your model - >>> gluon_model_path = ... - >>> net = HybridSequential() - >>> with net.name_scope(): - >>> ... - >>> net.hybridize() - >>> net.collect_params().initialize() - >>> softmax_loss = SoftmaxCrossEntropyLoss() - >>> trainer = Trainer(net.collect_params()) - >>> est = estimator.Estimator(net=net, loss=softmax_loss, metrics=Accuracy(), trainer=trainer) - >>> est.fit(train_data=train_data, epochs=100, val_data=validation_data) - ... # Save the model as an MLflow Model - >>> mlflow.gluon.save_model(net, gluon_model_path) + .. code-block:: python + :caption: Example + + from mxnet.gluon import Trainer + from mxnet.gluon.contrib import estimator + from mxnet.gluon.loss import SoftmaxCrossEntropyLoss + from mxnet.gluon.nn import HybridSequential + from mxnet.metric import Accuracy + import mlflow + # Build, compile, and train your model + gluon_model_path = ... + net = HybridSequential() + with net.name_scope(): + ... + net.hybridize() + net.collect_params().initialize() + softmax_loss = SoftmaxCrossEntropyLoss() + trainer = Trainer(net.collect_params()) + est = estimator.Estimator(net=net, loss=softmax_loss, metrics=Accuracy(), trainer=trainer) + est.fit(train_data=train_data, epochs=100, val_data=validation_data) + # Save the model as an MLflow Model + mlflow.gluon.save_model(net, gluon_model_path) """ path = os.path.abspath(path) if os.path.exists(path): @@ -187,25 +193,28 @@ def log_model(gluon_model, artifact_path, conda_env=None): ] } - >>> from mxnet.gluon import Trainer - >>> from mxnet.gluon.contrib import estimator - >>> from mxnet.gluon.loss import SoftmaxCrossEntropyLoss - >>> from mxnet.gluon.nn import HybridSequential - >>> from mxnet.metric import Accuracy - >>> import mlflow - >>> # Build, compile, and train your model - >>> net = HybridSequential() - >>> with net.name_scope(): - >>> ... - >>> net.hybridize() - >>> net.collect_params().initialize() - >>> softmax_loss = SoftmaxCrossEntropyLoss() - >>> trainer = Trainer(net.collect_params()) - >>> est = estimator.Estimator(net=net, loss=softmax_loss, metrics=Accuracy(), trainer=trainer) - >>> # Log metrics and log the model - >>> with mlflow.start_run() as run: - >>> est.fit(train_data=train_data, epochs=100, val_data=validation_data) - >>> mlflow.gluon.log_model(net, "model") + .. code-block:: python + :caption: Example + + from mxnet.gluon import Trainer + from mxnet.gluon.contrib import estimator + from mxnet.gluon.loss import SoftmaxCrossEntropyLoss + from mxnet.gluon.nn import HybridSequential + from mxnet.metric import Accuracy + import mlflow + # Build, compile, and train your model + net = HybridSequential() + with net.name_scope(): + ... + net.hybridize() + net.collect_params().initialize() + softmax_loss = SoftmaxCrossEntropyLoss() + trainer = Trainer(net.collect_params()) + est = estimator.Estimator(net=net, loss=softmax_loss, metrics=Accuracy(), trainer=trainer) + # Log metrics and log the model + with mlflow.start_run(): + est.fit(train_data=train_data, epochs=100, val_data=validation_data) + mlflow.gluon.log_model(net, "model") """ Model.log(artifact_path=artifact_path, flavor=mlflow.gluon, gluon_model=gluon_model, conda_env=conda_env) diff --git a/mlflow/keras.py b/mlflow/keras.py index 5cb0fcf6bd9c1..aa847724059bf 100644 --- a/mlflow/keras.py +++ b/mlflow/keras.py @@ -114,15 +114,18 @@ def save_model(keras_model, path, conda_env=None, mlflow_model=Model(), custom_o attempt to infer the Keras module based on the given model. :param kwargs: kwargs to pass to ``keras_model.save`` method. - >>> import mlflow - >>> # Build, compile, and train your model - >>> keras_model = ... - >>> keras_model_path = ... - >>> keras_model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"]) - >>> results = keras_model.fit( - ... x_train, y_train, epochs=20, batch_size = 128, validation_data=(x_val, y_val)) - ... # Save the model as an MLflow Model - >>> mlflow.keras.save_model(keras_model, keras_model_path) + .. code-block:: python + :caption: Example + + import mlflow + # Build, compile, and train your model + keras_model = ... + keras_model_path = ... + keras_model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"]) + results = keras_model.fit( + x_train, y_train, epochs=20, batch_size = 128, validation_data=(x_val, y_val)) + # Save the model as an MLflow Model + mlflow.keras.save_model(keras_model, keras_model_path) """ if keras_module is None: def _is_plain_keras(model): @@ -247,16 +250,19 @@ def log_model(keras_model, artifact_path, conda_env=None, custom_objects=None, k registered model if one with the given name does not exist. :param kwargs: kwargs to pass to ``keras_model.save`` method. - >>> from keras import Dense, layers - >>> import mlflow - >>> # Build, compile, and train your model - >>> keras_model = ... - >>> keras_model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"]) - >>> results = keras_model.fit( - ... x_train, y_train, epochs=20, batch_size = 128, validation_data=(x_val, y_val)) - >>> # Log metrics and log the model - >>> with mlflow.start_run() as run: - >>> mlflow.keras.log_model(keras_model, "models") + .. code-block:: python + :caption: Example + + from keras import Dense, layers + import mlflow + # Build, compile, and train your model + keras_model = ... + keras_model.compile(optimizer="rmsprop", loss="mse", metrics=["accuracy"]) + results = keras_model.fit( + x_train, y_train, epochs=20, batch_size = 128, validation_data=(x_val, y_val)) + # Log metrics and log the model + with mlflow.start_run() as run: + mlflow.keras.log_model(keras_model, "models") """ Model.log(artifact_path=artifact_path, flavor=mlflow.keras, keras_model=keras_model, conda_env=conda_env, custom_objects=custom_objects, @@ -383,9 +389,12 @@ def load_model(model_uri, **kwargs): :return: A Keras model instance. - >>> # Load persisted model as a Keras model or as a PyFunc, call predict() on a pandas DataFrame - >>> keras_model = mlflow.keras.load_model("runs:/96771d893a5e46159d9f3b49bf9013e2" + "/models") - >>> predictions = keras_model.predict(x_test) + .. code-block:: python + :caption: Example + + # Load persisted model as a Keras model or as a PyFunc, call predict() on a pandas DataFrame + keras_model = mlflow.keras.load_model("runs:/96771d893a5e46159d9f3b49bf9013e2" + "/models") + predictions = keras_model.predict(x_test) """ local_model_path = _download_artifact_from_uri(artifact_uri=model_uri) flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME) @@ -449,8 +458,6 @@ def on_train_begin(self, logs=None): # pylint: disable=unused-argument sum_list = [] self.model.summary(print_fn=sum_list.append) summary = '\n'.join(sum_list) - try_mlflow_log(mlflow.set_tag, 'model_summary', summary) - tempdir = tempfile.mkdtemp() try: summary_file = os.path.join(tempdir, "model_summary.txt") diff --git a/mlflow/mleap.py b/mlflow/mleap.py index 76dbcd7a535cc..0815512d7db78 100644 --- a/mlflow/mleap.py +++ b/mlflow/mleap.py @@ -46,36 +46,38 @@ def log_model(spark_model, sample_input, artifact_path, registered_model_name=No version under ``registered_model_name``, also creating a registered model if one with the given name does not exist. - >>> import mlflow - >>> import mlflow.mleap - >>> import pyspark - >>> from pyspark.ml import Pipeline - >>> from pyspark.ml.classification import LogisticRegression - >>> from pyspark.ml.feature import HashingTF, Tokenizer - >>># training DataFrame - >>> training = spark.createDataFrame([ - ... (0, "a b c d e spark", 1.0), - ... (1, "b d", 0.0), - ... (2, "spark f g h", 1.0), - ... (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) - >>># testing DataFrame - >>> test_df = spark.createDataFrame([ - ... (4, "spark i j k"), - ... (5, "l m n"), - ... (6, "spark hadoop spark"), - ... (7, "apache hadoop")], ["id", "text"]) - >>> # Create an MLlib pipeline - >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") - >>> hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - >>> lr = LogisticRegression(maxIter=10, regParam=0.001) - >>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) - >>> model = pipeline.fit(training) - >>> #log parameters - >>> mlflow.log_param("max_iter", 10) - >>> mlflow.log_param("reg_param", 0.001) - >>> #log the Spark MLlib model in MLeap format - >>> mlflow.mleap.log_model(spark_model=model, sample_input=test_df, - >>> artifact_path="mleap-model") + .. code-block:: python + :caption: Example + + import mlflow + import mlflow.mleap + import pyspark + from pyspark.ml import Pipeline + from pyspark.ml.classification import LogisticRegression + from pyspark.ml.feature import HashingTF, Tokenizer + # training DataFrame + training = spark.createDataFrame([ + (0, "a b c d e spark", 1.0), + (1, "b d", 0.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) + # testing DataFrame + test_df = spark.createDataFrame([ + (4, "spark i j k"), + (5, "l m n"), + (6, "spark hadoop spark"), + (7, "apache hadoop")], ["id", "text"]) + # Create an MLlib pipeline + tokenizer = Tokenizer(inputCol="text", outputCol="words") + hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") + lr = LogisticRegression(maxIter=10, regParam=0.001) + pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) + model = pipeline.fit(training) + # log parameters + mlflow.log_param("max_iter", 10) + mlflow.log_param("reg_param", 0.001) + # log the Spark MLlib model in MLeap format + mlflow.mleap.log_model(spark_model=model, sample_input=test_df, artifact_path="mleap-model") """ return Model.log(artifact_path=artifact_path, flavor=mlflow.mleap, spark_model=spark_model, sample_input=sample_input, diff --git a/mlflow/models/docker_utils.py b/mlflow/models/docker_utils.py index c78c4af7c7f3d..745c39a60b293 100644 --- a/mlflow/models/docker_utils.py +++ b/mlflow/models/docker_utils.py @@ -29,7 +29,7 @@ && rm -rf /var/lib/apt/lists/* # Download and setup miniconda -RUN curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh +RUN curl -L https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh >> miniconda.sh RUN bash ./miniconda.sh -b -p /miniconda; rm ./miniconda.sh; ENV PATH="/miniconda/bin:$PATH" ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index b90c61bdb2f5e..2de72257c8b91 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -65,7 +65,9 @@ .. rubric:: Example ->>> tree example/sklearn_iris/mlruns/run1/outputs/linear-lr +:: + + tree example/sklearn_iris/mlruns/run1/outputs/linear-lr :: @@ -77,7 +79,9 @@ │   └── model.pkl └── mlflow_env.yml ->>> cat example/sklearn_iris/mlruns/run1/outputs/linear-lr/MLmodel +:: + + cat example/sklearn_iris/mlruns/run1/outputs/linear-lr/MLmodel :: @@ -341,8 +345,8 @@ def spark_udf(spark, model_uri, result_type="double"): """ A Spark UDF that can be used to invoke the Python function formatted model. - Parameters passed to the UDF are forwarded to the model as a DataFrame where the names are - ordinals (0, 1, ...). On some versions of Spark, it is also possible to wrap the input in a + Parameters passed to the UDF are forwarded to the model as a DataFrame where the column names + are ordinals (0, 1, ...). On some versions of Spark, it is also possible to wrap the input in a struct. In that case, the data will be passed as a DataFrame with column names given by the struct definition (e.g. when invoked as my_udf(struct('x', 'y'), the model will ge the data as a pandas DataFrame with 2 columns 'x' and 'y'). @@ -352,8 +356,11 @@ def spark_udf(spark, model_uri, result_type="double"): converted to string. If the result type is not an array type, the left most column with matching type is returned. - >>> predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model") - >>> df.withColumn("prediction", predict("name", "age")).show() + .. code-block:: python + :caption: Example + + predict = mlflow.pyfunc.spark_udf(spark, "/my/local/model") + df.withColumn("prediction", predict("name", "age")).show() :param spark: A SparkSession object. :param model_uri: The location, in URI format, of the MLflow model with the diff --git a/mlflow/pytorch/__init__.py b/mlflow/pytorch/__init__.py index 5092decf8b69a..f22077576673a 100644 --- a/mlflow/pytorch/__init__.py +++ b/mlflow/pytorch/__init__.py @@ -107,47 +107,50 @@ def log_model(pytorch_model, artifact_path, conda_env=None, code_paths=None, registered model if one with the given name does not exist. :param kwargs: kwargs to pass to ``torch.save`` method. - >>> import torch - >>> import mlflow - >>> import mlflow.pytorch - >>> # X data - >>> x_data = torch.Tensor([[1.0], [2.0], [3.0]]) - >>> # Y data with its expected value: labels - >>> y_data = torch.Tensor([[2.0], [4.0], [6.0]]) - >>> # Partial Model example modified from Sung Kim - >>> # https://github.com/hunkim/PyTorchZeroToAll - >>> class Model(torch.nn.Module): - >>> def __init__(self): - >>> super(Model, self).__init__() - >>> self.linear = torch.nn.Linear(1, 1) # One in and one out - >>> def forward(self, x): - >>> y_pred = self.linear(x) - >>> return y_pred - >>> # our model - >>> model = Model() - >>> criterion = torch.nn.MSELoss(size_average=False) - >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - >>> # Training loop - >>> for epoch in range(500): - >>> # Forward pass: Compute predicted y by passing x to the model - >>> y_pred = model(x_data) - >>> # Compute and print loss - >>> loss = criterion(y_pred, y_data) - >>> print(epoch, loss.data.item()) - >>> #Zero gradients, perform a backward pass, and update the weights. - >>> optimizer.zero_grad() - >>> loss.backward() - >>> optimizer.step() - >>> - >>> # After training - >>> for hv in [4.0, 5.0, 6.0]: - >>> hour_var = torch.Tensor([[hv]]) - >>> y_pred = model(hour_var) - >>> print("predict (after training)", hv, model(hour_var).data[0][0]) - >>> # log the model - >>> with mlflow.start_run() as run: - >>> mlflow.log_param("epochs", 500) - >>> mlflow.pytorch.log_model(model, "models") + .. code-block:: python + :caption: Example + + import torch + import mlflow + import mlflow.pytorch + # X data + x_data = torch.Tensor([[1.0], [2.0], [3.0]]) + # Y data with its expected value: labels + y_data = torch.Tensor([[2.0], [4.0], [6.0]]) + # Partial Model example modified from Sung Kim + # https://github.com/hunkim/PyTorchZeroToAll + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.linear = torch.nn.Linear(1, 1) # One in and one out + def forward(self, x): + y_pred = self.linear(x) + return y_pred + # our model + model = Model() + criterion = torch.nn.MSELoss(size_average=False) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + # Training loop + for epoch in range(500): + # Forward pass: Compute predicted y by passing x to the model + y_pred = model(x_data) + # Compute and print loss + loss = criterion(y_pred, y_data) + print(epoch, loss.data.item()) + #Zero gradients, perform a backward pass, and update the weights. + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # After training + for hv in [4.0, 5.0, 6.0]: + hour_var = torch.Tensor([[hv]]) + y_pred = model(hour_var) + print("predict (after training)", hv, model(hour_var).data[0][0]) + # log the model + with mlflow.start_run() as run: + mlflow.log_param("epochs", 500) + mlflow.pytorch.log_model(model, "models") """ pickle_module = pickle_module or mlflow_pytorch_pickle_module Model.log(artifact_path=artifact_path, flavor=mlflow.pytorch, pytorch_model=pytorch_model, @@ -197,20 +200,23 @@ def save_model(pytorch_model, path, conda_env=None, mlflow_model=Model(), code_p deserialize ("unpickle") the PyTorch model at load time. :param kwargs: kwargs to pass to ``torch.save`` method. - >>> import torch - >>> import mlflow - >>> import mlflow.pytorch - >>> # create model and set values - >>> pytorch_model = Model() - >>> pytorch_model_path = ... - >>> #train our model - >>> for epoch in range(500): - >>> y_pred = model(x_data) - >>> ... - >>> #save the model - >>> with mlflow.start_run() as run: - >>> mlflow.log_param("epochs", 500) - >>> mlflow.pytorch.save_model(pytorch_model, pytorch_model_path) + .. code-block:: python + :caption: Example + + import torch + import mlflow + import mlflow.pytorch + # Create model and set values + pytorch_model = Model() + pytorch_model_path = ... + # train our model + for epoch in range(500): + y_pred = pytorch_model(x_data) + ... + # Save the model + with mlflow.start_run() as run: + mlflow.log_param("epochs", 500) + mlflow.pytorch.save_model(pytorch_model, pytorch_model_path) """ import torch pickle_module = pickle_module or mlflow_pytorch_pickle_module @@ -323,14 +329,17 @@ def load_model(model_uri, **kwargs): :param kwargs: kwargs to pass to ``torch.load`` method. :return: A PyTorch model. - >>> import torch - >>> import mlflow - >>> import mlflow.pytorch - >>> # set values - >>> model_path_dir = ... - >>> run_id="96771d893a5e46159d9f3b49bf9013e2" - >>> pytorch_model = mlflow.pytorch.load_model("runs:/" + run_id + "/" + model_path_dir) - >>> y_pred = pytorch_model(x_new_data) + .. code-block:: python + :caption: Example + + import torch + import mlflow + import mlflow.pytorch + # Set values + model_path_dir = ... + run_id = "96771d893a5e46159d9f3b49bf9013e2" + pytorch_model = mlflow.pytorch.load_model("runs:/" + run_id + "/" + model_path_dir) + y_pred = pytorch_model(x_new_data) """ import torch diff --git a/mlflow/sagemaker/__init__.py b/mlflow/sagemaker/__init__.py index e5a6a1da2c7cc..a284e7e09648e 100644 --- a/mlflow/sagemaker/__init__.py +++ b/mlflow/sagemaker/__init__.py @@ -233,18 +233,19 @@ def deploy(app_name, model_uri, execution_role_arn=None, bucket=None, #SageMaker.Client.create_model>`_. For more information, see https://docs.aws.amazon.com/sagemaker/latest/dg/API_VpcConfig.html. - Example: - - >>> import mlflow.sagemaker as mfs - >>> vpc_config = { - ... 'SecurityGroupIds': [ - ... 'sg-123456abc', - ... ], - ... 'Subnets': [ - ... 'subnet-123456abc', - ... ] - ... } - >>> mfs.deploy(..., vpc_config=vpc_config) + .. code-block:: python + :caption: Example + + import mlflow.sagemaker as mfs + vpc_config = { + 'SecurityGroupIds': [ + 'sg-123456abc', + ], + 'Subnets': [ + 'subnet-123456abc', + ] + } + mfs.deploy(..., vpc_config=vpc_config) :param flavor: The name of the flavor of the model to use for deployment. Must be either ``None`` or one of mlflow.sagemaker.SUPPORTED_DEPLOYMENT_FLAVORS. If ``None``, diff --git a/mlflow/server/js/src/common/utils/Utils.js b/mlflow/server/js/src/common/utils/Utils.js index 38802e15bb7f2..c5670abbc5726 100644 --- a/mlflow/server/js/src/common/utils/Utils.js +++ b/mlflow/server/js/src/common/utils/Utils.js @@ -433,7 +433,7 @@ class Utils { selectedMetricKeys: [], showPoint: false, yAxisLogScale: false, - lineSmoothness: 0, + lineSmoothness: 1, layout: {}, }; const params = qs.parse(search.slice(1, search.length)); diff --git a/mlflow/server/js/src/experiment-tracking/components/LineSmoothSlider.js b/mlflow/server/js/src/experiment-tracking/components/LineSmoothSlider.js index 06f0431362194..02c4318e7df7e 100644 --- a/mlflow/server/js/src/experiment-tracking/components/LineSmoothSlider.js +++ b/mlflow/server/js/src/experiment-tracking/components/LineSmoothSlider.js @@ -34,8 +34,8 @@ export class LineSmoothSlider extends React.Component { min={min} max={max} onChange={this.onChange} - value={typeof inputValue === 'number' ? inputValue : 0} - step={0.01} + value={typeof inputValue === 'number' ? inputValue : 1} + step={1} /> @@ -43,7 +43,7 @@ export class LineSmoothSlider extends React.Component { min={min} max={max} style={{ marginLeft: 16 }} - step={0.01} + step={1} value={inputValue} onChange={this.onChange} /> diff --git a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.js b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.js index 7bd59a14def5f..8060bc95314d3 100644 --- a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.js +++ b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.js @@ -9,6 +9,7 @@ const RadioGroup = Radio.Group; export const X_AXIS_WALL = 'wall'; export const X_AXIS_STEP = 'step'; export const X_AXIS_RELATIVE = 'relative'; +export const MAX_LINE_SMOOTHNESS = 100; export class MetricsPlotControls extends React.Component { static propTypes = { @@ -43,7 +44,7 @@ export class MetricsPlotControls extends React.Component { render() { const { chartType, yAxisLogScale, initialLineSmoothness, showPoint } = this.props; const lineSmoothnessTooltipText = - 'Make the line between points "smoother" based on generalized Catmull-Rom splines. ' + + 'Make the line between points "smoother" based on Exponential Moving Average. ' + 'Smoothing can be useful for displaying the ' + 'overall trend when the logging frequency is high.'; return ( @@ -69,9 +70,9 @@ export class MetricsPlotControls extends React.Component { diff --git a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.test.js b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.test.js index db2b187f01b6f..1bb6a9c2f3fd8 100644 --- a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.test.js +++ b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotControls.test.js @@ -15,7 +15,7 @@ describe('unit tests', () => { handleYAxisLogScaleChange: jest.fn(), handleLineSmoothChange: jest.fn(), chartType: CHART_TYPE_LINE, - initialLineSmoothness: 0, + initialLineSmoothness: 1, yAxisLogScale: true, xAxis: X_AXIS_RELATIVE, onLayoutChange: jest.fn(), diff --git a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotPanel.js b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotPanel.js index ad5c20b2b78e9..72b4ed72b9386 100644 --- a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotPanel.js +++ b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotPanel.js @@ -413,14 +413,13 @@ export class MetricsPlotPanel extends React.Component { const { popoverVisible, popoverX, popoverY } = this.state; const { points, event: { clientX, clientY } } = data; const samePointClicked = popoverX === clientX && popoverY === clientY; - const runItems = points .sort((a, b) => b.y - a.y) .map(point => ({ runId: point.data.runId, name: point.data.name, color: point.fullData.marker.color, - y: point.y, + y: point.text, })); this.setState({ diff --git a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.js b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.js index 96fee5aed8922..c9e860a239e43 100644 --- a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.js +++ b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.js @@ -2,12 +2,35 @@ import React from 'react'; import Utils from '../../common/utils/Utils'; import _ from 'lodash'; import PropTypes from 'prop-types'; -import { X_AXIS_STEP, X_AXIS_RELATIVE } from './MetricsPlotControls'; +import { X_AXIS_STEP, X_AXIS_RELATIVE, MAX_LINE_SMOOTHNESS } from './MetricsPlotControls'; import { CHART_TYPE_BAR } from './MetricsPlotPanel'; import Plot from '../../../node_modules/react-plotly.js/react-plotly'; const MAX_RUN_NAME_DISPLAY_LENGTH = 36; +const EMA = (mArray, smoothingWeight) => { + // If all elements in the set of metric values are constant, or if + // the degree of smoothing is set to the minimum value, return the + // original set of metric values + if (smoothingWeight <= 1 || mArray.every((v) => v === mArray[0])) { + return mArray; + } + + const smoothness = smoothingWeight / (MAX_LINE_SMOOTHNESS + 1); + const smoothedArray = []; + let biasedElement = 0; + for (let i = 0; i < mArray.length; i++) { + biasedElement = (biasedElement * smoothness) + ((1 - smoothness) * mArray[i]); + // To avoid biasing earlier elements toward smaller-than-accurate values, we divide + // all elements by a `debiasedWeight` that asymptotically increases and approaches + // 1 as the element index increases + const debiasWeight = 1.0 - Math.pow(smoothness, i + 1); + const debiasedElement = biasedElement / debiasWeight; + smoothedArray.push(debiasedElement); + } + return smoothedArray; +}; + export class MetricsPlotView extends React.Component { static propTypes = { runUuids: PropTypes.arrayOf(String).isRequired, @@ -61,11 +84,13 @@ export class MetricsPlotView extends React.Component { } return MetricsPlotView.parseTimestamp(entry.timestamp, history, xAxis); }), - y: history.map((entry) => entry.value), + y: EMA(history.map((entry) => entry.value), lineSmoothness), + text: history.map((entry) => entry.value.toFixed(5)), type: 'scatter', mode: isSingleHistory ? 'markers' : 'lines+markers', - line: { shape: 'spline', smoothing: lineSmoothness }, marker: {opacity: isSingleHistory || showPoint ? 1 : 0 }, + hovertemplate: (isSingleHistory || (lineSmoothness === 1)) ? + '%{y}' : 'Value: %{text}
Smoothed: %{y}', visible: visible, runId: runUuid, metricName: metricKey, diff --git a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.test.js b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.test.js index 14e197c98c40c..82a5d19e6c1e2 100644 --- a/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.test.js +++ b/mlflow/server/js/src/experiment-tracking/components/MetricsPlotView.test.js @@ -36,7 +36,7 @@ const metricsForLine = [ timestamp: 1556662043000, }, { - key: 'metric_0', + key: 'metric_1', value: 400, step: 1, timestamp: 1556662044000, @@ -80,6 +80,7 @@ describe('unit tests', () => { let wrapper; let instance; let minimalPropsForLineChart; + let minimalPropsForSmoothedLineChart; let minimalPropsForBarChart; beforeEach(() => { @@ -93,13 +94,17 @@ describe('unit tests', () => { chartType: CHART_TYPE_LINE, isComparing: false, yAxisLogScale: false, - lineSmoothness: 0, + lineSmoothness: 1, onClick: jest.fn(), onLayoutChange: jest.fn(), onLegendDoubleClick: jest.fn(), onLegendClick: jest.fn(), deselectedCurves: [], }; + minimalPropsForSmoothedLineChart = { + ...minimalPropsForLineChart, + lineSmoothness: 50, + }; minimalPropsForBarChart = { ...minimalPropsForLineChart, metrics: metricsForBar, @@ -124,13 +129,11 @@ describe('unit tests', () => { runId: 'runUuid1', x: [0, 1], y: [100, 200], + text: ['100.00000', '200.00000'], type: 'scatter', visible: true, mode: 'lines+markers', - line: { - shape: 'spline', - smoothing: 0, - }, + hovertemplate: '%{y}', marker: {"opacity": 0}, }, { @@ -139,13 +142,47 @@ describe('unit tests', () => { runId: 'runUuid2', x: [0, 1], y: [300, 400], + text: ['300.00000', '400.00000'], + type: 'scatter', + visible: true, + mode: 'lines+markers', + hovertemplate: '%{y}', + marker: {"opacity": 0}, + }, + ], + layout: {}, + }); + }); + + test('getPlotPropsForLineChart(lineSmoothness = 50)', () => { + wrapper = shallow(); + instance = wrapper.instance(); + expect(instance.getPlotPropsForLineChart()).toEqual({ + data: [ + { + metricName: 'metric_0', + name: 'metric_0', + runId: 'runUuid1', + x: [0, 1], + y: [100, 166.88741721854302], + text: ['100.00000', '200.00000'], + type: 'scatter', + visible: true, + mode: 'lines+markers', + hovertemplate: 'Value: %{text}
Smoothed: %{y}', + marker: {"opacity": 0}, + }, + { + metricName: 'metric_1', + name: 'metric_1', + runId: 'runUuid2', + x: [0, 1], + y: [300, 366.887417218543], + text: ['300.00000', '400.00000'], type: 'scatter', visible: true, mode: 'lines+markers', - line: { - shape: 'spline', - smoothing: 0, - }, + hovertemplate: 'Value: %{text}
Smoothed: %{y}', marker: {"opacity": 0}, }, ], diff --git a/mlflow/server/js/src/experiment-tracking/routes.js b/mlflow/server/js/src/experiment-tracking/routes.js index bfb4654ca7c4c..deadb164c1f97 100644 --- a/mlflow/server/js/src/experiment-tracking/routes.js +++ b/mlflow/server/js/src/experiment-tracking/routes.js @@ -44,7 +44,7 @@ class Routes { */ static getMetricPageRoute(runUuids, metricKey, experimentId, plotMetricKeys = null, plotLayout = {}, selectedXAxis = X_AXIS_RELATIVE, yAxisLogScale = false, - lineSmoothness = 0, showPoint = false, deselectedCurves = [], + lineSmoothness = 1, showPoint = false, deselectedCurves = [], lastLinearYAxisRange = []) { // If runs to display are specified (e.g. if user filtered to specific runs in a metric // comparison plot), embed them in the URL, otherwise default to metricKey diff --git a/mlflow/spacy.py b/mlflow/spacy.py new file mode 100644 index 0000000000000..0195db5a62f71 --- /dev/null +++ b/mlflow/spacy.py @@ -0,0 +1,209 @@ +""" +The ``mlflow.spacy`` module provides an API for logging and loading spaCy models. +This module exports spacy models with the following flavors: + +spaCy (native) format + This is the main flavor that can be loaded back into spaCy. +:py:mod:`mlflow.pyfunc` + Produced for use by generic pyfunc-based deployment tools and batch inference. +""" + +from __future__ import absolute_import + +import logging +import os + +import pandas as pd +import yaml + +import mlflow +from mlflow import pyfunc +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils.environment import _mlflow_conda_env +from mlflow.utils.model_utils import _get_flavor_configuration + +FLAVOR_NAME = "spacy" + +_logger = logging.getLogger(__name__) + + +def get_default_conda_env(): + """ + :return: The default Conda environment for MLflow Models produced by calls to + :func:`save_model()` and :func:`log_model()`. + """ + import spacy + + return _mlflow_conda_env( + additional_conda_deps=None, + additional_pip_deps=[ + "spacy=={}".format(spacy.__version__), + ], + additional_conda_channels=None) + + +def save_model(spacy_model, path, conda_env=None, mlflow_model=Model()): + """ + Save a spaCy model to a path on the local file system. + + :param spacy_model: spaCy model to be saved. + :param path: Local path where the model is to be saved. + :param conda_env: Either a dictionary representation of a Conda environment or the path to a + Conda environment yaml file. If provided, this describes the environment + this model should be run in. At minimum, it should specify the dependencies + contained in :func:`get_default_conda_env()`. If ``None``, the default + :func:`get_default_conda_env()` environment is added to the model. + The following is an *example* dictionary representation of a Conda + environment:: + + { + 'name': 'mlflow-env', + 'channels': ['defaults'], + 'dependencies': [ + 'python=3.7.0', + 'pip': [ + 'spacy==2.2.3' + ] + ] + } + + :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to. + """ + import spacy + + path = os.path.abspath(path) + if os.path.exists(path): + raise MlflowException("Unable to save MLflow model to {path} - path '{path}' " + "already exists".format(path=path)) + + model_data_subpath = "model.spacy" + model_data_path = os.path.join(path, model_data_subpath) + os.makedirs(model_data_path) + + # Save spacy-model + spacy_model.to_disk(path=model_data_path) + + conda_env_subpath = "conda.yaml" + if conda_env is None: + conda_env = get_default_conda_env() + elif not isinstance(conda_env, dict): + with open(conda_env, "r") as f: + conda_env = yaml.safe_load(f) + with open(os.path.join(path, conda_env_subpath), "w") as f: + yaml.safe_dump(conda_env, stream=f, default_flow_style=False) + + # Save the pyfunc flavor if at least one text categorizer in spaCy pipeline + if any([isinstance(pipe_component[1], spacy.pipeline.TextCategorizer) + for pipe_component in spacy_model.pipeline]): + pyfunc.add_to_model(mlflow_model, loader_module="mlflow.spacy", + data=model_data_subpath, env=conda_env_subpath) + else: + _logger.warning( + "Generating only the spacy flavor for the provided spacy model. This means the model " + "can be loaded back via `mlflow.spacy.load_model`, but cannot be loaded back using " + "pyfunc APIs like `mlflow.pyfunc.load_model` or via the `mlflow models` CLI commands. " + "MLflow will only generate the pyfunc flavor for spacy models containing a pipeline " + "component that is an instance of spacy.pipeline.TextCategorizer.") + + mlflow_model.add_flavor(FLAVOR_NAME, spacy_version=spacy.__version__, data=model_data_subpath) + mlflow_model.save(os.path.join(path, "MLmodel")) + + +def log_model(spacy_model, artifact_path, conda_env=None, registered_model_name=None, **kwargs): + """ + Log a spaCy model as an MLflow artifact for the current run. + + :param spacy_model: spaCy model to be saved. + :param artifact_path: Run-relative artifact path. + :param conda_env: Either a dictionary representation of a Conda environment or the path to a + Conda environment yaml file. If provided, this decsribes the environment + this model should be run in. At minimum, it should specify the dependencies + contained in :func:`get_default_conda_env()`. If ``None``, the default + :func:`get_default_conda_env()` environment is added to the model. + The following is an *example* dictionary representation of a Conda + environment:: + + { + 'name': 'mlflow-env', + 'channels': ['defaults'], + 'dependencies': [ + 'python=3.7.0', + 'pip': [ + 'spacy==2.2.3' + ] + ] + } + :param registered_model_name: Note:: Experimental: This argument may change or be removed in a + future release without warning. If given, create a model + version under ``registered_model_name``, also creating a + registered model if one with the given name does not exist. + :param kwargs: kwargs to pass to ``spacy.save_model`` method. + """ + Model.log(artifact_path=artifact_path, flavor=mlflow.spacy, + registered_model_name=registered_model_name, + spacy_model=spacy_model, conda_env=conda_env, **kwargs) + + +def _load_model(path): + import spacy + + path = os.path.abspath(path) + return spacy.load(path) + + +class _SpacyModelWrapper: + def __init__(self, spacy_model): + self.spacy_model = spacy_model + + def predict(self, dataframe): + """ + Only works for predicting using text categorizer. + Not suitable for other pipeline components (e.g: parser) + :param dataframe: pandas dataframe containing texts to be categorized + expected shape is (n_rows,1 column) + :return: dataframe with predictions + """ + if len(dataframe.columns) != 1: + raise MlflowException('Shape of input dataframe must be (n_rows, 1column)') + + return pd.DataFrame({ + 'predictions': dataframe.ix[:, 0].apply(lambda text: self.spacy_model(text).cats) + }) + + +def _load_pyfunc(path): + """ + Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``. + + :param path: Local filesystem path to the MLflow Model with the ``spacy`` flavor. + """ + return _SpacyModelWrapper(_load_model(path)) + + +def load_model(model_uri): + """ + Load a spaCy model from a local file (if ``run_id`` is ``None``) or a run. + + :param model_uri: The location, in URI format, of the MLflow model. For example: + + - ``/Users/me/path/to/local/model`` + - ``relative/path/to/local/model`` + - ``s3://my_bucket/path/to/model`` + - ``runs://run-relative/path/to/model`` + - ``models://`` + - ``models://`` + + For more information about supported URI schemes, see + `Referencing Artifacts `_. + + :return: A spaCy loaded model + """ + local_model_path = _download_artifact_from_uri(artifact_uri=model_uri) + flavor_conf = _get_flavor_configuration(model_path=local_model_path, flavor_name=FLAVOR_NAME) + # Flavor configurations for models saved in MLflow version <= 0.8.0 may not contain a + # `data` key; in this case, we assume the model artifact path to be `model.spacy` + spacy_model_file_path = os.path.join(local_model_path, flavor_conf.get("data", "model.spacy")) + return _load_model(path=spacy_model_file_path) diff --git a/mlflow/spark.py b/mlflow/spark.py index 124a030f79b9f..3d5f1d046b5b1 100644 --- a/mlflow/spark.py +++ b/mlflow/spark.py @@ -119,20 +119,23 @@ def log_model(spark_model, artifact_path, conda_env=None, dfs_tmpdir=None, version under ``registered_model_name``, also creating a registered model if one with the given name does not exist. - >>> from pyspark.ml import Pipeline - >>> from pyspark.ml.classification import LogisticRegression - >>> from pyspark.ml.feature import HashingTF, Tokenizer - >>> training = spark.createDataFrame([ - ... (0, "a b c d e spark", 1.0), - ... (1, "b d", 0.0), - ... (2, "spark f g h", 1.0), - ... (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) - >>> tokenizer = Tokenizer(inputCol="text", outputCol="words") - >>> hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") - >>> lr = LogisticRegression(maxIter=10, regParam=0.001) - >>> pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) - >>> model = pipeline.fit(training) - >>> mlflow.spark.log_model(model, "spark-model") + .. code-block:: python + :caption: Example + + from pyspark.ml import Pipeline + from pyspark.ml.classification import LogisticRegression + from pyspark.ml.feature import HashingTF, Tokenizer + training = spark.createDataFrame([ + (0, "a b c d e spark", 1.0), + (1, "b d", 0.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 0.0) ], ["id", "text", "label"]) + tokenizer = Tokenizer(inputCol="text", outputCol="words") + hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") + lr = LogisticRegression(maxIter=10, regParam=0.001) + pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) + model = pipeline.fit(training) + mlflow.spark.log_model(model, "spark-model") """ from py4j.protocol import Py4JJavaError @@ -360,12 +363,15 @@ def save_model(spark_model, path, mlflow_model=Model(), conda_env=None, This must be a PySpark DataFrame that the model can evaluate. If ``sample_input`` is ``None``, the MLeap flavor is not added. - >>> from mlflow import spark - >>> from pyspark.ml.pipeline.PipelineModel - >>> - >>> #your pyspark.ml.pipeline.PipelineModel type - >>> model = ... - >>> mlflow.spark.save_model(model, "spark-model") + .. code-block:: python + :caption: Example + + from mlflow import spark + from pyspark.ml.pipeline.PipelineModel + + # your pyspark.ml.pipeline.PipelineModel type + model = ... + mlflow.spark.save_model(model, "spark-model") """ _validate_model(spark_model) from pyspark.ml import PipelineModel @@ -417,16 +423,19 @@ def load_model(model_uri, dfs_tmpdir=None): destination. Defaults to ``/tmp/mlflow``. :return: pyspark.ml.pipeline.PipelineModel - >>> from mlflow import spark - >>> model = mlflow.spark.load_model("spark-model") - >>> # Prepare test documents, which are unlabeled (id, text) tuples. - >>> test = spark.createDataFrame([ - ... (4, "spark i j k"), - ... (5, "l m n"), - ... (6, "spark hadoop spark"), - ... (7, "apache hadoop")], ["id", "text"]) - >>> # Make predictions on test documents. - >>> prediction = model.transform(test) + .. code-block:: python + :caption: Example + + from mlflow import spark + model = mlflow.spark.load_model("spark-model") + # Prepare test documents, which are unlabeled (id, text) tuples. + test = spark.createDataFrame([ + (4, "spark i j k"), + (5, "l m n"), + (6, "spark hadoop spark"), + (7, "apache hadoop")], ["id", "text"]) + # Make predictions on test documents + prediction = model.transform(test) """ if RunsArtifactRepository.is_runs_uri(model_uri): runs_uri = model_uri @@ -498,33 +507,37 @@ def autolog(): exists, datasource information is cached in memory & logged to the next-created active run (but not to successive runs). Note that autologging of Spark ML (MLlib) models is not currently supported via this API. Datasource-autologging is best-effort, meaning that if Spark is under - heavy load or MLflow logging fails for any reason (e.g. if the MLflow server is unavailable), + heavy load or MLflow logging fails for any reason (e.g., if the MLflow server is unavailable), logging may be dropped. For any unexpected issues with autologging, check Spark driver and executor logs in addition to stderr & stdout generated from your MLflow code - datasource information is pulled from Spark, so logs relevant to debugging may show up amongst the Spark logs. - >>> import mlflow.spark - >>> from pyspark.sql import SparkSession - >>> # Create and persist some dummy data - >>> spark = SparkSession.builder\ - >>> .config("spark.jars.packages", "org.mlflow.mlflow-spark").getOrCreate() - >>> df = spark.createDataFrame([ - ... (4, "spark i j k"), - ... (5, "l m n"), - ... (6, "spark hadoop spark"), - ... (7, "apache hadoop")], ["id", "text"]) - >>> import tempfile - >>> tempdir = tempfile.mkdtemp() - >>> df.write.format("csv").save(tempdir) - >>> # Enable Spark datasource autologging. - >>> mlflow.spark.autolog() - >>> loaded_df = spark.read.format("csv").load(tempdir) - >>> # Call collect() to trigger a read of the Spark datasource. Datasource info - >>> # (path and format)is automatically logged to an MLflow run. - >>> loaded_df.collect() - >>> shutil.rmtree(tempdir) # clean up tempdir + .. code-block:: python + :caption: Example + + import mlflow.spark + from pyspark.sql import SparkSession + # Create and persist some dummy data + spark = (SparkSession.builder + .config("spark.jars.packages", "org.mlflow.mlflow-spark") + .getOrCreate()) + df = spark.createDataFrame([ + (4, "spark i j k"), + (5, "l m n"), + (6, "spark hadoop spark"), + (7, "apache hadoop")], ["id", "text"]) + import tempfile + tempdir = tempfile.mkdtemp() + df.write.format("csv").save(tempdir) + # Enable Spark datasource autologging. + mlflow.spark.autolog() + loaded_df = spark.read.format("csv").load(tempdir) + # Call collect() to trigger a read of the Spark datasource. Datasource info + # (path and format)is automatically logged to an MLflow run. + loaded_df.collect() + shutil.rmtree(tempdir) # clean up tempdir """ from mlflow import _spark_autologging _spark_autologging.autolog() diff --git a/mlflow/tensorflow.py b/mlflow/tensorflow.py index 604616453b975..21ec769b89e86 100644 --- a/mlflow/tensorflow.py +++ b/mlflow/tensorflow.py @@ -251,17 +251,20 @@ def load_model(model_uri, tf_sess=None): For TensorFlow >= 2.0.0, A callable graph (tf.function) that takes inputs and returns inferences. - >>> import mlflow.tensorflow - >>> import tensorflow as tf - >>> tf_graph = tf.Graph() - >>> tf_sess = tf.Session(graph=tf_graph) - >>> with tf_graph.as_default(): - >>> signature_definition = mlflow.tensorflow.load_model(model_uri="model_uri", - >>> tf_sess=tf_sess) - >>> input_tensors = [tf_graph.get_tensor_by_name(input_signature.name) - >>> for _, input_signature in signature_definition.inputs.items()] - >>> output_tensors = [tf_graph.get_tensor_by_name(output_signature.name) - >>> for _, output_signature in signature_definition.outputs.items()] + .. code-block:: python + :caption: Example + + import mlflow.tensorflow + import tensorflow as tf + tf_graph = tf.Graph() + tf_sess = tf.Session(graph=tf_graph) + with tf_graph.as_default(): + signature_definition = mlflow.tensorflow.load_model(model_uri="model_uri", + tf_sess=tf_sess) + input_tensors = [tf_graph.get_tensor_by_name(input_signature.name) + for _, input_signature in signature_definition.inputs.items()] + output_tensors = [tf_graph.get_tensor_by_name(output_signature.name) + for _, output_signature in signature_definition.outputs.items()] """ if LooseVersion(tensorflow.__version__) < LooseVersion('2.0.0'): @@ -482,8 +485,6 @@ def on_train_begin(self, logs=None): # pylint: disable=unused-argument sum_list = [] self.model.summary(print_fn=sum_list.append) summary = '\n'.join(sum_list) - try_mlflow_log(mlflow.set_tag, 'model_summary', summary) - tempdir = tempfile.mkdtemp() try: summary_file = os.path.join(tempdir, "model_summary.txt") @@ -522,8 +523,6 @@ def on_train_begin(self, logs=None): # pylint: disable=unused-argument sum_list = [] self.model.summary(print_fn=sum_list.append) summary = '\n'.join(sum_list) - try_mlflow_log(mlflow.set_tag, 'model_summary', summary) - tempdir = tempfile.mkdtemp() try: summary_file = os.path.join(tempdir, "model_summary.txt") diff --git a/tests/keras_autolog/test_keras_autolog.py b/tests/keras_autolog/test_keras_autolog.py index 2a38eb8ab73e8..bb0d7952a47ad 100644 --- a/tests/keras_autolog/test_keras_autolog.py +++ b/tests/keras_autolog/test_keras_autolog.py @@ -130,8 +130,6 @@ def test_keras_autolog_logs_expected_data(keras_random_data_run): assert data.params['optimizer_name'] == 'Adam' assert 'epsilon' in data.params assert data.params['epsilon'] == '1e-07' - assert 'model_summary' in data.tags - assert 'Total params: 6,922' in data.tags['model_summary'] client = mlflow.tracking.MlflowClient() artifacts = client.list_artifacts(keras_random_data_run.info.run_id) artifacts = map(lambda x: x.path, artifacts) diff --git a/tests/models/test_cli.py b/tests/models/test_cli.py index 550e0921644a8..ca6175cdf428a 100644 --- a/tests/models/test_cli.py +++ b/tests/models/test_cli.py @@ -127,6 +127,7 @@ def test_model_with_no_deployable_flavors_fails_pollitely(): assert "No suitable flavor backend was found for the model." in stderr +@pytest.mark.large def test_serve_gunicorn_opts(iris_data, sk_model): if sys.platform == "win32": pytest.skip("This test requires gunicorn which is not available on windows.") @@ -159,6 +160,7 @@ def test_serve_gunicorn_opts(iris_data, sk_model): assert expected_command_pattern.search(stdout) is not None +@pytest.mark.large def test_predict(iris_data, sk_model): with TempDir(chdr=True) as tmp: with mlflow.start_run() as active_run: @@ -245,6 +247,7 @@ def test_predict(iris_data, sk_model): assert all(expected == actual) +@pytest.mark.large def test_prepare_env_passes(sk_model): if no_conda: pytest.skip("This test requires conda.") @@ -270,6 +273,7 @@ def test_prepare_env_passes(sk_model): assert p.wait() == 0 +@pytest.mark.large def test_prepare_env_fails(sk_model): if no_conda: pytest.skip("This test requires conda.") diff --git a/tests/projects/test_docker_projects.py b/tests/projects/test_docker_projects.py index 1e558759da979..24368b248aad3 100644 --- a/tests/projects/test_docker_projects.py +++ b/tests/projects/test_docker_projects.py @@ -26,6 +26,7 @@ def _build_uri(base_uri, subdirectory): @pytest.mark.parametrize("use_start_run", map(str, [0, 1])) +@pytest.mark.large def test_docker_project_execution( use_start_run, tmpdir, docker_example_base_image): # pylint: disable=unused-argument @@ -68,6 +69,7 @@ def test_docker_project_execution( ("databricks://some-profile", "-e MLFLOW_TRACKING_URI=databricks ") ]) @mock.patch('databricks_cli.configure.provider.ProfileConfigProvider') +@pytest.mark.large def test_docker_project_tracking_uri_propagation( ProfileConfigProvider, tmpdir, tracking_uri, expected_command_segment, docker_example_base_image): # pylint: disable=unused-argument diff --git a/tests/projects/test_projects.py b/tests/projects/test_projects.py index 6297a31345eb7..d69cb56ef37a9 100644 --- a/tests/projects/test_projects.py +++ b/tests/projects/test_projects.py @@ -189,6 +189,7 @@ def test_invalid_run_mode(): mlflow.projects.run(uri=TEST_PROJECT_DIR, backend="some unsupported mode") +@pytest.mark.large def test_use_conda(): """ Verify that we correctly handle the `use_conda` argument.""" # Verify we throw an exception when conda is unavailable @@ -207,6 +208,7 @@ def test_use_conda(): os.environ["CONDA_EXE"] = conda_exe_path +@pytest.mark.large def test_expected_tags_logged_when_using_conda(): with mock.patch.object(mlflow.tracking.MlflowClient, "set_tag") as tag_mock: try: diff --git a/tests/spacy/__init__.py b/tests/spacy/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/spacy/test_spacy_model_export.py b/tests/spacy/test_spacy_model_export.py new file mode 100644 index 0000000000000..6a48aea5e631c --- /dev/null +++ b/tests/spacy/test_spacy_model_export.py @@ -0,0 +1,264 @@ +import os +import random +from collections import namedtuple + +import pandas as pd +import pytest +import six +import spacy +import yaml +from spacy.util import compounding, minibatch + +import mlflow.spacy +from sklearn.datasets import fetch_20newsgroups + +from mlflow import pyfunc +from mlflow.exceptions import MlflowException +from mlflow.models import Model +from mlflow.tracking.artifact_utils import _download_artifact_from_uri +from mlflow.utils.environment import _mlflow_conda_env +from mlflow.utils.file_utils import TempDir +from mlflow.utils.model_utils import _get_flavor_configuration +from tests.conftest import tracking_uri_mock # pylint: disable=unused-import, E0611 + +ModelWithData = namedtuple("ModelWithData", ["model", "inference_data"]) + + +@pytest.fixture(scope="module") +def spacy_model_with_data(): + # Creating blank model and setting up the spaCy pipeline + nlp = spacy.blank("en") + textcat = nlp.create_pipe( + "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"} + ) + nlp.add_pipe(textcat, last=True) + + # Training the model to recognize between computer graphics and baseball in 20newsgroups dataset + categories = ['comp.graphics', 'rec.sport.baseball'] + for cat in categories: + textcat.add_label(cat) + + # Split train/test and train the model + train_x, train_y, test_x, _ = _get_train_test_dataset(categories) + train_data = list(zip(train_x, [{"cats": cats} for cats in train_y])) + _train_model(nlp, train_data) + return ModelWithData(nlp, pd.DataFrame(test_x)) + + +@pytest.fixture +def spacy_custom_env(tmpdir): + conda_env = os.path.join(str(tmpdir), "conda_env.yml") + _mlflow_conda_env( + conda_env, + additional_conda_deps=["pytest"], + additional_pip_deps=["spacy"]) + return conda_env + + +@pytest.fixture +def model_path(tmpdir): + return os.path.join(str(tmpdir), "model") + + +@pytest.mark.large +def test_model_save_load(spacy_model_with_data, model_path): + spacy_model = spacy_model_with_data.model + mlflow.spacy.save_model(spacy_model=spacy_model, path=model_path) + loaded_model = mlflow.spacy.load_model(model_path) + + # Comparing the meta dictionaries for the original and loaded models + assert spacy_model.meta == loaded_model.meta + + # Load pyfunc model using saved model and asserting its predictions are equal to the created one + pyfunc_loaded = mlflow.pyfunc.load_pyfunc(model_path) + assert all(_predict(spacy_model, spacy_model_with_data.inference_data) == + pyfunc_loaded.predict(spacy_model_with_data.inference_data)) + + +@pytest.mark.large +def test_predict_df_with_wrong_shape(spacy_model_with_data, model_path): + mlflow.spacy.save_model(spacy_model=spacy_model_with_data.model, path=model_path) + pyfunc_loaded = mlflow.pyfunc.load_pyfunc(model_path) + + # Concatenating with itself to duplicate column and mess up input shape + # then asserting n MlflowException is raised + with pytest.raises(MlflowException): + pyfunc_loaded.predict(pd.concat([spacy_model_with_data.inference_data, + spacy_model_with_data.inference_data], axis=1)) + + +@pytest.mark.large +def test_model_log(spacy_model_with_data, tracking_uri_mock): # pylint: disable=unused-argument + spacy_model = spacy_model_with_data.model + old_uri = mlflow.get_tracking_uri() + # should_start_run tests whether or not calling log_model() automatically starts a run. + for should_start_run in [False, True]: + with TempDir(chdr=True, remove_on_exit=True): + try: + artifact_path = "model" + if should_start_run: + mlflow.start_run() + mlflow.spacy.log_model(spacy_model=spacy_model, artifact_path=artifact_path) + model_uri = "runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, + artifact_path=artifact_path) + + # Load model + spacy_model_loaded = mlflow.spacy.load_model(model_uri=model_uri) + assert all(_predict(spacy_model, spacy_model_with_data.inference_data) == + _predict(spacy_model_loaded, spacy_model_with_data.inference_data)) + finally: + mlflow.end_run() + mlflow.set_tracking_uri(old_uri) + + +@pytest.mark.large +def test_model_save_persists_specified_conda_env_in_mlflow_model_directory( + spacy_model_with_data, model_path, spacy_custom_env): + mlflow.spacy.save_model(spacy_model=spacy_model_with_data.model, path=model_path, + conda_env=spacy_custom_env) + + pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME) + saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV]) + assert os.path.exists(saved_conda_env_path) + assert saved_conda_env_path != spacy_custom_env + + with open(spacy_custom_env, "r") as f: + spacy_custom_env_text = f.read() + with open(saved_conda_env_path, "r") as f: + saved_conda_env_text = f.read() + assert saved_conda_env_text == spacy_custom_env_text + + +@pytest.mark.large +def test_model_save_accepts_conda_env_as_dict(spacy_model_with_data, model_path): + conda_env = dict(mlflow.spacy.get_default_conda_env()) + conda_env["dependencies"].append("pytest") + mlflow.spacy.save_model(spacy_model=spacy_model_with_data.model, path=model_path, + conda_env=conda_env) + + pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME) + saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV]) + assert os.path.exists(saved_conda_env_path) + + with open(saved_conda_env_path, "r") as f: + saved_conda_env_parsed = yaml.safe_load(f) + assert saved_conda_env_parsed == conda_env + + +@pytest.mark.large +def test_model_log_persists_specified_conda_env_in_mlflow_model_directory( + spacy_model_with_data, spacy_custom_env): + artifact_path = "model" + with mlflow.start_run(): + mlflow.spacy.log_model(spacy_model=spacy_model_with_data.model, + artifact_path=artifact_path, + conda_env=spacy_custom_env) + model_path = _download_artifact_from_uri("runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)) + + pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME) + saved_conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV]) + assert os.path.exists(saved_conda_env_path) + assert saved_conda_env_path != spacy_custom_env + + with open(spacy_custom_env, "r") as f: + spacy_custom_env_text = f.read() + with open(saved_conda_env_path, "r") as f: + saved_conda_env_text = f.read() + assert saved_conda_env_text == spacy_custom_env_text + + +@pytest.mark.large +def test_model_save_without_specified_conda_env_uses_default_env_with_expected_dependencies( + spacy_model_with_data, model_path): + mlflow.spacy.save_model(spacy_model=spacy_model_with_data.model, path=model_path, + conda_env=None) + + pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME) + conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV]) + with open(conda_env_path, "r") as f: + conda_env = yaml.safe_load(f) + + assert conda_env == mlflow.spacy.get_default_conda_env() + + +@pytest.mark.large +def test_model_log_without_specified_conda_env_uses_default_env_with_expected_dependencies( + spacy_model_with_data): + artifact_path = "model" + with mlflow.start_run(): + mlflow.spacy.log_model(spacy_model=spacy_model_with_data.model, artifact_path=artifact_path) + model_path = _download_artifact_from_uri("runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)) + + pyfunc_conf = _get_flavor_configuration(model_path=model_path, flavor_name=pyfunc.FLAVOR_NAME) + conda_env_path = os.path.join(model_path, pyfunc_conf[pyfunc.ENV]) + with open(conda_env_path, "r") as f: + conda_env = yaml.safe_load(f) + + assert conda_env == mlflow.spacy.get_default_conda_env() + + +@pytest.mark.large +def test_model_log_with_pyfunc_flavor(spacy_model_with_data): + artifact_path = "model" + with mlflow.start_run(): + mlflow.spacy.log_model(spacy_model=spacy_model_with_data.model, artifact_path=artifact_path) + model_path = _download_artifact_from_uri("runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)) + + loaded_model = Model.load(model_path) + assert pyfunc.FLAVOR_NAME in loaded_model.flavors + + +@pytest.mark.large +def test_model_log_without_pyfunc_flavor(): + artifact_path = "model" + nlp = spacy.blank("en") + + # Add a component not compatible with pyfunc + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner, last=True) + + # Ensure the pyfunc flavor is not present after logging and loading the model + with mlflow.start_run(): + mlflow.spacy.log_model(spacy_model=nlp, artifact_path=artifact_path) + model_path = _download_artifact_from_uri("runs:/{run_id}/{artifact_path}".format( + run_id=mlflow.active_run().info.run_id, artifact_path=artifact_path)) + + loaded_model = Model.load(model_path) + assert loaded_model.flavors.keys() == {"spacy"} + + +def _train_model(nlp, train_data, n_iter=5): + optimizer = nlp.begin_training() + batch_sizes = compounding(4.0, 32.0, 1.001) + for _ in range(n_iter): + losses = {} + random.shuffle(train_data) + batches = minibatch(train_data, size=batch_sizes) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses) + + +def _get_train_test_dataset(cats_to_fetch, limit=100): + newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), shuffle=True, + categories=cats_to_fetch) + X = newsgroups.data[:limit] + y = newsgroups.target[:limit] + + X = [six.text_type(x) for x in X] # Ensure all strings to unicode for python 2.7 compatibility + + # Category 0 comp-graphic, 1 rec.sport baseball. We can threat it as a binary class. + cats = [{"comp.graphics": not bool(el), "rec.sport.baseball": bool(el)} for el in y] + + split = int(len(X) * 0.8) + return X[:split], cats[:split], X[split:], cats[split:] + + +def _predict(spacy_model, test_x): + return pd.DataFrame({ + 'predictions': test_x.ix[:, 0].apply(lambda text: spacy_model(text).cats) + }) diff --git a/tests/tensorflow_autolog/test_tensorflow2_autolog.py b/tests/tensorflow_autolog/test_tensorflow2_autolog.py index 4656e9659a0bb..4b596f0e83f2f 100644 --- a/tests/tensorflow_autolog/test_tensorflow2_autolog.py +++ b/tests/tensorflow_autolog/test_tensorflow2_autolog.py @@ -151,8 +151,6 @@ def test_tf_keras_autolog_logs_expected_data(tf_keras_random_data_run): assert 'opt_epsilon' in data.params assert 'opt_amsgrad' in data.params assert data.params['opt_amsgrad'] == 'False' - assert 'model_summary' in data.tags - assert 'Total params: 6,922' in data.tags['model_summary'] client = mlflow.tracking.MlflowClient() all_epoch_acc = client.get_metric_history(tf_keras_random_data_run.info.run_id, 'accuracy') assert all((x.step - 1) % 5 == 0 for x in all_epoch_acc) diff --git a/tests/tensorflow_autolog/test_tensorflow_autolog.py b/tests/tensorflow_autolog/test_tensorflow_autolog.py index 67d3e58919555..00c6122ac8270 100644 --- a/tests/tensorflow_autolog/test_tensorflow_autolog.py +++ b/tests/tensorflow_autolog/test_tensorflow_autolog.py @@ -132,8 +132,6 @@ def test_tf_keras_autolog_logs_expected_data(tf_keras_random_data_run): # Testing optimizer parameters are logged assert 'optimizer_name' in data.params assert data.params['optimizer_name'] == 'AdamOptimizer' - assert 'model_summary' in data.tags - assert 'Total params: 6,922' in data.tags['model_summary'] client = mlflow.tracking.MlflowClient() all_epoch_acc = client.get_metric_history(tf_keras_random_data_run.info.run_id, 'epoch_acc') assert all((x.step - 1) % 5 == 0 for x in all_epoch_acc) diff --git a/travis/install-common-deps.sh b/travis/install-common-deps.sh index 3d2b059d84214..819c16ca5b745 100755 --- a/travis/install-common-deps.sh +++ b/travis/install-common-deps.sh @@ -2,17 +2,15 @@ set -ex sudo mkdir -p /travis-install -sudo chown travis /travis-install +if [[ -z $GITHUB_WORKFLOW ]]; then + sudo chown travis /travis-install +fi # (The conda installation steps below are taken from http://conda.pydata.org/docs/travis.html) # We do this conditionally because it saves us some downloading if the # version is the same. -if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh -O /travis-install/miniconda.sh; -else - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /travis-install/miniconda.sh; -fi +wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/miniconda.sh -bash /travis-install/miniconda.sh -b -p $HOME/miniconda +bash $HOME/miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" hash -r conda config --set always_yes yes --set changeps1 no @@ -28,21 +26,23 @@ python --version pip install --upgrade pip==19.3.1 # Install Python test dependencies only if we're running Python tests if [[ "$INSTALL_SMALL_PYTHON_DEPS" == "true" ]]; then - pip install -r ./travis/small-requirements.txt + pip install --quiet -r ./travis/small-requirements.txt fi if [[ "$INSTALL_LARGE_PYTHON_DEPS" == "true" ]]; then - pip install -r ./travis/large-requirements.txt + pip install --quiet -r ./travis/large-requirements.txt # Hack: make sure all spark-* scripts are executable. # Conda installs 2 version spark-* scripts and makes the ones spark # uses not executable. This is a temporary fix to unblock the tests. - ls -lha `find /home/travis/miniconda/envs/test-environment/ -path "*bin/spark-*"` - chmod 777 `find /home/travis/miniconda/envs/test-environment/ -path "*bin/spark-*"` - ls -lha `find /home/travis/miniconda/envs/test-environment/ -path "*bin/spark-*"` + ls -lha $(find $HOME/miniconda/envs/test-environment/ -path "*bin/spark-*") + chmod 777 $(find $HOME/miniconda/envs/test-environment/ -path "*bin/spark-*") + ls -lha $(find $HOME/miniconda/envs/test-environment/ -path "*bin/spark-*") fi pip install . export MLFLOW_HOME=$(pwd) # Remove boto config present in Travis VMs (https://github.com/travis-ci/travis-ci/issues/7940) -sudo rm -f /etc/boto.cfg +if [[ -z $GITHUB_WORKFLOW ]]; then + sudo rm -f /etc/boto.cfg +fi # Print current environment info pip list which mlflow diff --git a/travis/large-requirements.txt b/travis/large-requirements.txt index 68a1ad706467c..59f36bc337eda 100644 --- a/travis/large-requirements.txt +++ b/travis/large-requirements.txt @@ -18,6 +18,7 @@ pytest==3.2.1 pytest-cov==2.6.0 scikit-learn==0.20.2 scipy==1.2.1 +spacy==2.2.3 tensorflow==1.15.2 tf2onnx==1.5.4; torch==1.4.0 diff --git a/travis/run-large-python-tests.sh b/travis/run-large-python-tests.sh index 31287abe4cd4b..bb70a5e5a91f8 100755 --- a/travis/run-large-python-tests.sh +++ b/travis/run-large-python-tests.sh @@ -16,39 +16,39 @@ else fi # NB: Also add --ignore'd tests to run-small-python-tests.sh -pytest tests --large --ignore=tests/examples --ignore=tests/h2o --ignore=tests/keras \ +pytest tests --color=yes --large --ignore=tests/examples --ignore=tests/h2o --ignore=tests/keras \ --ignore=tests/pytorch --ignore=tests/pyfunc --ignore=tests/sagemaker --ignore=tests/sklearn \ --ignore=tests/spark --ignore=tests/tensorflow --ignore=tests/azureml --ignore=tests/onnx \ --ignore=tests/keras_autolog --ignore=tests/tensorflow_autolog --ignore=tests/gluon \ --ignore=tests/gluon_autolog --ignore=tests/xgboost --ignore=tests/lightgbm \ - --ignore tests/spark_autologging --ignore=tests/models + --ignore=tests/spacy --ignore=tests/spark_autologging --ignore=tests/models # Run ML framework tests in their own Python processes to avoid OOM issues due to per-framework # overhead -pytest --verbose tests/pytorch --large -pytest --verbose tests/h2o --large -pytest --verbose tests/onnx --large -pytest --verbose tests/pyfunc --large -pytest --verbose tests/sagemaker --large -pytest --verbose tests/sagemaker/mock --large -pytest --verbose tests/sklearn --large -pytest --verbose tests/spark --large -pytest --verbose tests/tensorflow/test_tensorflow_model_export.py --large -pytest --verbose tests/tensorflow_autolog/test_tensorflow_autolog.py --large -pytest --verbose tests/azureml --large -pytest --verbose tests/models --large -pytest --verbose tests/xgboost --large -pytest --verbose tests/lightgbm --large +pytest --color=yes --verbose tests/pytorch --large +pytest --color=yes --verbose tests/h2o --large +pytest --color=yes --verbose tests/onnx --large +pytest --color=yes --verbose tests/pyfunc --large +pytest --color=yes --verbose tests/sagemaker --large +pytest --color=yes --verbose tests/sagemaker/mock --large +pytest --color=yes --verbose tests/sklearn --large +pytest --color=yes --verbose tests/spark --large +pytest --color=yes --verbose tests/tensorflow/test_tensorflow_model_export.py --large +pytest --color=yes --verbose tests/tensorflow_autolog/test_tensorflow_autolog.py --large +pytest --color=yes --verbose tests/azureml --large +pytest --color=yes --verbose tests/models --large +pytest --color=yes --verbose tests/xgboost --large +pytest --color=yes --verbose tests/lightgbm --large # TODO(smurching) Unpin TensorFlow dependency version once test failures with TF 2.1.0 have been # fixed pip install 'tensorflow==2.0.0' -pytest --verbose tests/tensorflow/test_tensorflow2_model_export.py --large -pytest --verbose tests/tensorflow_autolog/test_tensorflow2_autolog.py --large -pytest --verbose tests/keras --large -pytest --verbose tests/keras_autolog --large -pytest --verbose tests/gluon --large -pytest --verbose tests/gluon_autolog --large +pytest --color=yes --verbose tests/tensorflow/test_tensorflow2_model_export.py --large +pytest --color=yes --verbose tests/tensorflow_autolog/test_tensorflow2_autolog.py --large +pytest --color=yes --verbose tests/keras --large +pytest --color=yes --verbose tests/keras_autolog --large +pytest --color=yes --verbose tests/gluon --large +pytest --color=yes --verbose tests/gluon_autolog --large +pytest --color=yes --verbose tests/spacy --large # Run Spark autologging tests ./travis/test-spark-autologging.sh - test $err = 0 diff --git a/travis/run-small-python-tests.sh b/travis/run-small-python-tests.sh index 4ed60ccbf0cc2..def1e51964711 100755 --- a/travis/run-small-python-tests.sh +++ b/travis/run-small-python-tests.sh @@ -7,10 +7,10 @@ trap 'err=1' ERR export MLFLOW_HOME=$(pwd) # NB: Also add --ignore'd tests to run-large-python-tests.sh -pytest --cov=mlflow --verbose --ignore=tests/h2o --ignore=tests/keras \ +pytest --color=yes --cov=mlflow --verbose --ignore=tests/h2o --ignore=tests/keras \ --ignore=tests/pytorch --ignore=tests/pyfunc --ignore=tests/sagemaker --ignore=tests/sklearn \ --ignore=tests/spark --ignore=tests/tensorflow --ignore=tests/keras_autolog \ --ignore=tests/tensorflow_autolog --ignore tests/azureml --ignore tests/onnx \ - --ignore=tests/xgboost --ignore=tests/lightgbm tests --ignore=tests/spark_autologging + --ignore=tests/xgboost --ignore=tests/spacy --ignore=tests/lightgbm tests --ignore=tests/spark_autologging test $err = 0 diff --git a/travis/stage-python3.sh b/travis/stage-python3.sh index 1b055bab0b786..cae8b4491cb9d 100755 --- a/travis/stage-python3.sh +++ b/travis/stage-python3.sh @@ -7,7 +7,8 @@ then echo "skipping this step on windows." elif [[ "$TRAVIS_BUILD_STAGE_NAME" == "Small" ]] then - ./travis/run-small-python-tests.sh && ./test-generate-protos.sh + ./travis/run-small-python-tests.sh + ./test-generate-protos.sh else ./travis/run-large-python-tests.sh ./travis/test-anaconda-compatibility.sh "anaconda3:2020.02" @@ -17,7 +18,7 @@ fi CHANGED_FILES=$(git diff --name-only master..HEAD | grep "tests/examples\|examples") || true if [[ "$TRAVIS_EVENT_TYPE" == "cron" || "$CHANGED_FILES" == *"examples"* ]] && [[ "$TRAVIS_BUILD_STAGE_NAME" == "Nightly" ]] then - pytest --verbose tests/examples --large; + pytest --color=yes --verbose tests/examples --large; fi if [[ "$TRAVIS_EVENT_TYPE" == "cron" || "$CHANGED_FILES" == *"Dockerfile"* ]] && [[ "$TRAVIS_BUILD_STAGE_NAME" == "Nightly" ]] then