Mlflow Release Pipeline #42

Workflow file for this run

name: MLflow CI
on:
pull_request:
paths:
- 'applications/mlflow/charts/**'
- 'applications/mlflow/kots/**'
- 'applications/mlflow/tests/**'
- '.github/workflows/mlflow-ci.yml'
push:
branches:
- main
paths:
- 'applications/mlflow/charts/**'
- 'applications/mlflow/kots/**'
- 'applications/mlflow/tests/**'
- '.github/workflows/mlflow-ci.yml'
env:
APP_SLUG: diamon-mlflow
jobs:
lint-and-template:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/[email protected]
with:
version: v3.13.3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.12
- name: Add Helm repositories
run: |
helm repo add cnpg https://cloudnative-pg.github.io/charts
helm repo add minio https://operator.min.io/
helm repo update
- name: Update Helm dependencies
run: |
echo "Updating Helm dependencies for all charts..."
helm dependency update applications/mlflow/charts/infra
helm dependency update applications/mlflow/charts/mlflow
echo "Helm dependencies updated successfully."
- name: Lint charts
run: |
echo "Linting Helm charts..."
for chart in mlflow infra; do
echo "Linting $chart chart..."
helm lint applications/mlflow/charts/$chart
done
echo "Linting completed successfully."
- name: Template charts with SDK disabled
run: |
cd applications/mlflow
echo "Templating Helm charts with Replicated SDK disabled..."
mkdir -p ./rendered-templates
echo "Rendering templates for infra..."
helm template ./charts/infra --output-dir ./rendered-templates/infra --debug
echo "Rendering templates for mlflow..."
helm template ./charts/mlflow --output-dir ./rendered-templates/mlflow --set replicated.enabled=false --debug
echo "Templates rendered in ./rendered-templates directory."
- name: Upload templates (if templating failed)
uses: actions/upload-artifact@v4
if: failure()
with:
name: failed-templates
path: applications/mlflow/rendered-templates
if-no-files-found: warn
create-release:
runs-on: ubuntu-22.04
needs: [lint-and-template]
outputs:
customer-id: ${{ steps.create-customer.outputs.customer-id }}
channel-slug: ${{ steps.create-release.outputs.channel-slug }}
chart-version: ${{ steps.chart-version.outputs.chart_version }}
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/[email protected]
with:
version: v3.13.3
- name: Add Helm repositories
run: |
helm repo add cnpg https://cloudnative-pg.github.io/charts
helm repo add minio https://operator.min.io/
helm repo update
- name: Update Helm dependencies
run: |
echo "Updating Helm dependencies for all charts..."
helm dependency update applications/mlflow/charts/infra
helm dependency update applications/mlflow/charts/mlflow
echo "Helm dependencies updated successfully."
- name: Package infra chart
run: |
helm package applications/mlflow/charts/infra -d applications/mlflow/kots/ -u
if [ ! -f applications/mlflow/kots/infra-*.tgz ]; then
echo "Error: Infra chart packaging failed"
exit 1
fi
- name: Package mlflow chart
run: |
helm package applications/mlflow/charts/mlflow -d applications/mlflow/kots/ -u
if [ ! -f applications/mlflow/kots/mlflow-*.tgz ]; then
echo "Error: MLflow chart packaging failed"
exit 1
fi
# The following steps implement our versioning strategy:
# 1. We extract the chart version from mlflow-chart.yaml
# 2. We use this version for the Replicated release
# This ensures that the Replicated release version always matches the MLflow chart version
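# For reference, the grep/awk step below assumes kots/mlflow-chart.yaml is a KOTS HelmChart
# manifest that carries the chart version, along these illustrative lines:
#   apiVersion: kots.io/v1beta2
#   kind: HelmChart
#   metadata:
#     name: mlflow
#   spec:
#     chart:
#       name: mlflow
#       chartVersion: 1.2.3   # value picked up by grep 'chartVersion:'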
- name: Extract MLflow chart version
id: chart-version
run: |
CHART_VERSION=$(grep 'chartVersion:' applications/mlflow/kots/mlflow-chart.yaml | awk '{print $2}')
echo "chart_version=$CHART_VERSION" >> $GITHUB_OUTPUT
echo "Using MLflow chart version: $CHART_VERSION"
- name: Create release
id: create-release
uses: replicatedhq/replicated-actions/[email protected]
with:
app-slug: ${{ env.APP_SLUG }}
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
yaml-dir: applications/mlflow/kots/
promote-channel: ci-automation-${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }}
version: ${{ steps.chart-version.outputs.chart_version }}
- name: Create customer
id: create-customer
uses: replicatedhq/replicated-actions/create-customer@main
with:
app-slug: ${{ env.APP_SLUG }}
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
customer-name: automated-${{ github.run_id }}
customer-email: [email protected]
license-type: dev
channel-slug: ${{ steps.create-release.outputs.channel-slug }}
is-kots-install-enabled: "true"
helm-install-test:
runs-on: ubuntu-22.04
needs: [create-release]
strategy:
fail-fast: false
matrix:
cluster:
- distribution: kind
version: 1.32
#- distribution: kind
# version: 1.31
#- distribution: kind
# version: 1.30
config:
- name: nodeport-ingress-disabled
values_file: tests/helm/nodeport-ingress-disabled.yaml
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/[email protected]
with:
version: v3.13.3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.12
# Install jq via apt-get
- name: Install jq
run: |
sudo apt-get update
sudo apt-get install -y jq
# Get license ID from customer inspect
- name: Get License ID
id: get-license
run: |
# Run vendor-cli to inspect the customer and get the installation ID
CUSTOMER_JSON=$(docker run --rm \
-e REPLICATED_API_TOKEN=${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} \
-e REPLICATED_APP=${{ env.APP_SLUG }} \
replicated/vendor-cli:latest \
customer inspect --customer "automated-${{ github.run_id }}" --output json)
# Use jq to properly extract the installationId
INSTALLATION_ID=$(echo "$CUSTOMER_JSON" | jq -r '.installationId')
if [ -z "$INSTALLATION_ID" ] || [ "$INSTALLATION_ID" = "null" ]; then
echo "Failed to extract installationId from customer JSON"
echo "JSON structure:"
echo "$CUSTOMER_JSON" | jq 'del(.installationId)' # Print JSON without the license ID
exit 1
fi
# Don't print the actual license ID, just indicate success
echo "Successfully extracted installationId"
echo "license_id=$INSTALLATION_ID" >> $GITHUB_OUTPUT
- name: Create Cluster
id: create-cluster
uses: replicatedhq/replicated-actions/[email protected]
with:
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
kubernetes-distribution: ${{ matrix.cluster.distribution }}
kubernetes-version: ${{ matrix.cluster.version }}
cluster-name: mlflow-ci-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }}
disk: 100
instance-type: r1.large
ttl: 1h
export-kubeconfig: true
- name: Add Helm repositories
run: |
helm repo add cnpg https://cloudnative-pg.github.io/charts
helm repo add minio https://operator.min.io/
helm repo update
- name: Update Helm dependencies
run: |
# This step ensures dependencies are updated for local chart references
# Note: For OCI charts, dependencies are already included in the packaged chart
echo "Updating Helm dependencies for all charts..."
helm dependency update applications/mlflow/charts/infra
helm dependency update applications/mlflow/charts/mlflow
echo "Helm dependencies updated successfully."
# Authenticate with the Replicated registry
- name: Login to Replicated registry
run: |
echo "Authenticating with Replicated registry..."
if [ -z "$REPLICATED_LICENSE_ID" ]; then
echo "ERROR: REPLICATED_LICENSE_ID environment variable must be set"
exit 1
fi
helm registry login registry.replicated.com \
--username="$REPLICATED_LICENSE_ID" \
--password="$REPLICATED_LICENSE_ID"
echo "Registry login successful."
env:
REPLICATED_LICENSE_ID: ${{ steps.get-license.outputs.license_id }}
- name: Run Helm installation test with custom values
run: |
cd applications/mlflow
# Save kubeconfig to a file
KUBECONFIG_FILE="/tmp/kubeconfig-helm-test-${{ github.run_id }}"
echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
echo "Saved kubeconfig to $KUBECONFIG_FILE"
echo "Running Helm installation test with custom values..."
# Determine OCI URL - prefer direct OCI_URL if provided, otherwise construct from app/channel
if [ -n "$OCI_URL" ]; then
echo "Using provided OCI URL: $OCI_URL"
else
echo "No direct OCI_URL provided. Constructing from REPLICATED_APP and REPLICATED_CHANNEL"
echo "Note: This requires REPLICATED_APP and REPLICATED_CHANNEL env vars."
if [ -z "$REPLICATED_APP" ] || [ -z "$REPLICATED_CHANNEL" ]; then
echo "ERROR: REPLICATED_APP and REPLICATED_CHANNEL must be set"
exit 1
fi
OCI_URL="oci://registry.replicated.com/$REPLICATED_APP/$REPLICATED_CHANNEL"
echo "Constructed OCI URL: $OCI_URL"
fi
# Validate OCI_URL is set and not empty
if [ -z "$OCI_URL" ]; then
echo "ERROR: OCI_URL is empty. Check that REPLICATED_APP and REPLICATED_CHANNEL are correctly set and not being overridden."
echo "REPLICATED_APP=$REPLICATED_APP"
echo "REPLICATED_CHANNEL=$REPLICATED_CHANNEL"
exit 1
fi
# Prepare values arguments if provided
MLFLOW_VALUES_ARGS=""
if [ -n "$MLFLOW_VALUES" ]; then
echo "Using MLflow values file: $MLFLOW_VALUES"
# Check if values file exists
if [ ! -f "$MLFLOW_VALUES" ]; then
echo "ERROR: Values file '$MLFLOW_VALUES' does not exist"
exit 1
fi
MLFLOW_VALUES_ARGS="--values $MLFLOW_VALUES"
echo "Values args: $MLFLOW_VALUES_ARGS"
else
echo "No custom values file provided. Using default values."
fi
# Create namespace if it doesn't exist
KUBECONFIG="$KUBECONFIG_FILE" kubectl create namespace values-test 2>/dev/null || true
# Install infra chart from Replicated registry
echo "Installing infra chart from Replicated registry..."
echo "Chart path: $OCI_URL/infra"
KUBECONFIG="$KUBECONFIG_FILE" helm upgrade --install infra $OCI_URL/infra \
--namespace values-test \
--wait --timeout 5m --debug || {
echo "ERROR: Failed to install infra chart from $OCI_URL/infra"
echo "Please check that registry login was successful and the chart exists in the registry."
exit 1
}
# Install MLflow chart from Replicated registry with custom values
echo "Installing mlflow chart from Replicated registry with custom values..."
echo "Chart path: $OCI_URL/mlflow"
echo "Using values args: $MLFLOW_VALUES_ARGS"
KUBECONFIG="$KUBECONFIG_FILE" helm upgrade --install mlflow $OCI_URL/mlflow \
--namespace values-test \
$MLFLOW_VALUES_ARGS \
--wait --timeout 5m --debug || {
echo "ERROR: Failed to install mlflow chart from $OCI_URL/mlflow"
echo "Please check that registry login was successful and the chart exists in the registry."
exit 1
}
echo "Helm installation with custom values completed successfully."
env:
KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
REPLICATED_APP: ${{ env.APP_SLUG }}
REPLICATED_CHANNEL: ${{ needs.create-release.outputs.channel-slug }}
REPLICATED_LICENSE_ID: ${{ steps.get-license.outputs.license_id }}
MLFLOW_VALUES: ${{ matrix.config.values_file }}
OCI_URL: "oci://registry.replicated.com/${{ env.APP_SLUG }}/${{ needs.create-release.outputs.channel-slug }}"
# Set up port forwarding after installation is complete
- name: Set up port forwarding
id: port-forward
run: |
# Use the same kubeconfig file from previous step
KUBECONFIG_FILE="/tmp/kubeconfig-helm-test-${{ github.run_id }}"
if [ ! -f "$KUBECONFIG_FILE" ]; then
echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
echo "Created new kubeconfig at $KUBECONFIG_FILE"
fi
# Hardcoded port 5000 for simplicity
PORT="5000"
echo "Using port: $PORT for testing"
# The Helm installation with --wait should already ensure services are ready
# But let's verify the services are present
echo "Verifying MLflow service exists..."
KUBECONFIG="$KUBECONFIG_FILE" kubectl get svc -n values-test
# Check pod status and wait for them to be running
echo "Checking pod status..."
KUBECONFIG="$KUBECONFIG_FILE" kubectl get pods -n values-test
echo "Waiting for MLflow pods to be running..."
KUBECONFIG="$KUBECONFIG_FILE" kubectl wait --for=condition=Ready pods --selector=app.kubernetes.io/name=mlflow -n values-test --timeout=2m || {
echo "WARNING: Timed out waiting for pods to be ready, will try port-forwarding anyway"
KUBECONFIG="$KUBECONFIG_FILE" kubectl describe pods -n values-test
}
# Set up port forwarding in the background
echo "Setting up port forwarding to run in the background"
nohup bash -c "KUBECONFIG='$KUBECONFIG_FILE' kubectl port-forward -n values-test svc/mlflow $PORT:5000 &>/tmp/port-forward-${{ github.run_id }}.log" &
PORT_FORWARD_PID=$!
echo "port_forward_pid=$PORT_FORWARD_PID" >> $GITHUB_OUTPUT
echo "Set up port forwarding with PID: $PORT_FORWARD_PID"
# Set hostname for testing
echo "hostname=localhost:$PORT" >> $GITHUB_OUTPUT
echo "Test endpoint will be: localhost:$PORT"
# Give port-forward more time to establish
echo "Waiting for port-forward to establish..."
sleep 15
# Basic connectivity check
echo "Checking connectivity to MLflow..."
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:$PORT/ || true)
if [ -n "$HTTP_CODE" ] && [ "$HTTP_CODE" != "000" ]; then
echo "Successfully connected to MLflow service (HTTP $HTTP_CODE)!"
else
echo "Warning: Initial connection attempt failed, service may still be starting"
# Show the port-forward log for debugging
echo "Port-forward log:"
cat /tmp/port-forward-${{ github.run_id }}.log || true
# If port-forward failed, check pod logs
echo "Pod logs:"
KUBECONFIG="$KUBECONFIG_FILE" kubectl logs -n values-test -l app.kubernetes.io/name=mlflow --tail=20 || true
fi
env:
KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
# Application testing with our consolidated test file
- name: Run Application Tests
run: |
cd applications/mlflow
# Use the same kubeconfig file from previous steps
KUBECONFIG_FILE="/tmp/kubeconfig-helm-test-${{ github.run_id }}"
if [ ! -f "$KUBECONFIG_FILE" ]; then
echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
fi
echo "Installing Python dependencies for tests..."
pip install mlflow pandas scikit-learn requests urllib3
echo "Running MLflow application tests against ${{ steps.port-forward.outputs.hostname }}"
echo "This may take some time as it will retry connections for up to 2 minutes"
KUBECONFIG="$KUBECONFIG_FILE" python tests/mlflow_test.py ${{ steps.port-forward.outputs.hostname }} \
--protocol http \
--connection-timeout 180 \
--debug
timeout-minutes: 5
env:
KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
- name: Install troubleshoot
run: curl -L https://github.com/replicatedhq/troubleshoot/releases/latest/download/support-bundle_linux_amd64.tar.gz | tar xzvf -
if: failure()
- name: Collect bundle
run: |
# Save kubeconfig to a file
KUBECONFIG_FILE="/tmp/kubeconfig-helm-bundle-${{ github.run_id }}"
echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
echo "Saved kubeconfig to $KUBECONFIG_FILE"
./support-bundle --kubeconfig="$KUBECONFIG_FILE" --interactive=false -o ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }} https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
if: failure()
env:
KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
- name: Upload support bundle artifact
uses: actions/upload-artifact@v4
if: failure()
with:
name: mlflow-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }}
path: 'ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}-${{ matrix.config.name }}.tar.gz'
- name: Remove Cluster
uses: replicatedhq/replicated-actions/[email protected]
if: ${{ always() && steps.create-cluster.outputs.cluster-id != '' }}
with:
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
cluster-id: ${{ steps.create-cluster.outputs.cluster-id }}
kots-install-test:
runs-on: ubuntu-22.04
needs: [create-release]
strategy:
fail-fast: false
matrix:
cluster:
- distribution: kind
version: 1.32
#- distribution: kind
# version: 1.31
#- distribution: kind
# version: 1.30
#- distribution: aks
# version: 1.31
#- distribution: aks
# version: 1.30
#- distribution: gke
# version: 1.32
#- distribution: gke
# version: 1.31
#- distribution: gke
# version: 1.30
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Create Cluster
id: create-cluster
uses: replicatedhq/replicated-actions/[email protected]
with:
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
kubernetes-distribution: ${{ matrix.cluster.distribution }}
kubernetes-version: ${{ matrix.cluster.version }}
cluster-name: mlflow-kots-${{ github.run_id }}-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}
disk: 100
instance-type: r1.large
ttl: 1h
export-kubeconfig: true
# Download license using Replicated vendor-cli Docker container
- name: Download license
id: download-license
run: |
# Create a temporary file to store the license
mkdir -p /tmp/replicated
# Run the vendor-cli command and capture its output
docker run --rm \
-e REPLICATED_API_TOKEN=${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }} \
-e REPLICATED_APP=${{ env.APP_SLUG }} \
replicated/vendor-cli:latest \
customer download-license --customer ${{ needs.create-release.outputs.customer-id }} > /tmp/replicated/license.yaml
# Read the license and set it as an output
LICENSE_CONTENT=$(cat /tmp/replicated/license.yaml)
# Use EOF delimiter for multi-line output
echo "license<<EOF" >> $GITHUB_OUTPUT
echo "$LICENSE_CONTENT" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
# Install using KOTS
- name: KOTS Install
uses: replicatedhq/replicated-actions/[email protected]
with:
kubeconfig: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
kots-version: latest
app-slug: ${{ env.APP_SLUG }}/${{ needs.create-release.outputs.channel-slug }}
app-version-label: ${{ needs.create-release.outputs.chart-version }}
license-file: ${{ steps.download-license.outputs.license }}
namespace: default
wait-duration: 10m
shared-password: 'replicatedmlflow'
- name: Install troubleshoot
run: curl -L https://github.com/replicatedhq/troubleshoot/releases/latest/download/support-bundle_linux_amd64.tar.gz | tar xzvf -
if: failure()
- name: Collect bundle
run: |
# Save kubeconfig to a file
KUBECONFIG_FILE="/tmp/kubeconfig-kots-bundle-${{ github.run_id }}"
echo "$KUBECONFIG" > "$KUBECONFIG_FILE"
echo "Saved kubeconfig to $KUBECONFIG_FILE"
./support-bundle --kubeconfig="$KUBECONFIG_FILE" --interactive=false -o kots-ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }} https://raw.githubusercontent.com/replicatedhq/troubleshoot-specs/main/in-cluster/default.yaml
if: failure()
env:
KUBECONFIG: ${{ steps.create-cluster.outputs.cluster-kubeconfig }}
- name: Upload support bundle artifact
uses: actions/upload-artifact@v4
if: failure()
with:
name: mlflow-kots-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}
path: 'kots-ci-bundle-${{ matrix.cluster.distribution }}-${{ matrix.cluster.version }}.tar.gz'
- name: Remove Cluster
uses: replicatedhq/replicated-actions/[email protected]
if: ${{ always() && steps.create-cluster.outputs.cluster-id != '' }}
with:
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
cluster-id: ${{ steps.create-cluster.outputs.cluster-id }}
cleanup-test-release:
runs-on: ubuntu-22.04
needs: [create-release, kots-install-test, helm-install-test]
if: always()
steps:
- name: Archive Customer
if: ${{ always() && needs.create-release.outputs.customer-id != '' }}
uses: replicatedhq/replicated-actions/[email protected]
with:
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
customer-id: ${{ needs.create-release.outputs.customer-id }}
- name: Archive Channel
if: ${{ always() && needs.create-release.outputs.channel-slug != '' }}
uses: replicatedhq/replicated-actions/[email protected]
with:
app-slug: ${{ env.APP_SLUG }}
api-token: ${{ secrets.REPLICATED_PLATFORM_EXAMPLES_TOKEN }}
channel-slug: ${{ needs.create-release.outputs.channel-slug }}