Skip to content

Deploy MLflow on Kubernetes


Architecture (Click to Enlarge)

Artifact Store (MinIO)

First, deploy MinIO, an S3-compatible object store, to serve as the MLflow artifact store for artifacts such as figures, models, and reports. See here for deploying MinIO.

After MinIO is deployed and the mlflow bucket has been created, specify the artifact store's configuration in the MLflow Helm chart:

values.yaml
1
2
3
4
5
6
7
8
9
# Connection settings for the MinIO (S3-compatible) artifact store.
artifactStore:
  # API Service name for MinIO
  name: "minio-api"
  # Namespace of the MinIO Service; combined with `name` to build the
  # in-cluster DNS name the tracking server connects to.
  namespace: "minio"
  # Credentials handed to the tracking server as AWS_ACCESS_KEY_ID and
  # AWS_SECRET_ACCESS_KEY.
  user: "minio_user"
  password: "minio_password"
  # Port of the MinIO API Service.
  apiPort: 9000
  # Bucket used as the artifacts destination (s3://<bucketName>).
  bucketName: "mlflow"
  # NOTE(review): hostPath and mountPath are not referenced by the MLflow
  # templates shown on this page; presumably they belong to the MinIO
  # deployment. Confirm before relying on them.
  hostPath: "/home/docker/data/minio"
  mountPath: "/data"

Backend Store

values.yaml
1
2
3
4
5
6
7
8
9
# Settings for the PostgreSQL backend store.
backendStore:
  # Name of the Deployment, its Service, its pod label, and the hostname the
  # tracking server uses in the backend store URI.
  name: "backend-store"
  # Database name (POSTGRES_DB) and the path component of the backend store URI.
  db: "mlflow"
  # Credentials (POSTGRES_USER / POSTGRES_PASSWORD), also embedded in the
  # backend store URI.
  user: "user"
  password: "password"
  # NOTE(review): `host` is not referenced by the templates shown on this page;
  # they address the database through `name`. Confirm whether it is still needed.
  host: "postgres"
  # Container port and Service port of PostgreSQL.
  port: 5432
  # Node directory that backs the database files (hostPath volume).
  hostPath: "/home/docker/data/mlflow/backend-store"
  # Where that volume is mounted inside the PostgreSQL container.
  mountPath: "/var/lib/postgresql/data"
backend-store.yaml
---
# PostgreSQL backend store for MLflow: a single-replica Deployment whose data
# directory lives on a hostPath volume of the node.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Values.backendStore.name }}
  namespace: {{ .Release.Namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: {{ .Values.backendStore.name }}
  # Recreate terminates the old pod before starting the new one, so two
  # PostgreSQL instances never run against the same data directory.
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: {{ .Values.backendStore.name }}
    spec:
      containers:
        - name: {{ .Values.backendStore.name }}
          # Pinned to a major version instead of `latest`: the data persists on
          # the hostPath volume, and a silent jump to a newer PostgreSQL major
          # release can leave the existing data directory unusable.
          image: postgres:17
          # Values are piped through `quote` so numeric or boolean-looking
          # settings still render as strings, which env values must be.
          # NOTE(review): credentials are passed as plain env values; consider
          # moving them to a Secret.
          env:
            - name: POSTGRES_DB
              value: {{ .Values.backendStore.db | quote }}
            - name: POSTGRES_USER
              value: {{ .Values.backendStore.user | quote }}
            - name: POSTGRES_PASSWORD
              value: {{ .Values.backendStore.password | quote }}
          ports:
            - containerPort: {{ .Values.backendStore.port }}
              protocol: TCP
          # Mark the pod ready only once PostgreSQL accepts TCP connections, so
          # the Service does not route clients to a server that is still
          # initialising.
          readinessProbe:
            exec:
              command:
                - pg_isready
                - "-h"
                - "127.0.0.1"
                - "-p"
                - {{ .Values.backendStore.port | quote }}
                - "-U"
                - {{ .Values.backendStore.user | quote }}
                - "-d"
                - {{ .Values.backendStore.db | quote }}
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            - name: storage
              mountPath: {{ .Values.backendStore.mountPath }}
      restartPolicy: Always
      volumes:
        # Database files are stored directly on the node; the directory is
        # created if it does not exist yet.
        - name: storage
          hostPath:
            path: {{ .Values.backendStore.hostPath }}
            type: DirectoryOrCreate
backend-store.yaml
---
# Cluster-internal Service that exposes the PostgreSQL backend store under the
# same name as its Deployment.
apiVersion: v1
kind: Service
metadata:
  name: {{ .Values.backendStore.name }}
  namespace: {{ .Release.Namespace }}
spec:
  # Reachable only from inside the cluster.
  type: ClusterIP
  # Selects the pods labelled by the backend store Deployment.
  selector:
    app: {{ .Values.backendStore.name }}
  ports:
    # Service port and container port are the same value.
    - protocol: TCP
      port: {{ .Values.backendStore.port }}
      targetPort: {{ .Values.backendStore.port }}

Tracking Server

values.yaml
1
2
3
4
# Settings for the MLflow tracking server.
trackingServer:
  # Name of the Deployment, its Service, and its pod label.
  name: "tracking-server"
  # Address passed to `mlflow server --host`.
  host: "0.0.0.0"
  # Port passed to `mlflow server --port`; also the container and Service port.
  port: 5000
tracking-server.yaml
---
# MLflow tracking server: waits for the backend store and the artifact store to
# accept TCP connections, then starts `mlflow server` against both.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Values.trackingServer.name }}
  namespace: {{ .Release.Namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: {{ .Values.trackingServer.name }}
  template:
    metadata:
      labels:
        app: {{ .Values.trackingServer.name }}
    spec:
      # Init containers run in order and must each succeed before the tracking
      # server container starts. The image is pinned so the behaviour of `nc`
      # does not change with whatever `latest` happens to be.
      initContainers:
        - name: wait-for-backend-store
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ .Values.backendStore.name }}.{{ .Release.Namespace }}.svc.cluster.local {{ .Values.backendStore.port }}; do
                echo "Waiting for backend store..."
                sleep 2
              done
              echo "Backend store is ready!"
        - name: wait-for-artifact-store
          image: busybox:1.36
          command:
            - sh
            - -c
            - |
              until nc -z {{ .Values.artifactStore.name }}.{{ .Values.artifactStore.namespace }}.svc.cluster.local {{ .Values.artifactStore.apiPort }}; do
                echo "Waiting for artifact store..."
                sleep 2
              done
              echo "Artifact store is ready!"
      containers:
        - name: {{ .Values.trackingServer.name }}
          # NOTE(review): verify this tag is still pullable from the bitnami/
          # repository before deploying.
          image: bitnami/mlflow:2.22.0
          # S3 client settings pointing at the artifact store Service.
          # Credentials are piped through `quote` so numeric or boolean-looking
          # values still render as strings, which env values must be.
          # NOTE(review): credentials are passed as plain env values; consider
          # moving them to a Secret.
          env:
            - name: MLFLOW_S3_ENDPOINT_URL
              value: "http://{{ .Values.artifactStore.name }}.{{ .Values.artifactStore.namespace }}.svc.cluster.local:{{ .Values.artifactStore.apiPort }}"
            - name: AWS_ACCESS_KEY_ID
              value: {{ .Values.artifactStore.user | quote }}
            - name: AWS_SECRET_ACCESS_KEY
              value: {{ .Values.artifactStore.password | quote }}
            - name: MLFLOW_S3_IGNORE_TLS
              value: "true"
          command:
            - mlflow
          # The backend store is addressed by its Service name in the release
          # namespace; artifacts go to the configured bucket.
          args:
            - "server"
            - "--backend-store-uri"
            - "postgresql://{{ .Values.backendStore.user }}:{{ .Values.backendStore.password }}@{{ .Values.backendStore.name }}:{{ .Values.backendStore.port }}/{{ .Values.backendStore.db }}"
            - "--artifacts-destination"
            - "s3://{{ .Values.artifactStore.bucketName }}"
            - "--host"
            - "{{ .Values.trackingServer.host }}"
            - "--port"
            - "{{ .Values.trackingServer.port }}"
          ports:
            - containerPort: {{ .Values.trackingServer.port }}
tracking-server.yaml
---
# NodePort Service that exposes the MLflow tracking server outside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: {{ .Values.trackingServer.name }}
  namespace: {{ .Release.Namespace }}
spec:
  type: NodePort
  # Selects the pods labelled by the tracking server Deployment.
  selector:
    app: {{ .Values.trackingServer.name }}
  ports:
    - port: {{ .Values.trackingServer.port }}
      targetPort: {{ .Values.trackingServer.port }}
      # Node port is configurable through trackingServer.nodePort and falls
      # back to 30500 when that value is not set.
      nodePort: {{ .Values.trackingServer.nodePort | default 30500 }}