Deploy Your Model on Kubernetes Using KServe InferenceService

Prerequisites

S3 Credentials

KServe pulls the model artifact from S3-compatible storage (MinIO in this example), so the predictor needs a Secret holding the access keys and a ServiceAccount that references it.

secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: s3creds
  annotations:
    serving.kserve.io/s3-endpoint: minio:9000
    serving.kserve.io/s3-usehttps: "0" # defaults to "1" (HTTPS); set to "0" when testing against MinIO over plain HTTP
type: Opaque
stringData: # accepts raw credential strings; use `data` instead for base64-encoded values
  AWS_ACCESS_KEY_ID: minio_user
  AWS_SECRET_ACCESS_KEY: minio_password
sa.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sa
secrets:
- name: s3creds
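
Apply both manifests before creating the InferenceService, assuming the filenames above:

kubectl apply -f secret.yaml
kubectl apply -f sa.yaml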

Install grpcurl

brew install grpcurl
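
grpcurl is only needed for testing the gRPC endpoint. On platforms without Homebrew, it can also be installed with a recent Go toolchain:

go install github.com/fullstorydev/grpcurl/cmd/grpcurl@latest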

Deploy Model

inference-service-http.yaml
apiVersion: "serving.kserve.io/v1beta1"
kind: "InferenceService"
metadata:
  name: "apple-demand-http"
spec:
  predictor:
    model:
      modelFormat:
        name: mlflow
      protocolVersion: v2
      storageUri: s3://bucket/1/b1d6cefb8b9c434895e7627fe7529e4e/artifacts/model
    serviceAccountName: sa
kubectl apply -f inference-service-http.yaml
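
The predictor has to fetch the model from storage before it can serve traffic, so readiness can take a minute or two. Check it with:

kubectl get inferenceservice apple-demand-http # READY should become True and the URL column should be populated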
inference-service-grpc.yaml
apiVersion: "serving.kserve.io/v1beta1"
kind: "InferenceService"
metadata:
  name: "apple-demand-grpc"
spec:
  predictor:
    model:
      modelFormat:
        name: mlflow
      protocolVersion: v2
      storageUri: s3://bucket/1/b1d6cefb8b9c434895e7627fe7529e4e/artifacts/model
      ports:
        - containerPort: 9000
          name: h2c # serve gRPC over HTTP/2 cleartext
          protocol: TCP
    serviceAccountName: sa
kubectl apply -f inference-service-grpc.yaml
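
As with the HTTP service, wait for the gRPC InferenceService to report Ready before testing, for example:

kubectl wait --for=condition=Ready inferenceservice/apple-demand-grpc --timeout=300s # the 5m timeout is an arbitrary choice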

Test Endpoints
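
The HTTP test goes through the Istio ingress gateway. A minimal setup sketch, assuming the default istio-system installation and the names used above (note that the transcript below was captured against an InferenceService named mlflow-apple-demand; substitute the name of the service you deployed):

# Reach the ingress gateway on localhost:80. Binding local port 80 may require
# elevated privileges; alternatively forward 8080:80 and use port 8080 in the curl command.
kubectl port-forward -n istio-system svc/istio-ingressgateway 80:80 &

# Hostname KServe assigned to the InferenceService, used for Host-based routing
SERVICE_HOSTNAME=$(kubectl get inferenceservice apple-demand-http -o jsonpath='{.status.url}' | cut -d'/' -f3)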

curl -v \
-H "Host: ${SERVICE_HOSTNAME}" \
-H "Content-Type: application/json" \
-d @./input_example.json \
http://127.0.0.1:80/v2/models/mlflow-apple-demand/infer
Result
*   Trying 127.0.0.1:80...
* Connected to 127.0.0.1 (127.0.0.1) port 80
> POST /v2/models/mlflow-apple-demand/infer HTTP/1.1
> Host: mlflow-apple-demand.default.127.0.0.1.sslip.io
> User-Agent: curl/8.7.1
> Accept: */*
> Content-Type: application/json
> Content-Length: 1089
> 
* upload completely sent off: 1089 bytes
< HTTP/1.1 200 OK
< ce-endpoint: mlflow-apple-demand
< ce-id: 9ddc841e-a8d4-405f-a7e4-73f7aa9bab09
< ce-inferenceservicename: mlserver
< ce-modelid: mlflow-apple-demand
< ce-namespace: default
< ce-requestid: 9ddc841e-a8d4-405f-a7e4-73f7aa9bab09
< ce-source: io.seldon.serving.deployment.mlserver.default
< ce-specversion: 0.3
< ce-type: io.seldon.serving.inference.response
< content-length: 240
< content-type: application/json
< date: Fri, 02 May 2025 04:06:58 GMT
< server: istio-envoy
< x-envoy-upstream-service-time: 247
< 
* Connection #0 to host 127.0.0.1 left intact
{"model_name":"mlflow-apple-demand","id":"9ddc841e-a8d4-405f-a7e4-73f7aa9bab09","parameters":{"content_type":"np"},"outputs":[{"name":"output-1","shape":[1,1],"datatype":"FP32","parameters":{"content_type":"np"},"data":[1486.56298828125]}]}
grpcurl \
  -vv \
  -plaintext \
  -proto ${PROTO_FILE} \
  -authority ${SERVICE_HOSTNAME} \
  -d @ \
  ${INGRESS_HOST}:${INGRESS_PORT} \
  inference.GRPCInferenceService.ModelInfer \
  < "$INPUT_PATH"
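
On success, grpcurl prints a ModelInferResponse whose outputs mirror the JSON body returned by the HTTP endpoint above.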
