diff --git a/helm/.helmignore b/helm/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/helm/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm/Chart.yaml b/helm/Chart.yaml
new file mode 100644
index 0000000..300e738
--- /dev/null
+++ b/helm/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: llm-d-modelservice
+description: A Helm chart for ModelService
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.0.1
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.0.1"
diff --git a/helm/examples/README.md b/helm/examples/README.md
new file mode 100644
index 0000000..3b8bcce
--- /dev/null
+++ b/helm/examples/README.md
@@ -0,0 +1,44 @@
+# Examples
+
+Contains example values files and their rendered templates.
+
+```
+cd helm
+helm template [RELEASE-NAME] . -f [VALUES-FILEPATH]
+```
+
+1. `vllm-sim` in Kind
+
+   Make sure there is a gateway (kgateway or Istio) named `llm-d-inference-gateway` deployed in the cluster, or change the values file accordingly.
+
+   ```
+   helm template llmd-sim . -f examples/values-vllm-sim.yaml > examples/output-vllm-sim.yaml
+   ```
+
+   Remove `protocol: tcp` from `initContainers`, and remove the `readinessProbe` and `livenessProbe` from the epp deployment.
+
+
+2. `facebook/opt-125m`: downloads the model from Hugging Face
+
+   ```
+   helm template facebook . 
-f examples/values-facebook.yaml > examples/output-facebook.yaml + ``` + + +Port forward the inference gateway + +``` +k port-forward svc/llm-d-inference-gateway-istio 8000:80 +``` + +Send a request + +``` +curl http://localhost:8000/v1/completions -vvv \ + -H "Content-Type: application/json" \ + -H "x-model-name: facebook/opt-125m" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "Hello, " +}' +``` \ No newline at end of file diff --git a/helm/examples/output-facebook.yaml b/helm/examples/output-facebook.yaml new file mode 100644 index 0000000..3313c67 --- /dev/null +++ b/helm/examples/output-facebook.yaml @@ -0,0 +1,365 @@ +--- +# Source: llm-d-modelservice/templates/epp-sa.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/epp-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-service + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9002 + targetPort: 9002 + protocol: TCP + appProtocol: http2 + selector: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook-sim-test + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=5 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - facebook/opt-125m + - --port + - "8200" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_HOME + value: /model-cache + + resources: + limits: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + volumeMounts: + - name: model-storage + 
mountPath: /model-cache + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/epp-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-epp + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + template: + metadata: + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + args: + - --poolName + - facebook-sim-test-llm-d-modelservice-inference-pool + - --poolNamespace + - default + - -v + - "4" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: facebook-sim-test-llm-d-modelservice-epp-sa + serviceAccountName: facebook-sim-test-llm-d-modelservice-epp-sa + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-prefill + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + spec: + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - facebook/opt-125m + - --port + - "8000" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - 
name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + + resources: + limits: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: facebook-sim-test-llm-d-modelservice-http-route + namespace: default + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool + port: 8000 + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: facebook/opt-125m + path: + type: PathPrefix + value: / +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-model + namespace: default + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test +spec: + modelName: facebook/opt-125m + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-pool + namespace: default +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: facebook-sim-test-llm-d-modelservice-epp-service + selector: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + targetPortNumber: 8000 diff --git a/helm/examples/output-vllm-sim.yaml b/helm/examples/output-vllm-sim.yaml new file mode 100644 index 0000000..547b20e --- /dev/null +++ b/helm/examples/output-vllm-sim.yaml @@ -0,0 +1,302 @@ +--- +# Source: llm-d-modelservice/templates/epp-sa.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/epp-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-service + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9002 + targetPort: 9002 + protocol: TCP + appProtocol: http2 + selector: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook-sim-test + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=5 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-inference-sim:0.0.4 + args: + - --model + - facebook/opt-125m + - --port + - "8200" + env: + - name: HF_HOME + value: /model-cache + + resources: + limits: + {} + requests: + {} + volumeMounts: + - name: model-storage + mountPath: /model-cache + volumes: +--- +# Source: llm-d-modelservice/templates/epp-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-epp + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + template: + metadata: + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + args: + - --poolName + - facebook-sim-test-llm-d-modelservice-inference-pool + - --poolNamespace + - default + - -v + - "6" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: facebook-sim-test-llm-d-modelservice-epp-sa + serviceAccountName: facebook-sim-test-llm-d-modelservice-epp-sa +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-prefill + 
labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + spec: + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-inference-sim:0.0.4 + args: + - --model + - facebook/opt-125m + - --port + - "8000" + env: + - name: HF_HOME + value: /model-cache + + resources: + limits: + {} + requests: + {} + volumeMounts: + - name: model-storage + mountPath: /model-cache + volumes: +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: facebook-sim-test-llm-d-modelservice-http-route + namespace: default + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: llm-d-inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool + port: 8000 + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: facebook/opt-125m + path: + type: PathPrefix + value: / +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-model + namespace: default + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test +spec: + modelName: facebook/opt-125m + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-pool + namespace: default +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: facebook-sim-test-llm-d-modelservice-epp-service + selector: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + targetPortNumber: 8000 diff --git a/helm/examples/values-deepseek.yaml b/helm/examples/values-deepseek.yaml new file mode 100644 index 0000000..f8a6590 --- /dev/null +++ b/helm/examples/values-deepseek.yaml @@ -0,0 +1,377 @@ +# This values.yaml file creates the resources for deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + +# If true, use a LeaderWorkerSet instead of a Deployment to host the model +multinode: true +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + servicePort: 8080 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + debugLevel: 5 + parentRefs: + - name: inference-gateway + +modelArtifacts: + # When specfying the URI with `hf` prefix, the / string + # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} + + uri: "pvc://tms-hf-cache/model-cache" + authSecretName: "hf-secret" + size: 5Mi + +# describe decode pods +decode: + autoscaling: + enabled: false + replicas: 1 + + parallelism: + tensor: 1 + data: 1 + dataLocal: 1 + + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + defaultMode: 0755 + name: vllm-init-scripts-config + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + # - name: hf-cache + # persistentVolumeClaim: + # claimName: tms-hf-cache + - name: vllm + persistentVolumeClaim: + claimName: tms-vllm + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-pplx:0.1.0" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + ################# + # Install vLLM + ################# + VLLM_USE_PRECOMPILED=1 /init-scripts/vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager \ + --headless + fi + env: + - name: CUDA_LAUNCH_BLOCKING + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + #- name: VLLM_USE_DEEP_GEMM + # value: "1" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # Needed for GDRCOPY to be used. 
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + - name: NVSHMEM_DEBUG + value: "INFO" + # Uncomment for debugging + #- name: NVSHMEM_DEBUG_SUBSYS + # value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibgda" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + #MK - name: HF_HUB_CACHE + #MK value: /huggingface-cache + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: + - "IPC_LOCK" + - "SYS_RAWIO" + resources: + limits: + nvidia.com/gpu: 1 + memory: 64Gi + ephemeral-storage: 64Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 64Gi + nvidia.com/gpu: 1 + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + #MK - name: hf-cache + #MK mountPath: /huggingface-cache + - name: vllm + mountPath: /code + mountModelVolume: true + +# describe prefill pods +prefill: + autoscaling: + enabled: false + replicas: 1 + + parallelism: + tensor: 1 + data: 1 + dataLocal: 1 + + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + defaultMode: 0755 + name: vllm-init-scripts-config + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + # - name: hf-cache + # persistentVolumeClaim: + # claimName: tms-hf-cache + - name: vllm + persistentVolumeClaim: + claimName: tms-vllm + + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-pplx:0.1.0" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + ################# + # Install vLLM + ################# + VLLM_USE_PRECOMPILED=1 /init-scripts/vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --headless + fi + env: + - name: CUDA_LAUNCH_BLOCKING + value: "1" + - name: 
VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + #- name: VLLM_USE_DEEP_GEMM + # value: "1" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # Needed for GDRCOPY to be used. + # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + - name: NVSHMEM_DEBUG + value: "INFO" + # Uncomment for debugging + #- name: NVSHMEM_DEBUG_SUBSYS + # value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibgda" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + #MK - name: HF_HUB_CACHE + #MK value: /huggingface-cache + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: + - "IPC_LOCK" + - "SYS_RAWIO" + resources: + limits: + nvidia.com/gpu: 1 + memory: 64Gi + ephemeral-storage: 64Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 64Gi + nvidia.com/gpu: 1 + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + #MK - name: hf-cache + #MK mountPath: /huggingface-cache + - name: vllm + mountPath: /code + mountModelVolume: true + +endpointPicker: + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + debugLevel: 5 + service: + type: ClusterIP + port: 9002 + targetPort: 9002 + appProtocol: http2 + # The name of cluster role containing permissions to be granted to endpointPicker (via a role binding to the service account) + permissions: pod-read + + autoscaling: + enabled: false + replicas: 1 diff --git a/helm/examples/values-facebook.yaml b/helm/examples/values-facebook.yaml new file mode 100644 index 0000000..3382040 --- /dev/null +++ b/helm/examples/values-facebook.yaml @@ -0,0 +1,116 @@ +# This values.yaml file creates the resources for facebook/opt-125m + +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + servicePort: 8000 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + parentRefs: + - name: inference-gateway + +modelArtifacts: + uri: "hf://facebook/opt-125m" + +# describe decode pods +decode: + enableService: false + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 5557 + protocol: TCP + resources: + limits: + memory: 16Gi + cpu: "16" + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + mountModelVolume: true + +# describe the prefill pods (looks the same as above) +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 8000 + protocol: TCP + - containerPort: 5557 + protocol: TCP + resources: + limits: + memory: 16Gi + cpu: "16" + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + +endpointPicker: + service: + type: ClusterIP + port: 9002 + targetPort: 9002 + appProtocol: http2 + # The name of cluster role containing permissions to be granted to endpointPicker (via a role binding to the service account) + permissions: pod-read + + autoscaling: + enabled: false + replicas: 1 diff --git a/helm/examples/values-vllm-sim.yaml b/helm/examples/values-vllm-sim.yaml new file mode 100644 index 0000000..3c4f159 --- /dev/null +++ b/helm/examples/values-vllm-sim.yaml @@ -0,0 +1,78 @@ +# This values.yaml file creates the resources for random + +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: random + servicePort: 8000 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: llm-d-inference-gateway + +modelArtifacts: + uri: "hf://random" + size: 5Mi + +# describe decode pods +decode: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4" + args: + - "--model" + - "random" + - "--port" + - "8200" # targetPort + ports: + - containerPort: 5557 + protocol: TCP + mountModelVolume: true +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4" + args: + - "--model" + - "random" + - "--port" + - "8000" # servicePort + ports: + - containerPort: 5557 + protocol: TCP + mountModelVolume: true + +endpointPicker: + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 9002 + targetPort: 9002 + appProtocol: http2 + debugLevel: 6 + disableReadinessProbe: true + disableLivenessProbe: true + + autoscaling: + enabled: false + replicas: 1 + + +curl http://localhost:8000/v1/completions -vvv \ + -H "Content-Type: application/json" \ + -H "x-model-name: facebook/opt-125m" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "Hello, " +}' \ No newline at end of file diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..d6d74d7 --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,321 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "llm-d-modelservice.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llm-d-modelservice.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llm-d-modelservice.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llm-d-modelservice.labels" -}} +helm.sh/chart: {{ include "llm-d-modelservice.chart" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* Sanitized model name (DNS compliant) */}} +{{- define "llm-d-modelservice.sanitizedModelName" -}} + {{- $name := .Release.Name | lower | trim -}} + {{- $name = regexReplaceAll "[^a-z0-9_.-]" $name "-" -}} + {{- $name = regexReplaceAll "^[\\-._]+" $name "" -}} + {{- $name = regexReplaceAll "[\\-._]+$" $name "" -}} + {{- $name = regexReplaceAll "\\." $name "-" -}} + + {{- if gt (len $name) 63 -}} + {{- $name = substr 0 63 $name -}} + {{- end -}} + +{{- $name -}} +{{- end }} + +{{/* Common P/D labels */}} +{{- define "llm-d-modelservice.pdlabels" -}} +llm-d.ai/inferenceServing: "true" +llm-d.ai/model: {{ (include "llm-d-modelservice.fullname" .) -}} +{{- end }} + +{{/* prefill labels */}} +{{- define "llm-d-modelservice.prefilllabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: prefill +{{- end }} + +{{/* decode labels */}} +{{- define "llm-d-modelservice.decodelabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: decode +{{- end }} + +{{/* affinity from acceleratorTypes */}} +{{- define "llm-d-modelservice.acceleratorTypes" -}} +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .labelKey }} + operator: In + {{- with .labelValues }} + values: + {{- toYaml . | nindent 14 }} + {{- end }} +{{- end }} + +{{/* Routing proxy -- sidecar for decode pods */}} +{{- define "llm-d-modelservice.routingProxy" -}} +initContainers: + - name: routing-proxy + args: + - --port={{ default 8080 .servicePort }} + - --vllm-port={{ default 8200 .proxy.targetPort }} + - --connector=nixlv2 + - -v={{ default 5 .proxy.debugLevel }} + image: {{ default "ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6" .proxy.image }} + imagePullPolicy: Always + ports: + - containerPort: {{ default 8080 .servicePort }} + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true +{{- end }} + +{{/* Desired P/d tensor parallelism -- user set or defaults to 1 */}} +{{- define "llm-d-modelservice.tensorParallelism" -}} +{{- if and . .tensor }}{{ .tensor }}{{ else }}1{{ end }} +{{- end }} + +{{/* Desired P/D data parallelism -- user set or defaults to 1 */}} +{{- define "llm-d-modelservice.dataParallelism" -}} +{{- if and . .data }}{{ .data }}{{ else }}1{{ end }} +{{- end }} + +{{/* +Port on which vllm container should listen. 
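+Prefill pods receive requests directly on routing.servicePort; decode pods sit behind the routing-proxy sidecar, so their vllm container listens on routing.proxy.targetPort instead.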
+Context is helm root context plus key "role" ("decode" or "prefill") +*/}} +{{- define "llm-d-modelservice.vllmPort" -}} +{{- if eq .role "prefill" }}{{ .Values.routing.servicePort }}{{ else }}{{ .Values.routing.proxy.targetPort }}{{ end }} +{{- end }} + +{{/* P/D deployment container resources */}} +{{- define "llm-d-modelservice.resources" -}} +{{- $tensorParallelism := int (include "llm-d-modelservice.tensorParallelism" .parallelism) -}} +{{- $limits := dict }} +{{- if and .resources .resources.limits }} +{{- $limits = deepCopy .resources.limits }} +{{- end }} +{{- if gt (int $tensorParallelism) 1 }} +{{- $limits = mergeOverwrite $limits (dict "nvidia.com/gpu" $tensorParallelism) }} +{{- end }} +{{- $requests := dict }} +{{- if and .resources .resources.requests }} +{{- $requests = deepCopy .resources.requests }} +{{- end }} +{{- if gt (int $tensorParallelism) 1 }} +{{- $requests = mergeOverwrite $requests (dict "nvidia.com/gpu" $tensorParallelism) }} +{{- end }} +resources: + limits: + {{- toYaml $limits | nindent 4 }} + requests: + {{- toYaml $requests | nindent 4 }} +{{- end }} + +{{/* P/D service account name */}} +{{- define "llm-d-modelservice.pdServiceAccountName" -}} +{{ include "llm-d-modelservice.fullname" . }}-sa +{{- end }} + +{{/* +EPP service account name +Context is helm root context +*/}} +{{- define "llm-d-modelservice.eppServiceAccountName" -}} +{{ include "llm-d-modelservice.fullname" . }}-epp-sa +{{- end }} + +{{/* +Volumes for PD containers based on model artifact prefix +Context is .Values.modelArtifacts +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumes" -}} +{{- $parsedArtifacts := regexSplit "://" .uri -1 -}} +{{- $protocol := first $parsedArtifacts -}} +{{- $path := last $parsedArtifacts -}} +{{- if eq $protocol "hf" -}} +- name: model-storage + emptyDir: + sizeLimit: {{ default "0" .size }} +{{- else if eq $protocol "pvc" }} +{{- $parsedArtifacts := regexSplit "/" $path -1 -}} +{{- $claim := first $parsedArtifacts -}} +- name: model-storage + persistentVolumeClaim: + claimName: {{ $claim }} + readOnly: true +{{- else if eq $protocol "oci" }} +- name: model-storage + image: + reference: {{ $path }} + pullPolicy: {{ default "Always" .imagePullPolicy }} +{{- end }} +{{- end }} + +{{/* +VolumeMount for a PD container +Supplies model-storage mount if mountModelVolume: true for the container +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumeMounts" -}} +{{- if or .volumeMounts .mountModelVolume }} +volumeMounts: +{{- end }} +{{- /* user supplied volume mount in values */}} +{{- with .volumeMounts }} + {{- toYaml . | nindent 8 }} +{{- end }} +{{- /* what we add if mounModelVolume is true */}} +{{- if .mountModelVolume }} + - name: model-storage + mountPath: /model-cache +{{- end }} +{{- end }} + +{{/* +Pod elements of deployment/lws spec template +context is a pdSpec +*/}} +{{- define "llm-d-modelservice.modelPod" -}} + {{- with .pdSpec.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 2 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + {{- with .pdSpec.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .pdSpec.acceleratorTypes }} + {{- include "llm-d-modelservice.acceleratorTypes" . 
| nindent 2 }} + {{- end }} + {{- if or .pdSpec.volumes .pdSpec.mountModelVolume }} + volumes: + {{- toYaml .pdSpec.volumes | nindent 4 }} + {{ include "llm-d-modelservice.mountModelVolumeVolumes" .Values.modelArtifacts | nindent 4}} + {{- end }} +{{- end }} {{- /* define "llm-d-modelservice.modelPod" */}} + +{{/* +Container elements of deployment/lws spec template +context is a dict with helm root context plus: + key - "container"; value - container spec + key - "roll"; value - either "decode" or "prefill" + key - "parallelism"; value - $.Values.decode.parallelism +*/}} +{{- define "llm-d-modelservice.container" -}} +- name: {{ default "vllm" .container.name }} + image: {{ required "image of container is required" .container.image }} + {{- with .container.securityContext }} + securityContext: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .container.imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .container.command }} + command: + {{- toYaml . | nindent 2 }} + {{- end }} + args: + - {{ .Values.routing.modelName | quote }} + - --port + - {{ (include "llm-d-modelservice.vllmPort" .) | quote }} + {{- with .container.args }} + {{- toYaml . | nindent 2 }} + {{- end }} + {{- /* insert user's env for this container */}} + {{- if or .container.env .container.mountModelVolume }} + env: + {{- end }} + {{- with .container.env }} + {{- toYaml . | nindent 2 }} + {{- end }} + - name: DP_SIZE + value: {{ include "llm-d-modelservice.tensorParallelism" .parallelism | quote }} + - name: TP_SIZE + value: {{ include "llm-d-modelservice.dataParallelism" .parallelism | quote }} + - name: DP_SIZE_LOCAL + value: "1" + {{- /* insert envs based on what modelArtifact prefix */}} + {{- if .container.mountModelVolume }} + - name: HF_HOME + value: /model-cache + {{- with .Values.modelArtifacts.authSecretName }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ . }} + key: HF_TOKEN + {{- end }} + {{- end }} + {{- with .container.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 2 }} + {{- end }} + {{- with .container.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 2 }} + {{- end }} + {{- (include "llm-d-modelservice.resources" (dict "resources" .container.resources "parallelism" .parallelism)) | nindent 2 }} + {{- /* volumeMount */}} + {{- if or .container.volumeMounts .container.mountModelVolume }} + volumeMounts: + {{- end -}} + {{- /* user supplied volume mount in values */}} + {{- with .container.volumeMounts }} + {{- toYaml . | nindent 2 }} + {{- end }} + {{- /* what we add if mounModelVolume is true */}} + {{- if .container.mountModelVolume }} + - name: model-storage + mountPath: /model-cache + {{- end }} + {{- with .container.workingDir }} + workingDir: {{ . }} + {{- end }} + {{- with .container.stdin }} + stdin: {{ . }} + {{- end }} + {{- with .container.tty }} + tty: {{ . }} + {{- end }} +{{- end }} {{- /* define "llm-d-modelservice.container" */}} diff --git a/helm/templates/decode-deployment.yaml b/helm/templates/decode-deployment.yaml new file mode 100644 index 0000000..52bc147 --- /dev/null +++ b/helm/templates/decode-deployment.yaml @@ -0,0 +1,28 @@ +{{- if and .Values.decode (not .Values.multinode) }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.decode.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.decodelabels" . 
| nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 8 }} + spec: + {{- with .Values.routing }} + {{- (include "llm-d-modelservice.routingProxy" .) | nindent 6 }} + {{- end }} + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 4 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/decode-lws.yaml b/helm/templates/decode-lws.yaml new file mode 100644 index 0000000..a3e840d --- /dev/null +++ b/helm/templates/decode-lws.yaml @@ -0,0 +1,44 @@ +{{- if and .Values.decode .Values.multinode }} +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- include "llm-d-modelservice.decodelabels" . | nindent 4 }} +spec: + {{- if not .Values.decode.autoscaling.enabled }} + replicas: {{ default 1 .Values.decode.replicas }} + {{- end }} + leaderWorkerTemplate: + size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.decode.parallelism) }} + leaderTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- with .Values.routing }} + {{ (include "llm-d-modelservice.routingProxy" .) | nindent 8 }} + {{- end }} + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 6 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + + workerTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 6 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + +{{- end }} diff --git a/helm/templates/epp-deployment.yaml b/helm/templates/epp-deployment.yaml new file mode 100644 index 0000000..825d0bc --- /dev/null +++ b/helm/templates/epp-deployment.yaml @@ -0,0 +1,108 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + template: + metadata: + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: {{ default "ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3" .Values.endpointPicker.image }} + args: + - --poolName + - {{ include "llm-d-modelservice.fullname" . 
}}-inference-pool + - --poolNamespace + - {{ .Release.Namespace }} + - -v + - "{{ default 4 .Values.endpointPicker.debugLevel }}" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + # using defaults from https://github.com/llm-d/llm-d-deployer/blob/main/charts/llm-d/values.yaml#L563-L603 + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + {{- if (not .Values.endpointPicker.disableReadinessProbe) }} + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + {{- end }} + {{- if (not .Values.endpointPicker.disableLivenessProbe) }} + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + {{- end }} + serviceAccount: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }} \ No newline at end of file diff --git a/helm/templates/epp-sa.yaml b/helm/templates/epp-sa.yaml new file mode 100644 index 0000000..5503b9d --- /dev/null +++ b/helm/templates/epp-sa.yaml @@ -0,0 +1,8 @@ +{{- if .Values.endpointPicker -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +{{- end }} diff --git a/helm/templates/epp-service.yaml b/helm/templates/epp-service.yaml new file mode 100644 index 0000000..9c182a9 --- /dev/null +++ b/helm/templates/epp-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp-service + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.endpointPicker.service.port }} + targetPort: {{ .Values.endpointPicker.service.targetPort }} + protocol: TCP + appProtocol: {{ .Values.endpointPicker.service.appProtocol }} + selector: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . 
}}-epp diff --git a/helm/templates/prefill-deployment.yaml b/helm/templates/prefill-deployment.yaml new file mode 100644 index 0000000..1be4784 --- /dev/null +++ b/helm/templates/prefill-deployment.yaml @@ -0,0 +1,25 @@ +{{- if and .Values.prefill (not .Values.multinode) }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-prefill + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.prefill.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 8 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 4 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/prefill-lws.yaml b/helm/templates/prefill-lws.yaml new file mode 100644 index 0000000..618633a --- /dev/null +++ b/helm/templates/prefill-lws.yaml @@ -0,0 +1,41 @@ +{{- if and .Values.prefill .Values.multinode }} +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-prefill + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- include "llm-d-modelservice.prefilllabels" . | nindent 4 }} +spec: + {{- if not .Values.prefill.autoscaling.enabled }} + replicas: {{ default 1 .Values.prefill.replicas }} + {{- end }} + leaderWorkerTemplate: + size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.prefill.parallelism) }} + leaderTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 6 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + + workerTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 6 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + +{{- end }} diff --git a/helm/templates/rolebinding.yaml b/helm/templates/rolebinding.yaml new file mode 100644 index 0000000..e9ad13b --- /dev/null +++ b/helm/templates/rolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ required "role for inference schedule required" .Values.endpointPicker.permissions }} +subjects: +- kind: ServiceAccount + name: {{ include "llm-d-modelservice.eppServiceAccountName" . 
}} \ No newline at end of file diff --git a/helm/templates/routing.yaml b/helm/templates/routing.yaml new file mode 100644 index 0000000..8e7c42c --- /dev/null +++ b/helm/templates/routing.yaml @@ -0,0 +1,66 @@ +{{- /* Routing templates: InferencePool, InferenceModel, and HttpRoute */}} +{{- if .Values.inferencePool }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + namespace: {{ .Release.Namespace }} +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: {{ include "llm-d-modelservice.fullname" . }}-epp-service + selector: + {{- if .Values.multinode }} + leaderworkerset.sigs.k8s.io/worker-index: "0" + {{- end }} + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} + targetPortNumber: {{ .Values.routing.servicePort }} +{{- end }} +--- +{{- if .Values.inferenceModel }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-model + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} +spec: + modelName: {{ .Values.routing.modelName }} + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool +{{- end }} +--- +{{- if .Values.httpRoute }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-http-route + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + {{- with .Values.routing.parentRefs }} + parentRefs: + {{- . | toYaml | nindent 2}} + {{- end }} + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + port: {{ .Values.routing.servicePort }} + weight: 1 + matches: + - path: + type: PathPrefix + value: / + headers: + - name: x-model-name + type: Exact + value: {{ .Values.routing.modelName }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..463b584 --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,8 @@ +{{- if or .Values.prefill .Values.decode -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +{{- end }} diff --git a/helm/values-msvc.yaml b/helm/values-msvc.yaml new file mode 100644 index 0000000..2fa22fa --- /dev/null +++ b/helm/values-msvc.yaml @@ -0,0 +1,98 @@ +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + ports: + servicePort: 8000 # Sidecar listens on this port for requests. 
+    internalPort: 8200 # Sidecar forwards request to vllm container on this port
+  proxy:
+    targetPort: 8000
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: inference-gateway-kgateway
+
+modelArtifacts:
+  # When specifying the URI with the `hf` prefix, the / string
+  # is extracted and exposed as a template variable that can be used as {{ .HFModelName }}
+  prefix: "oci"
+  artifact: facebook/opt-125m
+  authSecretName: "hf-secret"
+  size: 5Mi
+  imagePullPolicy: IfNotPresent
+
+# describe decode pods
+decode:
+  enableService: false
+  replicas: 1
+
+  # for LWS
+  parallelism:
+    tensor: 8
+    data: 1
+    dataLocal: 1
+
+  acceleratorTypes:
+    labelKey: nvidia.com/gpu.product
+    labelValues:
+      # According to the blog, Scout requires H100s
+      - NVIDIA-H100
+  # initContainers:
+  containers:
+  - name: "vllm"
+    image: "vllm-ai/vllm:latest"
+    args:
+    - "HFModelName"
+    env:
+    - name: "VLLM_LOG_LEVEL"
+      value: "DEBUG" # Set to DEBUG for more detailed logs, or INFO for less verbose logs
+    envFrom:
+    - configMapRef:
+        name: vllm-config
+    resources:
+      requests:
+        cpu: "1" # Request 1 CPU core
+        memory: "4Gi" # Request 4 GiB of memory
+      limits:
+        cpu: "2" # Limit to 2 CPU cores
+        memory: "8Gi" # Limit to 8 GiB of memory
+    mountModelVolume: true
+
+# describe the prefill pods (looks the same as above)
+prefill:
+  replicas: 1
+  containers:
+  - name: "vllm"
+    image: "vllm-ai/vllm:latest"
+    args:
+    - "HFModelName"
+    env:
+    - name: ok
+      value: ok
+    mountModelVolume: true
+  - name: "v2"
+    image: "vllm-ai/vllm:latest"
+    volumeMounts:
+    - name: whatever
+      mountPath: something
+  volumes:
+  - name: ok
+    emptyDir:
+      sizeLimit: 5Gi
+  - name: ok2
+    emptyDir:
+      sizeLimit: 5Gi
+
+endpointPicker:
+  # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
+  service:
+    # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+    type: ClusterIP
+    # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+    port: 9002
+    targetPort: 9002
+    appProtocol: http2
diff --git a/helm/values.yaml b/helm/values.yaml
new file mode 100644
index 0000000..3e5575d
--- /dev/null
+++ b/helm/values.yaml
@@ -0,0 +1,123 @@
+# Default values for llm-d-modelservice.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
+replicaCount: 1
+
+# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
+image:
+  repository: nginx
+  # This sets the pull policy for images.
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+  tag: ""
+
+# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
+imagePullSecrets: []
+# This is to override the chart name.
+nameOverride: ""
+fullnameOverride: ""
+
+# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # Automatically mount a ServiceAccount's API credentials?
+  automount: true
+  # Annotations to add to the service account
+  annotations: {}
+  # The name of the service account to use.
+  # If not set and create is true, a name is generated using the fullname template
+  name: ""
+
+# This is for setting Kubernetes Annotations to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
+podAnnotations: {}
+# This is for setting Kubernetes Labels to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+podLabels: {}
+
+podSecurityContext: {}
+  # fsGroup: 2000
+
+securityContext: {}
+  # capabilities:
+  #   drop:
+  #   - ALL
+  # readOnlyRootFilesystem: true
+  # runAsNonRoot: true
+  # runAsUser: 1000
+
+# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
+service:
+  # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+  type: ClusterIP
+  # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+  port: 80
+
+# This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
+ingress:
+  enabled: false
+  className: ""
+  annotations: {}
+    # kubernetes.io/ingress.class: nginx
+    # kubernetes.io/tls-acme: "true"
+  hosts:
+    - host: chart-example.local
+      paths:
+        - path: /
+          pathType: ImplementationSpecific
+  tls: []
+  #  - secretName: chart-example-tls
+  #    hosts:
+  #      - chart-example.local
+
+resources: {}
+  # We usually recommend not to specify default resources and to leave this as a conscious
+  # choice for the user. This also increases chances charts run on environments with little
+  # resources, such as Minikube. If you do want to specify resources, uncomment the following
+  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+
+# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
+livenessProbe:
+  httpGet:
+    path: /
+    port: http
+readinessProbe:
+  httpGet:
+    path: /
+    port: http
+
+# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
+autoscaling:
+  enabled: false
+  minReplicas: 1
+  maxReplicas: 100
+  targetCPUUtilizationPercentage: 80
+  # targetMemoryUtilizationPercentage: 80
+
+# Additional volumes on the output Deployment definition.
+volumes: []
+# - name: foo
+#   secret:
+#     secretName: mysecret
+#     optional: false
+
+# Additional volumeMounts on the output Deployment definition.
+volumeMounts: []
+# - name: foo
+#   mountPath: "/etc/foo"
+#   readOnly: true
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
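
For reference, below is a minimal values sketch (hypothetical numbers and a placeholder image, not a tested configuration) showing only the keys the prefill templates above read. With `multinode: false` these values render the Deployment from prefill-deployment.yaml; with `multinode: true` they render the LeaderWorkerSet from prefill-lws.yaml, whose `leaderWorkerTemplate.size` is computed from the `parallelism` block by the `llm-d-modelservice.dataParallelism` helper.

```yaml
# Hypothetical override values -- a sketch, assuming only the keys consumed by the
# prefill templates above (multinode, prefill.replicas, prefill.autoscaling,
# prefill.parallelism, prefill.containers).
multinode: true            # true -> prefill-lws.yaml (LeaderWorkerSet); false -> prefill-deployment.yaml
prefill:
  replicas: 1              # the LWS template only emits replicas when autoscaling is disabled
  autoscaling:
    enabled: false
  parallelism:
    tensor: 1
    data: 2                # leaderWorkerTemplate.size is derived from this block
    dataLocal: 1
  containers:
  - name: vllm
    image: ghcr.io/llm-d/llm-d:0.0.8   # placeholder image for illustration
    mountModelVolume: true
```

The same `prefill.containers` list and `parallelism` values are passed to the `llm-d-modelservice.container` helper on both paths, so toggling `multinode` does not require rewriting the container spec.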