diff --git a/helm/.helmignore b/helm/.helmignore new file mode 100644 index 0000000..2b2f68b --- /dev/null +++ b/helm/.helmignore @@ -0,0 +1,28 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +# ReadMEs +*.md +# Rendered templates +values-*.yaml +output-*.yaml \ No newline at end of file diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 0000000..300e738 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: llm-d-modelservice +description: A Helm chart for ModelService + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.0.1 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "0.0.1" diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..92a620c --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,203 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "llm-d-modelservice.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llm-d-modelservice.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llm-d-modelservice.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llm-d-modelservice.labels" -}} +helm.sh/chart: {{ include "llm-d-modelservice.chart" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* Sanitized model name (DNS compliant) */}} +{{- define "llm-d-modelservice.sanitizedModelName" -}} + {{- $name := .Release.Name | lower | trim -}} + {{- $name = regexReplaceAll "[^a-z0-9_.-]" $name "-" -}} + {{- $name = regexReplaceAll "^[\\-._]+" $name "" -}} + {{- $name = regexReplaceAll "[\\-._]+$" $name "" -}} + {{- $name = regexReplaceAll "\\." $name "-" -}} + + {{- if gt (len $name) 63 -}} + {{- $name = substr 0 63 $name -}} + {{- end -}} + +{{- $name -}} +{{- end }} + +{{/* Common P/D labels */}} +{{- define "llm-d-modelservice.pdlabels" -}} +llm-d.ai/inferenceServing: "true" +llm-d.ai/model: {{ (include "llm-d-modelservice.sanitizedModelName" .) -}} +{{- end }} + +{{/* prefill labels */}} +{{- define "llm-d-modelservice.prefilllabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: prefill +{{- end }} + +{{/* decode labels */}} +{{- define "llm-d-modelservice.decodelabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: decode +{{- end }} + +{{/* affinity from acceleratorTypes */}} +{{- define "llm-d-modelservice.acceleratorTypes" -}} +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .labelKey }} + operator: In + {{- with .labelValues }} + values: + {{- toYaml . | nindent 14 }} + {{- end }} +{{- end }} + +{{/* Routing proxy -- sidecar for decode pods */}} +{{- define "llm-d-modelservice.routingProxy" -}} +initContainers: + - name: routing-proxy + args: + - --port={{ default 8080 .servicePort }} + - --vllm-port={{ default 8200 .proxy.targetPort }} + - --connector=nixlv2 + - -v={{ default 5 .proxy.debugLevel }} + image: {{ .image }} + imagePullPolicy: Always + ports: + - containerPort: {{ default 8080 .servicePort }} + protocol: TCP + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true +{{- end }} + +{{- define "llm-d-modelservice.parallelism" -}} +{{- $parallelism := dict "tensor" 1 "data" 1 -}} +{{- if and . .tensor }} +{{- $parallelism = mergeOverwrite $parallelism (dict "tensor" .tensor) -}} +{{- end }} +{{- if and . .data }} +{{- $parallelism = mergeOverwrite $parallelism (dict "data" .data) -}} +{{- end }} +{{- $parallelism | toYaml | nindent 0 }} +{{- end }} + +{{- define "llm-d-modelservice.resources" -}} +resources: + limits: + {{- $limits := dict -}} + {{- if and .resources .resources.limits -}} + {{- $limits = omit .resources.limits "nvidia.com/gpu" }} + {{- if gt (len $limits) 0 }} + {{- toYaml $limits | nindent 4 }} + {{- end }} + {{- end }} + nvidia.com/gpu: {{ .parallelism.tensor }} + requests: + {{- $requests := dict -}} + {{- if and .resources .resources.requests -}} + {{- $requests = omit .resources.requests "nvidia.com/gpu" }} + {{- end }} + {{- if gt (len $requests) 0 }} + {{- toYaml $requests | nindent 4 }} + {{- end }} + nvidia.com/gpu: {{ .parallelism.tensor }} +{{- end }} + +{{/* P/D service account name */}} +{{- define "llm-d-modelservice.pdServiceAccountName" -}} +{{ include "llm-d-modelservice.sanitizedModelName" . }}-sa +{{- end }} + +{{/* EPP service account name */}} +{{- define "llm-d-modelservice.eppServiceAccountName" -}} +{{ include "llm-d-modelservice.sanitizedModelName" . 
}}-epp-sa +{{- end }} + +{{/* +EPP selector labels +*/}} +{{- define "llm-d-modelservice.eppSelectorLabels" -}} +app.kubernetes.io/name: {{ include "llm-d-modelservice.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp +{{- end }} + +{{/* +Volumes for PD containers based on model artifact prefix +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumes" -}} +{{- if eq .Values.modelArtifacts.prefix "hf" -}} +- name: model-storage + emptyDir: + sizeLimit: {{ default "0" .Values.modelArtifacts.size }} +{{- else if eq .Values.modelArtifacts.prefix "pvc" }} +- name: model-storage + persistentVolumeClaim: + claimName: {{ .Values.modelArtifacts.artifact }} + readOnly: true +{{- else if eq .Values.modelArtifacts.prefix "oci" }} +- name: model-storage + image: + reference: {{ .Values.modelArtifacts.artifact }} + pullPolicy: {{ default "Always" .Values.modelArtifacts.imagePullPolicy }} +{{- end }} +{{- end }} + +{{/* +VolumeMount for a PD container +Supplies model-storage mount if mountModelVolume: true for the container +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumeMounts" -}} +{{- if or .volumeMounts .mountModelVolume }} +volumeMounts: +{{- end }} +{{- /* user supplied volume mount in values */}} +{{- with .volumeMounts }} + {{- toYaml . | nindent 8 }} +{{- end }} +{{- /* what we add if mounModelVolume is true */}} +{{- if .mountModelVolume }} + - name: model-storage + mountPath: /model-cache +{{- end }} +{{- end }} diff --git a/helm/templates/decode-deployment.yaml b/helm/templates/decode-deployment.yaml new file mode 100644 index 0000000..e1be745 --- /dev/null +++ b/helm/templates/decode-deployment.yaml @@ -0,0 +1,106 @@ +{{- $parallelism := (include "llm-d-modelservice.parallelism" .Values.decode.parallelism) | fromYaml -}} +{{- if .Values.decode }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.decode.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 8 }} + spec: + {{- with .Values.decode.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.decode.acceleratorTypes }} + {{- include "llm-d-modelservice.acceleratorTypes" . | nindent 6 }} + {{- end }} + {{- /* initContainers */}} + {{- with .Values.decode.initContainers }} + initContainers: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- /* range $.Values.decode.containers */}} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + - name: {{ default "vllm" .name }} + image: {{ required "image of container is required" .image }} + {{- with .securityContext }} + securityContext: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .command }} + command: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .args }} + args: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- /* insert user's env for this container */}} + {{- if or .env .mountModelVolume }} + env: + {{- end }} + {{- with .env }} + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- /* insert envs based on what modelArtifact prefix */}} + {{- if .mountModelVolume }} + - name: HF_HOME + value: /model-cache + {{- with $.Values.modelArtifacts.authSecretName }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ . }} + key: HF_TOKEN + {{- end }} + {{- end }} + {{- with .livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .resources }} + resources: + limits: + {{- if .limits -}} + {{- omit .limits "nvidia.com/gpu" | toYaml | nindent 12 }} + {{- end }} + {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}} + requests: + {{- if .limits -}} + {{- omit .requests "nvidia.com/gpu" | toYaml | nindent 12 }} + {{- end }} + {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}} + {{- end }} + {{- /* volumeMount */}} + {{- include "llm-d-modelservice.mountModelVolumeVolumeMounts" . | nindent 8 }} + {{- end }} + {{- end }} + volumes: + {{- with .Values.decode.volumes }} + {{- toYaml . | nindent 8 }} + {{- end -}} + {{- include "llm-d-modelservice.mountModelVolumeVolumes" . | nindent 8 }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/decode-lws.yaml b/helm/templates/decode-lws.yaml new file mode 100644 index 0000000..9300c09 --- /dev/null +++ b/helm/templates/decode-lws.yaml @@ -0,0 +1,151 @@ +{{- $parallelism := (include "llm-d-modelservice.parallelism" .Values.decode.parallelism) | fromYaml -}} +{{- if and $parallelism.data (gt (int $parallelism.data) 1) }} +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- include "llm-d-modelservice.decodelabels" . | nindent 4 }} +spec: + {{- if not .Values.decode.autoscaling.enabled }} + replicas: {{ default 1 .Values.decode.replicas }} + {{- end }} + leaderWorkerTemplate: + size: {{ .Values.decode.parallelism.data }} + leaderTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- with .Values.routing }} + {{ (include "llm-d-modelservice.routingProxy" .) | nindent 8 }} + {{- end }} + + {{- with .Values.decode.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.decode.acceleratorTypes }} + {{- include "llm-d-modelservice.acceleratorTypes" . | nindent 8 }} + {{- end }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + - name: {{ default "vllm" .name }} + image: {{ required "image of container is required" .image }} + {{- with .securityContext }} + securityContext: + {{- toYaml . | nindent 14 }} + {{- end }} + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .command }} + command: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .args }} + args: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- /* insert user's env for this container */}} + {{- if or .env .mountModelVolume}} + env: + {{- end }} + {{- with .env }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- /* insert envs based on what modelArtifact prefix */}} + {{- if .mountModelVolume }} + - name: HF_HOME + value: /model-cache + {{- with $.Values.modelArtifacts.authSecretName }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ . }} + key: HF_TOKEN + {{- end }} + {{- end }} + {{- with .livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + resources: + limits: + {{- if and .resources .resources.limits }} + {{- omit .resources.limits "nvidia.com/gpu" | toYaml | nindent 16 }} + {{- end }} + nvidia.com/gpu: {{ $parallelism.tensor }} + requests: + {{- if and .resources .resources.limits }} + {{- omit .resources.requests "nvidia.com/gpu" | toYaml | nindent 16 }} + {{- end }} + nvidia.com/gpu: {{ $parallelism.tensor }} + {{- /* volumeMount */}} + {{- if or .volumeMounts .mountModelVolume }} + volumeMounts: + {{- end -}} + {{- /* user supplied volume mount in values */}} + {{- with .volumeMounts }} + {{- toYaml . | nindent 14 }} + {{- end }} + {{- /* what we add if mounModelVolume is true */}} + {{- if .mountModelVolume }} + - name: model-storage + mountPath: /model-cache + {{- end }} + + {{- with .workingDir }} + workingDir: {{ . }} + {{- end }} + {{- with .stdin }} + stdin: {{ . }} + {{- end }} + {{- with .tty }} + tty: {{ . }} + {{- end }} + {{- end }} {{/* range . */}} + {{- end }} {{/* with .Values.decode.containers */}} + volumes: + {{- with .Values.decode.volumes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if eq .Values.modelArtifacts.prefix "hf" }} + - name: model-storage + emptyDir: + sizeLimit: {{ default "0" .Values.modelArtifacts.size }} + {{- else if eq .Values.modelArtifacts.prefix "pvc" }} + - name: model-storage + persistentVolumeClaim: + claimName: {{ .Values.modelArtifacts.artifact }} + readOnly: true + {{- else if eq .Values.modelArtifacts.prefix "oci" }} + - name: model-storage + image: + reference: {{ .Values.modelArtifacts.artifact }} + {{- with .Values.modelArtifacts.imagePullPolicy }} + pullPolicy: {{ . }} + {{- end }} + {{- end }} + + workerTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- with .Values.decode.acceleratorTypes }} + {{- (include "llm-d-modelservice.acceleratorTypes" .) | nindent 6 }} + {{- end }} + serviceAccountName: {{ (include "llm-d-modelservice.pdServiceAccountName" .)}} +{{- end }} {{/* if and $parallelism.data (gt $parallelism.data 1) */}} \ No newline at end of file diff --git a/helm/templates/epp-deployment-mk.yaml b/helm/templates/epp-deployment-mk.yaml new file mode 100644 index 0000000..fc3e1d2 --- /dev/null +++ b/helm/templates/epp-deployment-mk.yaml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + {{- if not .Values.endpointPicker.autoscaling.enabled }} + replicas: {{ default 1 .Values.decode.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "llm-d-modelservice.eppSelectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.labels" . | nindent 8 }} + spec: + {{- with .Values.endpointPicker.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . 
}} + {{- with .Values.endpointPicker.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- range $.Values.endpointPicker.containers }} + containers: + - name: {{ .name }} + {{- with $.Values.endpointPicker.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + image: "{{ .image }}" + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .command }} + command: + {{- toYaml . | nindent 12 }} + {{- end }} + args: + - poolName + - POOLNAME + - poolNamespace + - {{ $.Release.Namespace }} + {{- with .args }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .env }} + env: + {{- toYaml . | nindent 12 }} + {{- end }} + ports: + - name: http2 + containerPort: {{ $.Values.endpointPicker.service.port }} + protocol: TCP + {{- with .livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .volumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} {{/* range $.Values.endpointPicker.containers */}} + {{- with .Values.endpointPicker.volumes }} + volumes: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.endpointPicker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.endpointPicker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.endpointPicker.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm/templates/epp-deployment.yaml b/helm/templates/epp-deployment.yaml new file mode 100644 index 0000000..59c38f8 --- /dev/null +++ b/helm/templates/epp-deployment.yaml @@ -0,0 +1,103 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + template: + metadata: + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: {{ default "ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3" .Values.endpointPicker.image }} + args: + - --poolName + - {{ include "llm-d-modelservice.fullname" . 
}}-inference-pool + - --poolNamespace + - {{ .Release.Namespace }} + - -v + - "4" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 \ No newline at end of file diff --git a/helm/templates/epp-sa.yaml b/helm/templates/epp-sa.yaml new file mode 100644 index 0000000..05feab6 --- /dev/null +++ b/helm/templates/epp-sa.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- with .Values.eppServiceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ default true .Values.eppServiceAccount.automount }} +{{- end }} diff --git a/helm/templates/epp-service.yaml b/helm/templates/epp-service.yaml new file mode 100644 index 0000000..d13254e --- /dev/null +++ b/helm/templates/epp-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.endpointPicker.service.port }} + targetPort: {{ .Values.endpointPicker.service.targetPort }} + protocol: TCP + appProtocol: {{ .Values.endpointPicker.service.appProtocol }} + selector: + {{- include "llm-d-modelservice.eppSelectorLabels" . 
| nindent 4 }} diff --git a/helm/templates/examples/README.md b/helm/templates/examples/README.md new file mode 100644 index 0000000..1a53b66 --- /dev/null +++ b/helm/templates/examples/README.md @@ -0,0 +1,15 @@ +# Examples + +Contains example values file and their rendered templates. + +``` +cd helm +helm template [RELEASE-NAME] . -f [VALUES-FILEPATH] +``` + +1. `facebook/opt-125m`: downloads from Hugging Face + + ``` + cd helm + helm template facebook . -f templates/examples/values-facebook.yaml > templates/examples/output-facebook.yaml + ``` \ No newline at end of file diff --git a/helm/templates/examples/output-facebook.yaml b/helm/templates/examples/output-facebook.yaml new file mode 100644 index 0000000..18050d4 --- /dev/null +++ b/helm/templates/examples/output-facebook.yaml @@ -0,0 +1,382 @@ +--- +# Source: llm-d-modelservice/templates/epp-sa.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-epp-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +automountServiceAccountToken: true +--- +# Source: llm-d-modelservice/templates/epp-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: facebook-llm-d-modelservice-epp + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9002 + targetPort: 9002 + protocol: TCP + appProtocol: http2 + selector: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook + llm-d.ai/epp: facebook-llm-d-modelservice-epp +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook + llm-d.ai/role: decode + spec: + serviceAccountName: facebook-sa + initContainers: + - args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=6 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + name: routing-proxy + ports: + - containerPort: 8000 + protocol: TCP + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - --port + - "8200" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_HOME + value: /model-cache + resources: + limits: + {} + requests: + cpu: "16" + memory: 16Gi + + volumeMounts: + - name: model-storage + 
mountPath: /model-cache + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/epp-deployment-mk.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-llm-d-modelservice-epp + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook + llm-d.ai/epp: facebook-llm-d-modelservice-epp + template: + metadata: + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm + spec: + serviceAccountName: facebook-epp-sa +--- +# Source: llm-d-modelservice/templates/epp-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-llm-d-modelservice-epp + labels: + llm-d.ai/epp: facebook-llm-d-modelservice-epp + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: facebook-llm-d-modelservice-epp + template: + metadata: + labels: + llm-d.ai/epp: facebook-llm-d-modelservice-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + args: + - --poolName + - facebook-llm-d-modelservice-inference-pool + - --poolNamespace + - default + - -v + - "4" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: facebook-epp-sa + serviceAccountName: facebook-epp-sa + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-llm-d-modelservice-prefill + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + 
llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook + llm-d.ai/role: prefill + spec: + serviceAccountName: facebook-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - --port + - "8000" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + resources: + limits: + {} + requests: + cpu: "16" + memory: 16Gi + + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: facebook-llm-d-modelservice-http-route + namespace: default + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Istio + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-llm-d-modelservice-inference-pool + port: 8000 + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: facebook/opt-125m + path: + type: PathPrefix + value: / +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: facebook-llm-d-modelservice-inference-model + namespace: default + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook +spec: + modelName: facebook/opt-125m + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-llm-d-modelservice-inference-pool +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: facebook-llm-d-modelservice-inference-pool + namespace: default +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: facebook-llm-d-modelservice-epp-service + selector: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook + targetPortNumber: 8000 diff --git a/helm/templates/examples/values-facebook.yaml b/helm/templates/examples/values-facebook.yaml new file mode 100644 index 0000000..ed6f437 --- /dev/null +++ b/helm/templates/examples/values-facebook.yaml @@ -0,0 +1,152 @@ +# This values.yaml file creates the resources for facebook/opt-125m + +lws: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + ports: + servicePort: 8000 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + internalPort: 8200 # Sidecar forwards request to vllm container on this port + proxy: + targetPort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Istio + name: inference-gateway + +modelArtifacts: + prefix: "hf" + artifact: facebook/opt-125m + size: 5Mi + +# describe decode pods +decode: + enableService: false + replicas: 1 + # parallelism: + # tensor: 3 + # data: 2 + # dataLocal: 1 + initContainers: + - name: routing-proxy + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + args: + - "--port=8000" # servicePort + - "--vllm-port=8200" # internalPort + - "--connector=nixlv2" + - "-v=6" + ports: + - containerPort: 8000 # servicePort + protocol: TCP + restartPolicy: Always + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--port" + - "8200" # internalPort + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 5557 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + mountModelVolume: true + +# describe the prefill pods (looks the same as above) +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--port" + - "8000" # servicePort + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 8000 + protocol: TCP + - containerPort: 5557 + protocol: TCP + resources: + limits: + nvidia.com/gpu: 1 + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: 1 + +endpointPicker: + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 9002 + targetPort: 9002 + appProtocol: http2 + + autoscaling: + enabled: false + replicas: 1 + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +serviceAccount: + # Specifies whether a service account should be created + create: true + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +eppServiceAccount: + # Specifies whether a service account should be created + create: true diff --git a/helm/templates/prefill-deployment.yaml 
b/helm/templates/prefill-deployment.yaml new file mode 100644 index 0000000..45f38fe --- /dev/null +++ b/helm/templates/prefill-deployment.yaml @@ -0,0 +1,106 @@ +{{- $parallelism := (include "llm-d-modelservice.parallelism" .Values.prefill.parallelism) | fromYaml -}} +{{- if .Values.prefill }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-prefill + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.prefill.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 8 }} + spec: + {{- with .Values.prefill.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.prefill.acceleratorTypes }} + {{- include "llm-d-modelservice.acceleratorTypes" . | nindent 6 }} + {{- end }} + {{- /* initContainers */}} + {{- with .Values.prefill.initContainers }} + initContainers: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- /* range $.Values.prefill.containers */}} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + - name: {{ default "vllm" .name }} + image: {{ required "image of container is required" .image }} + {{- with .securityContext }} + securityContext: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .command }} + command: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .args }} + args: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- /* insert user's env for this container */}} + {{- if or .env .mountModelVolume }} + env: + {{- end }} + {{- with .env }} + {{- toYaml . | nindent 10 }} + {{- end }} + {{- /* insert envs based on what modelArtifact prefix */}} + {{- if .mountModelVolume }} + - name: HF_HOME + value: /model-cache + {{- with $.Values.modelArtifacts.authSecretName }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ . }} + key: HF_TOKEN + {{- end }} + {{- end }} + {{- with .livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- with .resources }} + resources: + limits: + {{- if .limits -}} + {{- omit .limits "nvidia.com/gpu" | toYaml | nindent 12 }} + {{- end }} + {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}} + requests: + {{- if .limits -}} + {{- omit .requests "nvidia.com/gpu" | toYaml | nindent 12 }} + {{- end }} + {{- /* nvidia.com/gpu: "{{ $parallelism.tensor }}" */}} + {{- end }} + {{- /* volumeMount */}} + {{- include "llm-d-modelservice.mountModelVolumeVolumeMounts" . | nindent 8 }} + {{- end }} + {{- end }} + volumes: + {{- with .Values.prefill.volumes }} + {{- toYaml . | nindent 8 }} + {{- end -}} + {{- include "llm-d-modelservice.mountModelVolumeVolumes" . 
| nindent 8 }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/routing.yaml b/helm/templates/routing.yaml new file mode 100644 index 0000000..6459d1d --- /dev/null +++ b/helm/templates/routing.yaml @@ -0,0 +1,63 @@ +{{- /* Routing templates: InferencePool, InferenceModel, and HttpRoute */}} +{{- if .Values.inferencePool }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + namespace: {{ .Release.Namespace }} +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: {{ include "llm-d-modelservice.fullname" . }}-epp-service + selector: + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} + targetPortNumber: 8000 +{{- end }} +--- +{{- if .Values.inferenceModel }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-model + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} +spec: + modelName: {{ .Values.routing.modelName }} + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool +{{- end }} +--- +{{- if .Values.httpRoute }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-http-route + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + {{- with .Values.routing.parentRefs }} + parentRefs: + {{- . | toYaml | nindent 2}} + {{- end }} + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + port: {{ .Values.routing.ports.servicePort }} + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: {{ .Values.routing.modelName }} + path: + type: PathPrefix + value: / +{{- end }} \ No newline at end of file diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..32174c4 --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ default true .Values.serviceAccount.automount }} +{{- end }} diff --git a/helm/values-msvc-mk.yaml b/helm/values-msvc-mk.yaml new file mode 100644 index 0000000..7859b66 --- /dev/null +++ b/helm/values-msvc-mk.yaml @@ -0,0 +1,291 @@ +# TODO +# decoupleScaling: false + +lws: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: deepsk-ai-deepsk-coder-v1-lite-instruct + servicePort: 8080 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + debugLevel: 5 + +modelArtifacts: + # When specfying the URI with `hf` prefix, the / string + # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} + + # uri: hf://facebook/opt-125m + prefix: "hf" + artficat: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + authSecretName: "hf-secret" + size: 5Mi + +# describe decode pods +decode: + autoscaling: + enabled: false + replicas: 1 + + # for LWS + parallelism: + tensor: 3 + data: 2 + dataLocal: 1 + + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + # According to the blog, Scout requires H100s + - NVIDIA-H100 + - NVIDIA-H200 + # initContainers: + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-base:0.0.15" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + # Squash a warning. + rm /etc/libibverbs.d/vmw_pvrdma.driver + ################# + # Install vLLM + ################# + /init-scripts/init-vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + {{ .HFModelName }} \ + --port {{ "internal_port" | getPort }} \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --kv-transfer-config \ + '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \ + --enforce-eager \ + --headless + fi + env: + - name: DP_SIZE + value: "{{ .DecodeDataParallelism }}" + - name: TP_SIZE + value: "{{ .DecodeTensorParallelism }}" + - name: DP_SIZE_LOCAL + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + - name: VLLM_ALL2ALL_BACKEND +# value: "naive" + value: "pplx" +# value: "deepep_high_throughput" +# value: "deepep_low_latency" +# + # Needed for GDRCOPY to be used. 
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" +# - name: NVIDIA_NVSWITCH +# value: "enabled" +# - name: NVIDIA_GDS +# value: "enabled" + + # NVIDIA_MOFED is likely needed for using IBGDA but causes crashes +# - name: NVIDIA_MOFED +# value: "enabled" +# + - name: NCCL_DEBUG + value: "INFO" + - name: NVSHMEM_DEBUG + value: "TRACE" + - name: NVSHMEM_DEBUG_SUBSYS + value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibrc" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "true" + - name: NVSHMEM_HCA_LIST + value: "ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + optional: true + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + limits: + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + memory: 64Gi + ephemeral-storage: 256Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 256Gi + nvidia.com/gpu: "{{ .DecodeTensorParallelism }}" + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + mountModelVolume: true + +# describe the prefill pods (looks the same as above) +prefill: + replicas: 1 + containers: + - name: "vllm" + args: + - "HFModelName" + +endpointPicker: + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 9002 + targetPort: 9002 + appProtocol: http2 + + # enableService: true + + autoscaling: + enabled: false + replicas: 1 + + containers: + - name: "epp" + image: "ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3" + # command: + args: + # - -poolName + # - InferencePoolName + # - -poolNamespace + # - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + + + + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ 
+serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +eppServiceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + diff --git a/helm/values-msvc.yaml b/helm/values-msvc.yaml new file mode 100644 index 0000000..642d8a9 --- /dev/null +++ b/helm/values-msvc.yaml @@ -0,0 +1,172 @@ +# TODO +# decoupleScaling: false + +lws: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + ports: + servicePort: 8000 # Sidecar listens on this port for requests. If there's no sidecar, the request goes here + internalPort: 8200 # Sidecar forwards request to vllm container on this port + proxy: + targetPort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway-kgateway + +modelArtifacts: + # When specfying the URI with `hf` prefix, the / string + # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} + prefix: "oci" + artifact: facebook/opt-125m + authSecretName: "hf-secret" + size: 5Mi + imagePullPolicy: IfNotPresent + +# describe decode pods +# decode: +# enableService: false +# replicas: 1 + +# # for LWS +# parallelism: +# tensor: 8 +# data: 1 +# dataLocal: 1 + +# acceleratorTypes: +# labelKey: nvidia.com/gpu.product +# labelValues: +# # According to the blog, Scout requires H100s +# - NVIDIA-H100 +# # initContainers: +# containers: +# - name: "vllm" +# image: "vllm-ai/vllm:latest" +# args: +# - "HFModelName" +# env: +# - name: "VLLM_LOG_LEVEL" +# value: "DEBUG" # Set to DEBUG for more detailed logs, or INFO for less verbose logs +# envFrom: +# - configMapRef: +# name: vllm-config +# resources: +# requests: +# cpu: "1" # Request 1 CPU core +# memory: "4Gi" # Request 4 GiB of memory +# limits: +# cpu: "2" # Limit to 2 CPU cores +# memory: "8Gi" # Limit to 8 GiB of memory +# mountModelVolume: true + +# describe the prefill pods (looks the same as above) +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "vllm-ai/vllm:latest" + args: + - "HFModelName" + env: + - name: ok + value: ok + mountModelVolume: true + - name: "v2" + image: "vllm-ai/vllm:latest" + volumeMounts: + - name: whatever + mountPath: something + volumes: + - name: ok + emptyDir: + sizeLimit: 5Gi + - name: ok2 + emptyDir: + sizeLimit: 5Gi + +endpointPicker: + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 9002 + targetPort: 9002 + appProtocol: http2 + + # enableService: true + + autoscaling: + enabled: false + replicas: 1 + + containers: + - name: "epp" + image: 
"ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3" + # command: + args: + # - -poolName + # - InferencePoolName + # - -poolNamespace + # - llmd-kalantar + - -v + - "5" + - --zap-encoder + - json + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: PD_ENABLED + value: "true" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + ports: + - containerPort: 9002 + protocol: TCP + - containerPort: 9003 + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + livenessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + failureThreshold: 3 + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + periodSeconds: 10 + + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +eppServiceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 0000000..3e5575d --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,123 @@ +# Default values for llm-d-modelservice. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/ +replicaCount: 1 + +# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/ +image: + repository: nginx + # This sets the pull policy for images. + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] +# This is to override the chart name. +nameOverride: "" +fullnameOverride: "" + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# This is for setting Kubernetes Annotations to a Pod. +# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ +podAnnotations: {} +# This is for setting Kubernetes Labels to a Pod. 
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ +service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 80 + +# This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/ +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +# Additional volumes on the output Deployment definition. +volumes: [] +# - name: foo +# secret: +# secretName: mysecret +# optional: false + +# Additional volumeMounts on the output Deployment definition. +volumeMounts: [] +# - name: foo +# mountPath: "/etc/foo" +# readOnly: true + +nodeSelector: {} + +tolerations: [] + +affinity: {}
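
A few usage sketches follow; they are illustrative only, with field names taken from the templates and values files above. The `llm-d-modelservice.mountModelVolumeVolumes` helper picks the `model-storage` volume source from `modelArtifacts.prefix`, and containers that set `mountModelVolume: true` are given `HF_HOME=/model-cache` (plus `HF_TOKEN` when `authSecretName` is set). A minimal values fragment for each supported prefix (the PVC claim name and OCI reference below are placeholders) might look like:

```
# prefix "hf": emptyDir cache sized by modelArtifacts.size
modelArtifacts:
  prefix: "hf"
  artifact: facebook/opt-125m
  size: 5Mi
  authSecretName: hf-secret     # optional; source of HF_TOKEN

# prefix "pvc": mounts an existing PersistentVolumeClaim read-only
# modelArtifacts:
#   prefix: "pvc"
#   artifact: my-model-pvc      # placeholder claim name

# prefix "oci": mounts an OCI image volume
# modelArtifacts:
#   prefix: "oci"
#   artifact: registry.example.com/models/opt-125m:latest   # placeholder reference
#   imagePullPolicy: IfNotPresent                            # defaults to Always
```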
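
When `modelArtifacts.authSecretName` is set, the prefill and decode templates read `HF_TOKEN` from a Secret of that name with key `HF_TOKEN`. A plain sketch of the Secret the chart expects (name and token value are placeholders):

```
apiVersion: v1
kind: Secret
metadata:
  name: hf-secret
type: Opaque
stringData:
  HF_TOKEN: <huggingface-access-token>
```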
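
`decode-lws.yaml` renders a LeaderWorkerSet only when `decode.parallelism.data` is greater than 1, with `leaderWorkerTemplate.size` taken from that value; the template also reads `decode.autoscaling.enabled` and `routing` (note that the `routingProxy` helper reads `routing.image`, while `values-msvc-mk.yaml` sets `routing.proxy.image`, so the field placement may need checking). A rough, unverified values fragment for that path, trimmed from `values-msvc-mk.yaml`:

```
routing:
  servicePort: 8080
  image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6   # where the routingProxy helper currently looks
  proxy:
    targetPort: 8200
    debugLevel: 5

decode:
  autoscaling:
    enabled: false
  replicas: 1
  parallelism:
    tensor: 3
    data: 2        # > 1 selects the LeaderWorkerSet path
    dataLocal: 1
  containers:
    - name: vllm-worker
      image: quay.io/tms/vllm-dev-base:0.0.15
      mountModelVolume: true
```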