diff --git a/incubator/llama-factory-on-tke/Chart.yaml b/incubator/llama-factory-on-tke/Chart.yaml
index fc2eae7..d8b9aa0 100644
--- a/incubator/llama-factory-on-tke/Chart.yaml
+++ b/incubator/llama-factory-on-tke/Chart.yaml
@@ -4,7 +4,7 @@ apiVersion: v2
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
 appVersion: "1.16.0"
-description: A Helm chart for Kubernetes
+description: Helm chart for deploying LLaMA-Factory resources on TKE.
 home: https://github.com/hiyouga/LLaMA-Factory
 icon: https://cloudcache.tencent-cloud.com/qcloud/ui/static/Industry_tke/48bef83f-57d3-460c-bdbe-e8e1c57318e1.png
 keywords:
@@ -15,8 +15,6 @@ maintainers:
   - email: yuehuazhang@tencent.com
     name: yuehuazhang
 name: llama-factory-on-tke
-sources:
-  - https://github.com/bitnami/bitnami-docker-redis
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
diff --git a/incubator/llama-factory-on-tke/values.yaml b/incubator/llama-factory-on-tke/values.yaml
index 57ed6e4..34ac89a 100644
--- a/incubator/llama-factory-on-tke/values.yaml
+++ b/incubator/llama-factory-on-tke/values.yaml
@@ -15,9 +15,11 @@ image:
 
 env:
   - name: PIP_INDEX
-    value: https://pypi.org/simple
+    value: https://mirrors.cloud.tencent.com/pypi/simple
   - name: EXTRAS
     value: metrics
+  # - name: HF_ENDPOINT
+  #   value: https://hf-mirror.com
 
 args:
   - llamafactory-cli webui
@@ -92,12 +94,12 @@ resources:
   # resources, such as Minikube. If you do want to specify resources, uncomment the following
   # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
   limits:
-    cpu: 4
-    memory: 20Gi
+    cpu: "40"
+    memory: 600Gi
     nvidia.com/gpu: "1"
   requests:
-    cpu: 2
-    memory: 10Gi
+    cpu: "20"
+    memory: 400Gi
     nvidia.com/gpu: "1"
 
 # This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
@@ -127,17 +129,29 @@ autoscaling:
   # targetMemoryUtilizationPercentage: 80
 
 # Additional volumes on the output Deployment definition.
-volumes: []
-# - name: foo
-#   secret:
-#     secretName: mysecret
-#     optional: false
+volumes:
+  # - name: foo
+  #   secret:
+  #     secretName: mysecret
+  #     optional: false
+  - emptyDir:
+      sizeLimit: 16Gi
+    name: dshm
+  - name: localtime
+    hostPath:
+      path: /etc/localtime
+      type: File
 
 # Additional volumeMounts on the output Deployment definition.
-volumeMounts: []
-# - name: foo
-#   mountPath: "/etc/foo"
-#   readOnly: true
+volumeMounts:
+  # - name: foo
+  #   mountPath: "/etc/foo"
+  #   readOnly: true
+  - mountPath: /dev/shm
+    name: dshm
+  - name: localtime
+    mountPath: /etc/localtime
+    readOnly: true
 
 nodeSelector: {}
 
diff --git a/incubator/verl-on-tke/.helmignore b/incubator/verl-on-tke/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/incubator/verl-on-tke/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/incubator/verl-on-tke/Chart.yaml b/incubator/verl-on-tke/Chart.yaml
new file mode 100644
index 0000000..04f4791
--- /dev/null
+++ b/incubator/verl-on-tke/Chart.yaml
@@ -0,0 +1,30 @@
+apiVersion: v2
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.5"
+description: Helm chart for deploying verl resources on TKE.
+home: https://github.com/volcengine/verl
+icon: https://cloudcache.tencent-cloud.com/qcloud/ui/static/Industry_tke/47f8af51-2587-4457-8a14-3e2dddefaffe.png
+keywords:
+  - category:AI
+  - verl
+  - tke
+maintainers:
+  - email: yuehuazhang@tencent.com
+    name: yuehuazhang
+name: verl-on-tke
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
diff --git a/incubator/verl-on-tke/templates/NOTES.txt b/incubator/verl-on-tke/templates/NOTES.txt
new file mode 100644
index 0000000..fafc9f8
--- /dev/null
+++ b/incubator/verl-on-tke/templates/NOTES.txt
@@ -0,0 +1,22 @@
+1. Get the application URL by running these commands:
+{{- if .Values.ingress.enabled }}
+{{- range $host := .Values.ingress.hosts }}
+  {{- range .paths }}
+  http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
+  {{- end }}
+{{- end }}
+{{- else if contains "NodePort" .Values.service.type }}
+  export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "verl-on-tke.fullname" . }})
+  export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
+  echo http://$NODE_IP:$NODE_PORT
+{{- else if contains "LoadBalancer" .Values.service.type }}
+     NOTE: It may take a few minutes for the LoadBalancer IP to be available.
+           You can watch its status by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "verl-on-tke.fullname" . }}'
+  export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "verl-on-tke.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}")
+  echo http://$SERVICE_IP:{{ .Values.service.port }}
+{{- else if contains "ClusterIP" .Values.service.type }}
+  export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "verl-on-tke.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}")
+  export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
+  echo "Visit http://127.0.0.1:8080 to use your application"
+  kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT
+{{- end }}
diff --git a/incubator/verl-on-tke/templates/_helpers.tpl b/incubator/verl-on-tke/templates/_helpers.tpl
new file mode 100644
index 0000000..8ddc617
--- /dev/null
+++ b/incubator/verl-on-tke/templates/_helpers.tpl
@@ -0,0 +1,62 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "verl-on-tke.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "verl-on-tke.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "verl-on-tke.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels
+*/}}
+{{- define "verl-on-tke.labels" -}}
+helm.sh/chart: {{ include "verl-on-tke.chart" . }}
+{{ include "verl-on-tke.selectorLabels" . }}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels
+*/}}
+{{- define "verl-on-tke.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "verl-on-tke.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use
+*/}}
+{{- define "verl-on-tke.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "verl-on-tke.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
diff --git a/incubator/verl-on-tke/templates/deployment.yaml b/incubator/verl-on-tke/templates/deployment.yaml
new file mode 100644
index 0000000..3a2fccb
--- /dev/null
+++ b/incubator/verl-on-tke/templates/deployment.yaml
@@ -0,0 +1,90 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "verl-on-tke.fullname" . }}
+  labels:
+    {{- include "verl-on-tke.labels" . | nindent 4 }}
+spec:
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "verl-on-tke.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      {{- with .Values.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      labels:
+        {{- include "verl-on-tke.labels" . | nindent 8 }}
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "verl-on-tke.serviceAccountName" . }}
+      {{- with .Values.podSecurityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: verl
+          {{- with .Values.securityContext }}
+          securityContext:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          {{- with .Values.env }}
+          env:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.args }}
+          args:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.command }}
+          command:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.service.port }}
+              protocol: TCP
+          {{- with .Values.livenessProbe }}
+          livenessProbe:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.readinessProbe }}
+          readinessProbe:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.volumeMounts }}
+          volumeMounts:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+      {{- with .Values.volumes }}
+      volumes:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/incubator/verl-on-tke/templates/hpa.yaml b/incubator/verl-on-tke/templates/hpa.yaml
new file mode 100644
index 0000000..2afd574
--- /dev/null
+++ b/incubator/verl-on-tke/templates/hpa.yaml
@@ -0,0 +1,32 @@
+{{- if .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "verl-on-tke.fullname" . }}
+  labels:
+    {{- include "verl-on-tke.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "verl-on-tke.fullname" . }}
+  minReplicas: {{ .Values.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+    {{- end }}
+    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    {{- end }}
+{{- end }}
diff --git a/incubator/verl-on-tke/templates/ingress.yaml b/incubator/verl-on-tke/templates/ingress.yaml
new file mode 100644
index 0000000..c71a1c7
--- /dev/null
+++ b/incubator/verl-on-tke/templates/ingress.yaml
@@ -0,0 +1,43 @@
+{{- if .Values.ingress.enabled -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "verl-on-tke.fullname" . }}
+  labels:
+    {{- include "verl-on-tke.labels" . | nindent 4 }}
+  {{- with .Values.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- with .Values.ingress.className }}
+  ingressClassName: {{ . }}
+  {{- end }}
+  {{- if .Values.ingress.tls }}
+  tls:
+    {{- range .Values.ingress.tls }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ . | quote }}
+        {{- end }}
+      secretName: {{ .secretName }}
+    {{- end }}
+  {{- end }}
+  rules:
+    {{- range .Values.ingress.hosts }}
+    - host: {{ .host | quote }}
+      http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            {{- with .pathType }}
+            pathType: {{ . }}
+            {{- end }}
+            backend:
+              service:
+                name: {{ include "verl-on-tke.fullname" $ }}
+                port:
+                  number: {{ $.Values.service.port }}
+          {{- end }}
+    {{- end }}
+{{- end }}
diff --git a/incubator/verl-on-tke/templates/service.yaml b/incubator/verl-on-tke/templates/service.yaml
new file mode 100644
index 0000000..953c808
--- /dev/null
+++ b/incubator/verl-on-tke/templates/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "verl-on-tke.fullname" . }}
+  labels:
+    {{- include "verl-on-tke.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  ports:
+    - port: {{ .Values.service.port }}
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    {{- include "verl-on-tke.selectorLabels" . | nindent 4 }}
diff --git a/incubator/verl-on-tke/templates/serviceaccount.yaml b/incubator/verl-on-tke/templates/serviceaccount.yaml
new file mode 100644
index 0000000..c49d83e
--- /dev/null
+++ b/incubator/verl-on-tke/templates/serviceaccount.yaml
@@ -0,0 +1,13 @@
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "verl-on-tke.serviceAccountName" . }}
+  labels:
+    {{- include "verl-on-tke.labels" . | nindent 4 }}
+  {{- with .Values.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
+{{- end }}
diff --git a/incubator/verl-on-tke/values.yaml b/incubator/verl-on-tke/values.yaml
new file mode 100644
index 0000000..b7c9b0c
--- /dev/null
+++ b/incubator/verl-on-tke/values.yaml
@@ -0,0 +1,162 @@
+# Default values for verl-on-tke.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
+replicaCount: 1
+
+# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
+image:
+  repository: "ccr.ccs.tencentyun.com/tke-market/verl"
+  # This sets the pull policy for images.
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+  tag: "app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
+
+env:
+  - name: PYTHONUNBUFFERED
+    value: "1"
+  - name: HYDRA_FULL_ERROR
+    value: "1"
+  - name: PIP_INDEX_URL
+    value: https://mirrors.cloud.tencent.com/pypi/simple
+  # - name: HF_ENDPOINT
+  #   value: https://hf-mirror.com
+  # - name: SWANLAB_API_KEY
+  #   value: ""
+
+args:
+  - trap 'exit 0' TERM; sleep infinity & wait
+command:
+  - bash
+  - -c
+
+# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
+imagePullSecrets: []
+# This is to override the chart name.
+nameOverride: ""
+fullnameOverride: ""
+
+# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # Automatically mount a ServiceAccount's API credentials?
+  automount: true
+  # Annotations to add to the service account
+  annotations: {}
+  # The name of the service account to use.
+  # If not set and create is true, a name is generated using the fullname template
+  name: ""
+
+# This is for setting Kubernetes Annotations to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
+podAnnotations: {}
+# This is for setting Kubernetes Labels to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+podLabels: {}
+
+podSecurityContext: {}
+  # fsGroup: 2000
+
+securityContext: {}
+  # capabilities:
+  #   drop:
+  #   - ALL
+  # readOnlyRootFilesystem: true
+  # runAsNonRoot: true
+  # runAsUser: 1000
+
+# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
+service:
+  # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+  type: ClusterIP
+  # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+  port: 80
+
+# This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
+ingress:
+  enabled: false
+  className: ""
+  annotations: {}
+    # kubernetes.io/ingress.class: nginx
+    # kubernetes.io/tls-acme: "true"
+  hosts:
+    - host: chart-example.local
+      paths:
+        - path: /
+          pathType: ImplementationSpecific
+  tls: []
+  #  - secretName: chart-example-tls
+  #    hosts:
+  #      - chart-example.local
+
+resources:
+  # Defaults for a GPU training pod: one GPU, 20 CPUs / 400Gi requested, capped at
+  # 40 CPUs / 600Gi. Lower the cpu and memory values to fit smaller nodes.
+  # Kubernetes requires the nvidia.com/gpu request and limit to be equal, so change
+  # both together when assigning more GPUs to the pod.
+  limits:
+    cpu: "40"
+    memory: 600Gi
+    nvidia.com/gpu: "1"
+  requests:
+    cpu: "20"
+    memory: 400Gi
+    nvidia.com/gpu: "1"
+
+# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
+livenessProbe: {}
+#  httpGet:
+#    path: /
+#    port: http
+readinessProbe: {}
+#  httpGet:
+#    path: /
+#    port: http
+
+# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
+autoscaling:
+  enabled: false
+  minReplicas: 1
+  maxReplicas: 100
+  targetCPUUtilizationPercentage: 80
+  # targetMemoryUtilizationPercentage: 80
+
+# Additional volumes on the output Deployment definition.
+volumes:
+# - name: foo
+#   secret:
+#     secretName: mysecret
+#     optional: false
+  - emptyDir:
+      sizeLimit: 16Gi
+    name: dshm
+  - name: localtime
+    hostPath:
+      path: /etc/localtime
+      type: File
+
+# Additional volumeMounts on the output Deployment definition.
+volumeMounts:
+# - name: foo
+#   mountPath: "/etc/foo"
+#   readOnly: true
+  - mountPath: /dev/shm
+    name: dshm
+  - name: localtime
+    mountPath: /etc/localtime
+    readOnly: true
+
+nodeSelector: {}
+
+tolerations:
+  - effect: NoSchedule
+    key: nvidia.com/gpu
+    operator: Exists
+  - effect: NoSchedule
+    key: nvidia.com/gpu
+    operator: Equal
+    value: "true"
+
+affinity: {}