diff --git a/helm/.helmignore b/helm/.helmignore
new file mode 100644
index 0000000..0e8a0eb
--- /dev/null
+++ b/helm/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm/Chart.yaml b/helm/Chart.yaml
new file mode 100644
index 0000000..300e738
--- /dev/null
+++ b/helm/Chart.yaml
@@ -0,0 +1,24 @@
+apiVersion: v2
+name: llm-d-modelservice
+description: A Helm chart for ModelService
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.0.1
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "0.0.1"
diff --git a/helm/examples/README.md b/helm/examples/README.md
new file mode 100644
index 0000000..3b8bcce
--- /dev/null
+++ b/helm/examples/README.md
@@ -0,0 +1,44 @@
+# Examples
+
+Contains example values files and their rendered templates.
+
+```
+cd helm
+helm template [RELEASE-NAME] . -f [VALUES-FILEPATH]
+```
+
+1. `vllm-sim` in Kind
+
+   Make sure there is a gateway (kgateway or Istio) named `llm-d-inference-gateway` deployed in the cluster, or change the values file accordingly.
+
+   ```
+   helm template llmd-sim . -f examples/values-vllm-sim.yaml > examples/output-vllm-sim.yaml
+   ```
+
+   Remove `protocol: tcp` from `initContainers`, and remove the `readinessProbe` and `livenessProbe` from the epp deployment.
+
+
+2. `facebook/opt-125m`: downloads the model from Hugging Face
+
+   ```
+   helm template facebook . 
-f examples/values-facebook.yaml > examples/output-facebook.yaml + ``` + + +Port forward the inference gateway + +``` +k port-forward svc/llm-d-inference-gateway-istio 8000:80 +``` + +Send a request + +``` +curl http://localhost:8000/v1/completions -vvv \ + -H "Content-Type: application/json" \ + -H "x-model-name: facebook/opt-125m" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "Hello, " +}' +``` \ No newline at end of file diff --git a/helm/examples/output-facebook.yaml b/helm/examples/output-facebook.yaml new file mode 100644 index 0000000..3313c67 --- /dev/null +++ b/helm/examples/output-facebook.yaml @@ -0,0 +1,365 @@ +--- +# Source: llm-d-modelservice/templates/epp-sa.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/epp-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-service + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9002 + targetPort: 9002 + protocol: TCP + appProtocol: http2 + selector: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook-sim-test + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=5 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - facebook/opt-125m + - --port + - "8200" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: HF_HOME + value: /model-cache + + resources: + limits: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + volumeMounts: + - name: model-storage + 
mountPath: /model-cache + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/epp-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-epp + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + template: + metadata: + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + args: + - --poolName + - facebook-sim-test-llm-d-modelservice-inference-pool + - --poolNamespace + - default + - -v + - "4" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: facebook-sim-test-llm-d-modelservice-epp-sa + serviceAccountName: facebook-sim-test-llm-d-modelservice-epp-sa + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-prefill + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + spec: + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d:0.0.8 + command: + - vllm + - serve + args: + - facebook/opt-125m + - --port + - "8000" + - --enforce-eager + - --kv-transfer-config + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - 
name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: cuda_ipc,cuda_copy,tcp + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + + resources: + limits: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + volumes: + - name: model-storage + emptyDir: + sizeLimit: 5Mi +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: facebook-sim-test-llm-d-modelservice-http-route + namespace: default + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool + port: 8000 + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: facebook/opt-125m + path: + type: PathPrefix + value: / +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-model + namespace: default + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test +spec: + modelName: facebook/opt-125m + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-pool + namespace: default +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: facebook-sim-test-llm-d-modelservice-epp-service + selector: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + targetPortNumber: 8000 diff --git a/helm/examples/output-vllm-sim.yaml b/helm/examples/output-vllm-sim.yaml new file mode 100644 index 0000000..547b20e --- /dev/null +++ b/helm/examples/output-vllm-sim.yaml @@ -0,0 +1,302 @@ +--- +# Source: llm-d-modelservice/templates/epp-sa.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: facebook-sim-test-llm-d-modelservice-sa + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +--- +# Source: llm-d-modelservice/templates/epp-service.yaml +apiVersion: v1 +kind: Service +metadata: + name: facebook-sim-test-llm-d-modelservice-epp-service + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + type: ClusterIP + ports: + - port: 9002 + targetPort: 9002 + protocol: TCP + appProtocol: http2 + selector: + app.kubernetes.io/name: llm-d-modelservice + app.kubernetes.io/instance: facebook-sim-test + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp +--- +# Source: llm-d-modelservice/templates/decode-deployment.yaml 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-decode + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: decode + spec: + initContainers: + - name: routing-proxy + args: + - --port=8000 + - --vllm-port=8200 + - --connector=nixlv2 + - -v=5 + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + imagePullPolicy: Always + ports: + - containerPort: 8000 + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-inference-sim:0.0.4 + args: + - --model + - facebook/opt-125m + - --port + - "8200" + env: + - name: HF_HOME + value: /model-cache + + resources: + limits: + {} + requests: + {} + volumeMounts: + - name: model-storage + mountPath: /model-cache + volumes: +--- +# Source: llm-d-modelservice/templates/epp-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-epp + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + template: + metadata: + labels: + llm-d.ai/epp: facebook-sim-test-llm-d-modelservice-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + args: + - --poolName + - facebook-sim-test-llm-d-modelservice-inference-pool + - --poolNamespace + - default + - -v + - "6" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + serviceAccount: facebook-sim-test-llm-d-modelservice-epp-sa + serviceAccountName: facebook-sim-test-llm-d-modelservice-epp-sa +--- +# Source: llm-d-modelservice/templates/prefill-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: facebook-sim-test-llm-d-modelservice-prefill + 
labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + template: + metadata: + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + llm-d.ai/role: prefill + spec: + + serviceAccountName: facebook-sim-test-llm-d-modelservice-sa + containers: + - name: vllm + image: ghcr.io/llm-d/llm-d-inference-sim:0.0.4 + args: + - --model + - facebook/opt-125m + - --port + - "8000" + env: + - name: HF_HOME + value: /model-cache + + resources: + limits: + {} + requests: + {} + volumeMounts: + - name: model-storage + mountPath: /model-cache + volumes: +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: facebook-sim-test-llm-d-modelservice-http-route + namespace: default + labels: + helm.sh/chart: llm-d-modelservice-0.0.1 + app.kubernetes.io/version: "0.0.1" + app.kubernetes.io/managed-by: Helm +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: llm-d-inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool + port: 8000 + weight: 1 + matches: + - headers: + - name: x-model-name + type: Exact + value: facebook/opt-125m + path: + type: PathPrefix + value: / +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-model + namespace: default + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test +spec: + modelName: facebook/opt-125m + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: facebook-sim-test-llm-d-modelservice-inference-pool +--- +# Source: llm-d-modelservice/templates/routing.yaml +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: facebook-sim-test-llm-d-modelservice-inference-pool + namespace: default +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: facebook-sim-test-llm-d-modelservice-epp-service + selector: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-sim-test + targetPortNumber: 8000 diff --git a/helm/examples/values-deepseek.yaml b/helm/examples/values-deepseek.yaml new file mode 100644 index 0000000..f8a6590 --- /dev/null +++ b/helm/examples/values-deepseek.yaml @@ -0,0 +1,377 @@ +# This values.yaml file creates the resources for deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + +# If true, use a LeaderWorkerSet instead of a Deployment to host the model +multinode: true +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct + servicePort: 8080 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + debugLevel: 5 + parentRefs: + - name: inference-gateway + +modelArtifacts: + # When specfying the URI with `hf` prefix, the / string + # is extracted and exposed as a template variable that can be used as {{ .HFModelName }} + + uri: "pvc://tms-hf-cache/model-cache" + authSecretName: "hf-secret" + size: 5Mi + +# describe decode pods +decode: + autoscaling: + enabled: false + replicas: 1 + + parallelism: + tensor: 1 + data: 1 + dataLocal: 1 + + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + defaultMode: 0755 + name: vllm-init-scripts-config + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + # - name: hf-cache + # persistentVolumeClaim: + # claimName: tms-hf-cache + - name: vllm + persistentVolumeClaim: + claimName: tms-vllm + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-pplx:0.1.0" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + ################# + # Install vLLM + ################# + VLLM_USE_PRECOMPILED=1 /init-scripts/vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8200 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager \ + --headless + fi + env: + - name: CUDA_LAUNCH_BLOCKING + value: "1" + - name: VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + #- name: VLLM_USE_DEEP_GEMM + # value: "1" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # Needed for GDRCOPY to be used. 
+ # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + - name: NVSHMEM_DEBUG + value: "INFO" + # Uncomment for debugging + #- name: NVSHMEM_DEBUG_SUBSYS + # value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibgda" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + #MK - name: HF_HUB_CACHE + #MK value: /huggingface-cache + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: + - "IPC_LOCK" + - "SYS_RAWIO" + resources: + limits: + nvidia.com/gpu: 1 + memory: 64Gi + ephemeral-storage: 64Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 64Gi + nvidia.com/gpu: 1 + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + #MK - name: hf-cache + #MK mountPath: /huggingface-cache + - name: vllm + mountPath: /code + mountModelVolume: true + +# describe prefill pods +prefill: + autoscaling: + enabled: false + replicas: 1 + + parallelism: + tensor: 1 + data: 1 + dataLocal: 1 + + acceleratorTypes: + labelKey: gpu.nvidia.com/model + labelValues: + - H200 + + volumes: + # Volume for the init script from ConfigMap + - name: init-scripts-volume + configMap: + defaultMode: 0755 + name: vllm-init-scripts-config + # Needed for NCCL to function + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 1Gi + # - name: hf-cache + # persistentVolumeClaim: + # claimName: tms-hf-cache + - name: vllm + persistentVolumeClaim: + claimName: tms-vllm + + + containers: + - name: vllm-worker + image: "quay.io/tms/vllm-dev-pplx:0.1.0" + imagePullPolicy: Always + workingDir: /app + stdin: true + tty: true + command: ["/bin/sh","-c"] + args: + - | + ################# + # Install vLLM + ################# + VLLM_USE_PRECOMPILED=1 /init-scripts/vllm.sh + ################# + # RUN vLLM + ################# + START_RANK=$(( ${LWS_WORKER_INDEX:-0} * DP_SIZE_LOCAL )) + if [ "${LWS_WORKER_INDEX:-0}" -eq 0 ]; then + ################# + # Leader-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --enforce-eager + else + ################# + # Worker-only launch + ################# + exec /app/venv/bin/vllm serve \ + deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct \ + --port 8080 \ + --disable-log-requests \ + --enable-expert-parallel \ + --tensor-parallel-size $TP_SIZE \ + --data-parallel-size $DP_SIZE \ + --data-parallel-size-local $DP_SIZE_LOCAL \ + --data-parallel-address $(LWS_LEADER_ADDRESS) \ + --data-parallel-rpc-port 5555 \ + --data-parallel-start-rank $START_RANK \ + --trust-remote-code \ + --headless + fi + env: + - name: CUDA_LAUNCH_BLOCKING + value: "1" + - name: 
VLLM_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: VLLM_BRANCH + value: "main" + #- name: VLLM_USE_DEEP_GEMM + # value: "1" + - name: VLLM_ALL2ALL_BACKEND + # value: "naive" + value: "pplx" + # value: "deepep_high_throughput" + # value: "deepep_low_latency" + # Needed for GDRCOPY to be used. + # See: https://github.com/NVIDIA/nvidia-container-toolkit/releases/tag/v1.15.0 + - name: NVIDIA_GDRCOPY + value: "enabled" + - name: NVSHMEM_DEBUG + value: "INFO" + # Uncomment for debugging + #- name: NVSHMEM_DEBUG_SUBSYS + # value: "TRANSPORT,INIT,MEM,COLL,BOOTSTRAP" + - name: NVSHMEM_REMOTE_TRANSPORT + value: "ibgda" + - name: NVSHMEM_IB_ENABLE_IBGDA + value: "true" + - name: NVSHMEM_BOOTSTRAP_UID_SOCK_IFNAME + value: "eth0" + - name: GLOO_SOCKET_IFNAME + value: "eth0" + - name: NCCL_SOCKET_IFNAME + value: "eth0" + - name: NCCL_IB_HCA + value: "ibp" + - name: VLLM_LOGGING_LEVEL + value: "DEBUG" + #MK - name: HF_HUB_CACHE + #MK value: /huggingface-cache + - name: GH_TOKEN_FROM_SECRET + valueFrom: + secretKeyRef: + name: gh-token-secret + key: GH_TOKEN + optional: true + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "6555" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + + securityContext: + capabilities: + add: + - "IPC_LOCK" + - "SYS_RAWIO" + resources: + limits: + nvidia.com/gpu: 1 + memory: 64Gi + ephemeral-storage: 64Gi + rdma/ib: 1 + requests: + cpu: 8 + memory: 64Gi + ephemeral-storage: 64Gi + nvidia.com/gpu: 1 + rdma/ib: 1 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - name: init-scripts-volume + mountPath: /init-scripts + #MK - name: hf-cache + #MK mountPath: /huggingface-cache + - name: vllm + mountPath: /code + mountModelVolume: true + +endpointPicker: + image: ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3 + debugLevel: 5 + service: + type: ClusterIP + port: 9002 + targetPort: 9002 + appProtocol: http2 + # The name of cluster role containing permissions to be granted to endpointPicker (via a role binding to the service account) + permissions: pod-read + + autoscaling: + enabled: false + replicas: 1 diff --git a/helm/examples/values-facebook.yaml b/helm/examples/values-facebook.yaml new file mode 100644 index 0000000..3382040 --- /dev/null +++ b/helm/examples/values-facebook.yaml @@ -0,0 +1,116 @@ +# This values.yaml file creates the resources for facebook/opt-125m + +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + servicePort: 8000 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + parentRefs: + - name: inference-gateway + +modelArtifacts: + uri: "hf://facebook/opt-125m" + +# describe decode pods +decode: + enableService: false + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: HF_HOME + value: /model-cache + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 5557 + protocol: TCP + resources: + limits: + memory: 16Gi + cpu: "16" + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + mountModelVolume: true + +# describe the prefill pods (looks the same as above) +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d:0.0.8" + command: + - vllm + - serve + args: + - "--enforce-eager" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector", "kv_role":"kv_both"}' + env: + - name: CUDA_VISIBLE_DEVICES + value: "0" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + ports: + - containerPort: 8000 + protocol: TCP + - containerPort: 5557 + protocol: TCP + resources: + limits: + memory: 16Gi + cpu: "16" + nvidia.com/gpu: "1" + requests: + cpu: "16" + memory: 16Gi + nvidia.com/gpu: "1" + +endpointPicker: + service: + type: ClusterIP + port: 9002 + targetPort: 9002 + appProtocol: http2 + # The name of cluster role containing permissions to be granted to endpointPicker (via a role binding to the service account) + permissions: pod-read + + autoscaling: + enabled: false + replicas: 1 diff --git a/helm/examples/values-vllm-sim.yaml b/helm/examples/values-vllm-sim.yaml new file mode 100644 index 0000000..3c4f159 --- /dev/null +++ b/helm/examples/values-vllm-sim.yaml @@ -0,0 +1,78 @@ +# This values.yaml file creates the resources for random + +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: random + servicePort: 8000 # Sidecar listens on this port for requests. 
If there's no sidecar, the request goes here + proxy: + image: ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6 + targetPort: 8200 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: llm-d-inference-gateway + +modelArtifacts: + uri: "hf://random" + size: 5Mi + +# describe decode pods +decode: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4" + args: + - "--model" + - "random" + - "--port" + - "8200" # targetPort + ports: + - containerPort: 5557 + protocol: TCP + mountModelVolume: true +prefill: + replicas: 1 + containers: + - name: "vllm" + image: "ghcr.io/llm-d/llm-d-inference-sim:0.0.4" + args: + - "--model" + - "random" + - "--port" + - "8000" # servicePort + ports: + - containerPort: 5557 + protocol: TCP + mountModelVolume: true + +endpointPicker: + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 9002 + targetPort: 9002 + appProtocol: http2 + debugLevel: 6 + disableReadinessProbe: true + disableLivenessProbe: true + + autoscaling: + enabled: false + replicas: 1 + + +curl http://localhost:8000/v1/completions -vvv \ + -H "Content-Type: application/json" \ + -H "x-model-name: facebook/opt-125m" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "Hello, " +}' \ No newline at end of file diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..d6d74d7 --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,321 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "llm-d-modelservice.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "llm-d-modelservice.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llm-d-modelservice.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llm-d-modelservice.labels" -}} +helm.sh/chart: {{ include "llm-d-modelservice.chart" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* Sanitized model name (DNS compliant) */}} +{{- define "llm-d-modelservice.sanitizedModelName" -}} + {{- $name := .Release.Name | lower | trim -}} + {{- $name = regexReplaceAll "[^a-z0-9_.-]" $name "-" -}} + {{- $name = regexReplaceAll "^[\\-._]+" $name "" -}} + {{- $name = regexReplaceAll "[\\-._]+$" $name "" -}} + {{- $name = regexReplaceAll "\\." $name "-" -}} + + {{- if gt (len $name) 63 -}} + {{- $name = substr 0 63 $name -}} + {{- end -}} + +{{- $name -}} +{{- end }} + +{{/* Common P/D labels */}} +{{- define "llm-d-modelservice.pdlabels" -}} +llm-d.ai/inferenceServing: "true" +llm-d.ai/model: {{ (include "llm-d-modelservice.fullname" .) -}} +{{- end }} + +{{/* prefill labels */}} +{{- define "llm-d-modelservice.prefilllabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: prefill +{{- end }} + +{{/* decode labels */}} +{{- define "llm-d-modelservice.decodelabels" -}} +{{ include "llm-d-modelservice.pdlabels" . }} +llm-d.ai/role: decode +{{- end }} + +{{/* affinity from acceleratorTypes */}} +{{- define "llm-d-modelservice.acceleratorTypes" -}} +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ .labelKey }} + operator: In + {{- with .labelValues }} + values: + {{- toYaml . | nindent 14 }} + {{- end }} +{{- end }} + +{{/* Routing proxy -- sidecar for decode pods */}} +{{- define "llm-d-modelservice.routingProxy" -}} +initContainers: + - name: routing-proxy + args: + - --port={{ default 8080 .servicePort }} + - --vllm-port={{ default 8200 .proxy.targetPort }} + - --connector=nixlv2 + - -v={{ default 5 .proxy.debugLevel }} + image: {{ default "ghcr.io/llm-d/llm-d-routing-sidecar:0.0.6" .proxy.image }} + imagePullPolicy: Always + ports: + - containerPort: {{ default 8080 .servicePort }} + resources: {} + restartPolicy: Always + securityContext: + allowPrivilegeEscalation: false + runAsNonRoot: true +{{- end }} + +{{/* Desired P/d tensor parallelism -- user set or defaults to 1 */}} +{{- define "llm-d-modelservice.tensorParallelism" -}} +{{- if and . .tensor }}{{ .tensor }}{{ else }}1{{ end }} +{{- end }} + +{{/* Desired P/D data parallelism -- user set or defaults to 1 */}} +{{- define "llm-d-modelservice.dataParallelism" -}} +{{- if and . .data }}{{ .data }}{{ else }}1{{ end }} +{{- end }} + +{{/* +Port on which vllm container should listen. 
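+Prefill pods receive requests directly on routing.servicePort; decode pods sit behind the routing-proxy sidecar, so their vllm container listens on routing.proxy.targetPort instead.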
+Context is helm root context plus key "role" ("decode" or "prefill") +*/}} +{{- define "llm-d-modelservice.vllmPort" -}} +{{- if eq .role "prefill" }}{{ .Values.routing.servicePort }}{{ else }}{{ .Values.routing.proxy.targetPort }}{{ end }} +{{- end }} + +{{/* P/D deployment container resources */}} +{{- define "llm-d-modelservice.resources" -}} +{{- $tensorParallelism := int (include "llm-d-modelservice.tensorParallelism" .parallelism) -}} +{{- $limits := dict }} +{{- if and .resources .resources.limits }} +{{- $limits = deepCopy .resources.limits }} +{{- end }} +{{- if gt (int $tensorParallelism) 1 }} +{{- $limits = mergeOverwrite $limits (dict "nvidia.com/gpu" $tensorParallelism) }} +{{- end }} +{{- $requests := dict }} +{{- if and .resources .resources.requests }} +{{- $requests = deepCopy .resources.requests }} +{{- end }} +{{- if gt (int $tensorParallelism) 1 }} +{{- $requests = mergeOverwrite $requests (dict "nvidia.com/gpu" $tensorParallelism) }} +{{- end }} +resources: + limits: + {{- toYaml $limits | nindent 4 }} + requests: + {{- toYaml $requests | nindent 4 }} +{{- end }} + +{{/* P/D service account name */}} +{{- define "llm-d-modelservice.pdServiceAccountName" -}} +{{ include "llm-d-modelservice.fullname" . }}-sa +{{- end }} + +{{/* +EPP service account name +Context is helm root context +*/}} +{{- define "llm-d-modelservice.eppServiceAccountName" -}} +{{ include "llm-d-modelservice.fullname" . }}-epp-sa +{{- end }} + +{{/* +Volumes for PD containers based on model artifact prefix +Context is .Values.modelArtifacts +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumes" -}} +{{- $parsedArtifacts := regexSplit "://" .uri -1 -}} +{{- $protocol := first $parsedArtifacts -}} +{{- $path := last $parsedArtifacts -}} +{{- if eq $protocol "hf" -}} +- name: model-storage + emptyDir: + sizeLimit: {{ default "0" .size }} +{{- else if eq $protocol "pvc" }} +{{- $parsedArtifacts := regexSplit "/" $path -1 -}} +{{- $claim := first $parsedArtifacts -}} +- name: model-storage + persistentVolumeClaim: + claimName: {{ $claim }} + readOnly: true +{{- else if eq $protocol "oci" }} +- name: model-storage + image: + reference: {{ $path }} + pullPolicy: {{ default "Always" .imagePullPolicy }} +{{- end }} +{{- end }} + +{{/* +VolumeMount for a PD container +Supplies model-storage mount if mountModelVolume: true for the container +*/}} +{{- define "llm-d-modelservice.mountModelVolumeVolumeMounts" -}} +{{- if or .volumeMounts .mountModelVolume }} +volumeMounts: +{{- end }} +{{- /* user supplied volume mount in values */}} +{{- with .volumeMounts }} + {{- toYaml . | nindent 8 }} +{{- end }} +{{- /* what we add if mounModelVolume is true */}} +{{- if .mountModelVolume }} + - name: model-storage + mountPath: /model-cache +{{- end }} +{{- end }} + +{{/* +Pod elements of deployment/lws spec template +context is a pdSpec +*/}} +{{- define "llm-d-modelservice.modelPod" -}} + {{- with .pdSpec.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 2 }} + {{- end }} + serviceAccountName: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + {{- with .pdSpec.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .pdSpec.acceleratorTypes }} + {{- include "llm-d-modelservice.acceleratorTypes" . 
| nindent 2 }} + {{- end }} + {{- if or .pdSpec.volumes .pdSpec.mountModelVolume }} + volumes: + {{- toYaml .pdSpec.volumes | nindent 4 }} + {{ include "llm-d-modelservice.mountModelVolumeVolumes" .Values.modelArtifacts | nindent 4}} + {{- end }} +{{- end }} {{- /* define "llm-d-modelservice.modelPod" */}} + +{{/* +Container elements of deployment/lws spec template +context is a dict with helm root context plus: + key - "container"; value - container spec + key - "roll"; value - either "decode" or "prefill" + key - "parallelism"; value - $.Values.decode.parallelism +*/}} +{{- define "llm-d-modelservice.container" -}} +- name: {{ default "vllm" .container.name }} + image: {{ required "image of container is required" .container.image }} + {{- with .container.securityContext }} + securityContext: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .container.imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + {{- with .container.command }} + command: + {{- toYaml . | nindent 2 }} + {{- end }} + args: + - {{ .Values.routing.modelName | quote }} + - --port + - {{ (include "llm-d-modelservice.vllmPort" .) | quote }} + {{- with .container.args }} + {{- toYaml . | nindent 2 }} + {{- end }} + {{- /* insert user's env for this container */}} + {{- if or .container.env .container.mountModelVolume }} + env: + {{- end }} + {{- with .container.env }} + {{- toYaml . | nindent 2 }} + {{- end }} + - name: DP_SIZE + value: {{ include "llm-d-modelservice.tensorParallelism" .parallelism | quote }} + - name: TP_SIZE + value: {{ include "llm-d-modelservice.dataParallelism" .parallelism | quote }} + - name: DP_SIZE_LOCAL + value: "1" + {{- /* insert envs based on what modelArtifact prefix */}} + {{- if .container.mountModelVolume }} + - name: HF_HOME + value: /model-cache + {{- with .Values.modelArtifacts.authSecretName }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ . }} + key: HF_TOKEN + {{- end }} + {{- end }} + {{- with .container.livenessProbe }} + livenessProbe: + {{- toYaml . | nindent 2 }} + {{- end }} + {{- with .container.readinessProbe }} + readinessProbe: + {{- toYaml . | nindent 2 }} + {{- end }} + {{- (include "llm-d-modelservice.resources" (dict "resources" .container.resources "parallelism" .parallelism)) | nindent 2 }} + {{- /* volumeMount */}} + {{- if or .container.volumeMounts .container.mountModelVolume }} + volumeMounts: + {{- end -}} + {{- /* user supplied volume mount in values */}} + {{- with .container.volumeMounts }} + {{- toYaml . | nindent 2 }} + {{- end }} + {{- /* what we add if mounModelVolume is true */}} + {{- if .container.mountModelVolume }} + - name: model-storage + mountPath: /model-cache + {{- end }} + {{- with .container.workingDir }} + workingDir: {{ . }} + {{- end }} + {{- with .container.stdin }} + stdin: {{ . }} + {{- end }} + {{- with .container.tty }} + tty: {{ . }} + {{- end }} +{{- end }} {{- /* define "llm-d-modelservice.container" */}} diff --git a/helm/templates/decode-deployment.yaml b/helm/templates/decode-deployment.yaml new file mode 100644 index 0000000..52bc147 --- /dev/null +++ b/helm/templates/decode-deployment.yaml @@ -0,0 +1,28 @@ +{{- if and .Values.decode (not .Values.multinode) }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.decode.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.decodelabels" . 
| nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 8 }} + spec: + {{- with .Values.routing }} + {{- (include "llm-d-modelservice.routingProxy" .) | nindent 6 }} + {{- end }} + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 4 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/decode-lws.yaml b/helm/templates/decode-lws.yaml new file mode 100644 index 0000000..a3e840d --- /dev/null +++ b/helm/templates/decode-lws.yaml @@ -0,0 +1,44 @@ +{{- if and .Values.decode .Values.multinode }} +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-decode + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- include "llm-d-modelservice.decodelabels" . | nindent 4 }} +spec: + {{- if not .Values.decode.autoscaling.enabled }} + replicas: {{ default 1 .Values.decode.replicas }} + {{- end }} + leaderWorkerTemplate: + size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.decode.parallelism) }} + leaderTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- with .Values.routing }} + {{ (include "llm-d-modelservice.routingProxy" .) | nindent 8 }} + {{- end }} + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 6 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + + workerTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.decodelabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.decode))) | nindent 6 }} + {{- with .Values.decode.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "decode" "container" . "parallelism" $.Values.decode.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + +{{- end }} diff --git a/helm/templates/epp-deployment.yaml b/helm/templates/epp-deployment.yaml new file mode 100644 index 0000000..825d0bc --- /dev/null +++ b/helm/templates/epp-deployment.yaml @@ -0,0 +1,108 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + template: + metadata: + labels: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp + spec: + containers: + - name: epp + imagePullPolicy: Always + image: {{ default "ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3" .Values.endpointPicker.image }} + args: + - --poolName + - {{ include "llm-d-modelservice.fullname" . 
}}-inference-pool + - --poolNamespace + - {{ .Release.Namespace }} + - -v + - "{{ default 4 .Values.endpointPicker.debugLevel }}" + - --zap-encoder + - json + - --grpcPort + - "9002" + - --grpcHealthPort + - "9003" + # using defaults from https://github.com/llm-d/llm-d-deployer/blob/main/charts/llm-d/values.yaml#L563-L603 + env: + - name: ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: ENABLE_LOAD_AWARE_SCORER + value: "true" + - name: ENABLE_PREFIX_AWARE_SCORER + value: "true" + - name: ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: KVCACHE_INDEXER_REDIS_ADDR + - name: LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PD_ENABLED + value: "false" + - name: PD_PROMPT_LEN_THRESHOLD + value: "10" + - name: PREFILL_ENABLE_KVCACHE_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_LOAD_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_PREFIX_AWARE_SCORER + value: "false" + - name: PREFILL_ENABLE_SESSION_AWARE_SCORER + value: "false" + - name: PREFILL_KVCACHE_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_KVCACHE_INDEXER_REDIS_ADDR + - name: PREFILL_LOAD_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_PREFIX_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFILL_SESSION_AWARE_SCORER_WEIGHT + value: "1" + - name: PREFIX_AWARE_SCORER_WEIGHT + value: "2" + - name: SESSION_AWARE_SCORER_WEIGHT + value: "1" + ports: + - containerPort: 9002 + name: grpc + protocol: TCP + - containerPort: 9003 + name: grpc-health + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + {{- if (not .Values.endpointPicker.disableReadinessProbe) }} + readinessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + {{- end }} + {{- if (not .Values.endpointPicker.disableLivenessProbe) }} + livenessProbe: + grpc: + port: 9003 + service: envoy.service.ext_proc.v3.ExternalProcessor + initialDelaySeconds: 5 + timeoutSeconds: 1 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + {{- end }} + serviceAccount: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }} \ No newline at end of file diff --git a/helm/templates/epp-sa.yaml b/helm/templates/epp-sa.yaml new file mode 100644 index 0000000..5503b9d --- /dev/null +++ b/helm/templates/epp-sa.yaml @@ -0,0 +1,8 @@ +{{- if .Values.endpointPicker -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.eppServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +{{- end }} diff --git a/helm/templates/epp-service.yaml b/helm/templates/epp-service.yaml new file mode 100644 index 0000000..9c182a9 --- /dev/null +++ b/helm/templates/epp-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp-service + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.endpointPicker.service.port }} + targetPort: {{ .Values.endpointPicker.service.targetPort }} + protocol: TCP + appProtocol: {{ .Values.endpointPicker.service.appProtocol }} + selector: + llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . 
}}-epp diff --git a/helm/templates/prefill-deployment.yaml b/helm/templates/prefill-deployment.yaml new file mode 100644 index 0000000..1be4784 --- /dev/null +++ b/helm/templates/prefill-deployment.yaml @@ -0,0 +1,25 @@ +{{- if and .Values.prefill (not .Values.multinode) }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-prefill + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + replicas: {{ default 1 .Values.prefill.replicas }} + selector: + matchLabels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 8 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 4 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/prefill-lws.yaml b/helm/templates/prefill-lws.yaml new file mode 100644 index 0000000..618633a --- /dev/null +++ b/helm/templates/prefill-lws.yaml @@ -0,0 +1,41 @@ +{{- if and .Values.prefill .Values.multinode }} +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-prefill + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} + {{- include "llm-d-modelservice.prefilllabels" . | nindent 4 }} +spec: + {{- if not .Values.prefill.autoscaling.enabled }} + replicas: {{ default 1 .Values.prefill.replicas }} + {{- end }} + leaderWorkerTemplate: + size: {{ int (include "llm-d-modelservice.dataParallelism" .Values.prefill.parallelism) }} + leaderTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 6 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + + workerTemplate: + metadata: + labels: + {{- include "llm-d-modelservice.prefilllabels" . | nindent 10 }} + spec: + {{- (include "llm-d-modelservice.modelPod" (merge . (dict "pdSpec" .Values.prefill))) | nindent 6 }} + {{- with .Values.prefill.containers }} + containers: + {{- range . }} + {{- (include "llm-d-modelservice.container" (merge (dict "role" "prefill" "container" . "parallelism" $.Values.prefill.parallelism) $)) | nindent 8 }} + {{- end }} + {{- end }} + +{{- end }} diff --git a/helm/templates/rolebinding.yaml b/helm/templates/rolebinding.yaml new file mode 100644 index 0000000..e9ad13b --- /dev/null +++ b/helm/templates/rolebinding.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-epp-rolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ required "role for inference schedule required" .Values.endpointPicker.permissions }} +subjects: +- kind: ServiceAccount + name: {{ include "llm-d-modelservice.eppServiceAccountName" . 
}} \ No newline at end of file diff --git a/helm/templates/routing.yaml b/helm/templates/routing.yaml new file mode 100644 index 0000000..8e7c42c --- /dev/null +++ b/helm/templates/routing.yaml @@ -0,0 +1,66 @@ +{{- /* Routing templates: InferencePool, InferenceModel, and HttpRoute */}} +{{- if .Values.inferencePool }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + namespace: {{ .Release.Namespace }} +spec: + extensionRef: + failureMode: FailClose + group: "" + kind: Service + name: {{ include "llm-d-modelservice.fullname" . }}-epp-service + selector: + {{- if .Values.multinode }} + leaderworkerset.sigs.k8s.io/worker-index: "0" + {{- end }} + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} + targetPortNumber: {{ .Values.routing.servicePort }} +{{- end }} +--- +{{- if .Values.inferenceModel }} +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-inference-model + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} +spec: + modelName: {{ .Values.routing.modelName }} + poolRef: + group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool +{{- end }} +--- +{{- if .Values.httpRoute }} +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: {{ include "llm-d-modelservice.fullname" . }}-http-route + namespace: {{ .Release.Namespace }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +spec: + {{- with .Values.routing.parentRefs }} + parentRefs: + {{- . | toYaml | nindent 2}} + {{- end }} + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: {{ include "llm-d-modelservice.fullname" . }}-inference-pool + port: {{ .Values.routing.servicePort }} + weight: 1 + matches: + - path: + type: PathPrefix + value: / + headers: + - name: x-model-name + type: Exact + value: {{ .Values.routing.modelName }} +{{- end }} \ No newline at end of file diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..463b584 --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,8 @@ +{{- if or .Values.prefill .Values.decode -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llm-d-modelservice.pdServiceAccountName" . }} + labels: + {{- include "llm-d-modelservice.labels" . | nindent 4 }} +{{- end }} diff --git a/helm/values-msvc.yaml b/helm/values-msvc.yaml new file mode 100644 index 0000000..2fa22fa --- /dev/null +++ b/helm/values-msvc.yaml @@ -0,0 +1,98 @@ +multinode: false # If true, creates LWS instead of deployments +inferencePool: true +inferenceModel: true +httpRoute: true + +routing: + # This is the model name for the OpenAI request + modelName: facebook/opt-125m + ports: + servicePort: 8000 # Sidecar listens on this port for requests. 
+    internalPort: 8200 # Sidecar forwards request to vllm container on this port
+  proxy:
+    targetPort: 8000
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: inference-gateway-kgateway
+
+modelArtifacts:
+  # When specifying the URI with the `hf` prefix, the / string
+  # is extracted and exposed as a template variable that can be used as {{ .HFModelName }}
+  prefix: "oci"
+  artifact: facebook/opt-125m
+  authSecretName: "hf-secret"
+  size: 5Mi
+  imagePullPolicy: IfNotPresent
+
+# describe decode pods
+decode:
+  enableService: false
+  replicas: 1
+
+  # for LWS
+  parallelism:
+    tensor: 8
+    data: 1
+    dataLocal: 1
+
+  acceleratorTypes:
+    labelKey: nvidia.com/gpu.product
+    labelValues:
+      # According to the blog, Scout requires H100s
+      - NVIDIA-H100
+  # initContainers:
+  containers:
+  - name: "vllm"
+    image: "vllm-ai/vllm:latest"
+    args:
+    - "HFModelName"
+    env:
+    - name: "VLLM_LOG_LEVEL"
+      value: "DEBUG" # Set to DEBUG for more detailed logs, or INFO for less verbose logs
+    envFrom:
+    - configMapRef:
+        name: vllm-config
+    resources:
+      requests:
+        cpu: "1" # Request 1 CPU core
+        memory: "4Gi" # Request 4 GiB of memory
+      limits:
+        cpu: "2" # Limit to 2 CPU cores
+        memory: "8Gi" # Limit to 8 GiB of memory
+    mountModelVolume: true
+
+# describe the prefill pods (looks the same as above)
+prefill:
+  replicas: 1
+  containers:
+  - name: "vllm"
+    image: "vllm-ai/vllm:latest"
+    args:
+    - "HFModelName"
+    env:
+    - name: ok
+      value: ok
+    mountModelVolume: true
+  - name: "v2"
+    image: "vllm-ai/vllm:latest"
+    volumeMounts:
+    - name: whatever
+      mountPath: something
+  volumes:
+  - name: ok
+    emptyDir:
+      sizeLimit: 5Gi
+  - name: ok2
+    emptyDir:
+      sizeLimit: 5Gi
+
+endpointPicker:
+  # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
+  service:
+    # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+    type: ClusterIP
+    # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+    port: 9002
+    targetPort: 9002
+    appProtocol: http2
diff --git a/helm/values.yaml b/helm/values.yaml
new file mode 100644
index 0000000..3e5575d
--- /dev/null
+++ b/helm/values.yaml
@@ -0,0 +1,123 @@
+# Default values for llm-d-modelservice.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
+replicaCount: 1
+
+# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
+image:
+  repository: nginx
+  # This sets the pull policy for images.
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+  tag: ""
+
+# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
+imagePullSecrets: []
+# This is to override the chart name.
+nameOverride: ""
+fullnameOverride: ""
+
+# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # Automatically mount a ServiceAccount's API credentials?
+  automount: true
+  # Annotations to add to the service account
+  annotations: {}
+  # The name of the service account to use.
+  # If not set and create is true, a name is generated using the fullname template
+  name: ""
+
+# This is for setting Kubernetes Annotations to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
+podAnnotations: {}
+# This is for setting Kubernetes Labels to a Pod.
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+podLabels: {}
+
+podSecurityContext: {}
+  # fsGroup: 2000
+
+securityContext: {}
+  # capabilities:
+  #   drop:
+  #   - ALL
+  # readOnlyRootFilesystem: true
+  # runAsNonRoot: true
+  # runAsUser: 1000
+
+# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
+service:
+  # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+  type: ClusterIP
+  # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+  port: 80
+
+# This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
+ingress:
+  enabled: false
+  className: ""
+  annotations: {}
+    # kubernetes.io/ingress.class: nginx
+    # kubernetes.io/tls-acme: "true"
+  hosts:
+    - host: chart-example.local
+      paths:
+        - path: /
+          pathType: ImplementationSpecific
+  tls: []
+  #  - secretName: chart-example-tls
+  #    hosts:
+  #      - chart-example.local
+
+resources: {}
+  # We usually recommend not to specify default resources and to leave this as a conscious
+  # choice for the user. This also increases chances charts run on environments with little
+  # resources, such as Minikube. If you do want to specify resources, uncomment the following
+  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+
+# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
+livenessProbe:
+  httpGet:
+    path: /
+    port: http
+readinessProbe:
+  httpGet:
+    path: /
+    port: http
+
+# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
+autoscaling:
+  enabled: false
+  minReplicas: 1
+  maxReplicas: 100
+  targetCPUUtilizationPercentage: 80
+  # targetMemoryUtilizationPercentage: 80
+
+# Additional volumes on the output Deployment definition.
+volumes: []
+# - name: foo
+#   secret:
+#     secretName: mysecret
+#     optional: false
+
+# Additional volumeMounts on the output Deployment definition.
+volumeMounts: []
+# - name: foo
+#   mountPath: "/etc/foo"
+#   readOnly: true
+
+nodeSelector: {}
+
+tolerations: []
+
+affinity: {}
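
For reference, below is a minimal values sketch (hypothetical numbers and a placeholder image, not a tested configuration) showing only the keys the prefill templates above read. With `multinode: false` these values render the Deployment from prefill-deployment.yaml; with `multinode: true` they render the LeaderWorkerSet from prefill-lws.yaml, whose `leaderWorkerTemplate.size` is computed from the `parallelism` block by the `llm-d-modelservice.dataParallelism` helper.

```yaml
# Hypothetical override values -- a sketch, assuming only the keys consumed by the
# prefill templates above (multinode, prefill.replicas, prefill.autoscaling,
# prefill.parallelism, prefill.containers).
multinode: true            # true -> prefill-lws.yaml (LeaderWorkerSet); false -> prefill-deployment.yaml
prefill:
  replicas: 1              # the LWS template only emits replicas when autoscaling is disabled
  autoscaling:
    enabled: false
  parallelism:
    tensor: 1
    data: 2                # leaderWorkerTemplate.size is derived from this block
    dataLocal: 1
  containers:
  - name: vllm
    image: ghcr.io/llm-d/llm-d:0.0.8   # placeholder image for illustration
    mountModelVolume: true
```

The same `prefill.containers` list and `parallelism` values are passed to the `llm-d-modelservice.container` helper on both paths, so toggling `multinode` does not require rewriting the container spec.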