Skip to content
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ crash.*.log
# to change depending on the environment.
*.tfvars
*.tfvars.json
**/*.tfvars

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
Expand Down
4 changes: 2 additions & 2 deletions airflow/dags/cwl_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@
is_paused_upon_creation=False,
catchup=False,
schedule=None,
max_active_runs=100,
max_active_tasks=300,
max_active_runs=1000,
max_active_tasks=3000,
default_args=dag_default_args,
params={
"cwl_workflow": Param(
Expand Down
66 changes: 60 additions & 6 deletions airflow/helm/values.tmpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ scheduler:
values: ["on-demand"]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["c6i", "c5"] # Choosing compute-optimized instances
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"] # Scheduler might benefit from higher CPU
values: ["8"]
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand All @@ -117,13 +118,47 @@ triggerer:
keda:
enabled: true
minReplicaCount: 1
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # Restrict to 8-vCPU instances

postgresql:
enabled: false

pgbouncer:
enabled: true
replicas: 3
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # Restrict to 8-vCPU instances

webserverSecretKeySecretName: ${webserver_secret_name}

Expand All @@ -147,10 +182,11 @@ webserver:
values: ["on-demand"]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["c6i", "c5"] # Choosing compute-optimized instances
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"] # Balancing between CPU and memory
values: ["8"] # Balancing between CPU and memory
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand Down Expand Up @@ -184,10 +220,11 @@ workers:
- matchExpressions:
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
values: ["t3"]
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: ["r5"] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: ["2", "4"]
values: ["8"]
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "topology.kubernetes.io/zone"
Expand Down Expand Up @@ -263,6 +300,23 @@ dags:
dagProcessor:
enabled: true
replicas: 3
nodeSelector:
"karpenter.sh/nodepool": "airflow-core-components"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "karpenter.sh/capacity-type"
operator: "In"
values: [ "on-demand" ]
- key: "karpenter.k8s.aws/instance-family"
operator: "In"
# values: ["c6i", "c5"] # Choosing compute-optimized instances
values: [ "r5" ] # Choosing memory-optimized instance
- key: "karpenter.k8s.aws/instance-cpu"
operator: "In"
values: [ "8" ] # Restrict to 8-vCPU instances

env:
- name: "AIRFLOW_VAR_KUBERNETES_PIPELINE_NAMESPACE"
Expand Down
36 changes: 23 additions & 13 deletions airflow/plugins/unity_sps_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# Note: each Pod is assigned the same label to ensure (via the anti-affinity requirements)
# that two Pods with the same label cannot run on the same Node
SPS_DOCKER_CWL_IMAGE = "ghcr.io/unity-sds/unity-sps/sps-docker-cwl:2.5.5"
SPS_DOCKER_CWL_IMAGE = "ghcr.io/unity-sds/unity-sps/sps-docker-cwl:2.5.6"

NODE_POOL_DEFAULT = "airflow-kubernetes-pod-operator"
NODE_POOL_HIGH_WORKLOAD = "airflow-kubernetes-pod-operator-high-workload"
Expand All @@ -27,11 +27,16 @@
LOG_LEVEL_TYPE = {10: "DEBUG", 20: "INFO", 30: "WARNING", 40: "ERROR", 50: "CRITICAL"}

EC2_TYPES = {
"t3.micro": {
"desc": "General Purpose",
"cpu": 1,
"memory": 1,
},
# "t3.nano": {
# "desc": "General Purpose",
# "cpu": 1,
# "memory": 0.5,
# },
# "t3.micro": {
# "desc": "General Purpose",
# "cpu": 2,
# "memory": 1,
# },
"t3.small": {
"desc": "General Purpose",
"cpu": 2,
Expand Down Expand Up @@ -97,23 +102,28 @@
"cpu": 32,
"memory": 64,
},
"m5ad.large": {
"desc": "General Purpose with SSD storage",
"cpu": 2,
"memory": 8,
"c6i.12xlarge": {
"desc": "Compute Optimized",
"cpu": 48,
"memory": 96,
},
"c6i.16xlarge": {
"desc": "Compute Optimized",
"cpu": 64,
"memory": 128,
},
"m5ad.xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 4,
"memory": 16,
},
"m5ad.2xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 8,
"memory": 32,
},
"m5ad.4xlarge": {
"desc": "General Purpose with SSD storage",
"desc": "General Purpose with SSD local storage",
"cpu": 16,
"memory": 64,
},
Expand Down
19 changes: 0 additions & 19 deletions terraform-unity/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions terraform-unity/README.md

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions terraform-unity/modules/terraform-unity-sps-eks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,37 @@
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | ~> 1.8.2 |
| <a name="requirement_aws"></a> [aws](#requirement\_aws) | 5.67.0 |
| <a name="requirement_null"></a> [null](#requirement\_null) | 3.2.3 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_aws"></a> [aws](#provider\_aws) | 5.67.0 |
| <a name="provider_null"></a> [null](#provider\_null) | 3.2.3 |

## Modules

| Name | Source | Version |
|------|--------|---------|
| <a name="module_unity-eks"></a> [unity-eks](#module\_unity-eks) | git::https://github.com/unity-sds/unity-cs-infra.git//terraform-unity-eks_module | unity-sps-2.4.0 |
| <a name="module_unity-eks"></a> [unity-eks](#module\_unity-eks) | git::https://github.com/unity-sds/unity-cs-infra.git//terraform-unity-eks_module | unity-sps-2.4.1-hotfix1 |

## Resources

| Name | Type |
|------|------|
| [aws_iam_role_policy.sps_airflow_eks_inline_policy](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/resources/iam_role_policy) | resource |
| [null_resource.eks_post_deployment_actions](https://registry.terraform.io/providers/hashicorp/null/3.2.3/docs/resources/resource) | resource |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/data-sources/caller_identity) | data source |
| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/5.67.0/docs/data-sources/region) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_deployment_name"></a> [deployment\_name](#input\_deployment\_name) | The name of the deployment. | `string` | `""` | no |
| <a name="input_installprefix"></a> [installprefix](#input\_installprefix) | The install prefix for the service area (unused) | `string` | `""` | no |
| <a name="input_nodegroups"></a> [nodegroups](#input\_nodegroups) | A map of node group configurations | <pre>map(object({<br> create_iam_role = optional(bool)<br> iam_role_arn = optional(string)<br> ami_id = optional(string)<br> min_size = optional(number)<br> max_size = optional(number)<br> desired_size = optional(number)<br> instance_types = optional(list(string))<br> capacity_type = optional(string)<br> enable_bootstrap_user_data = optional(bool)<br> metadata_options = optional(map(any))<br> block_device_mappings = optional(map(object({<br> device_name = string<br> ebs = object({<br> volume_size = number<br> volume_type = string<br> encrypted = bool<br> delete_on_termination = bool<br> })<br> })))<br> }))</pre> | <pre>{<br> "defaultGroup": {<br> "block_device_mappings": {<br> "xvda": {<br> "device_name": "/dev/xvda",<br> "ebs": {<br> "delete_on_termination": true,<br> "encrypted": true,<br> "volume_size": 100,<br> "volume_type": "gp2"<br> }<br> }<br> },<br> "desired_size": 1,<br> "instance_types": [<br> "t3.xlarge"<br> ],<br> "max_size": 1,<br> "metadata_options": {<br> "http_endpoint": "enabled",<br> "http_put_response_hop_limit": 3<br> },<br> "min_size": 1<br> }<br>}</pre> | no |
| <a name="input_nodegroups"></a> [nodegroups](#input\_nodegroups) | A map of node group configurations | <pre>map(object({<br> create_iam_role = optional(bool)<br> iam_role_arn = optional(string)<br> ami_id = optional(string)<br> min_size = optional(number)<br> max_size = optional(number)<br> desired_size = optional(number)<br> instance_types = optional(list(string))<br> capacity_type = optional(string)<br> enable_bootstrap_user_data = optional(bool)<br> metadata_options = optional(map(any))<br> block_device_mappings = optional(map(object({<br> device_name = string<br> ebs = object({<br> volume_size = number<br> volume_type = string<br> encrypted = bool<br> delete_on_termination = bool<br> })<br> })))<br> }))</pre> | <pre>{<br> "defaultGroup": {<br> "block_device_mappings": {<br> "xvda": {<br> "device_name": "/dev/xvda",<br> "ebs": {<br> "delete_on_termination": true,<br> "encrypted": true,<br> "volume_size": 100,<br> "volume_type": "gp2"<br> }<br> }<br> },<br> "desired_size": 1,<br> "instance_types": [<br> "t3.2xlarge"<br> ],<br> "max_size": 1,<br> "metadata_options": {<br> "http_endpoint": "enabled",<br> "http_put_response_hop_limit": 3<br> },<br> "min_size": 1<br> }<br>}</pre> | no |
| <a name="input_project"></a> [project](#input\_project) | The project or mission deploying Unity SPS | `string` | `"unity"` | no |
| <a name="input_release"></a> [release](#input\_release) | The software release version. | `string` | `"24.4"` | no |
| <a name="input_service_area"></a> [service\_area](#input\_service\_area) | The service area owner of the resources being deployed | `string` | `"sps"` | no |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ variable "nodegroups" {
}))
default = {
defaultGroup = {
instance_types = ["t3.xlarge"]
instance_types = ["t3.2xlarge"]
min_size = 1
max_size = 1
desired_size = 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ resource "kubernetes_deployment" "ogc_processes_api" {
match_expressions {
key = "karpenter.k8s.aws/instance-cpu"
operator = "In"
values = ["2", "4"]
values = ["4"]
}
}
}
Expand Down
Loading