diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index d15d9e93b..c65b1ae3e 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -14,7 +14,7 @@ jobs: env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack - TF_VAR_cluster_name: ci${{ github.run_id }} + CI_CLUSTER_NAME: ci${{ github.run_id }} CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - uses: actions/checkout@v2 @@ -43,10 +43,6 @@ jobs: with: terraform: v1.5.5 - - name: Initialise terraform - run: terraform init - working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform - - name: Write clouds.yaml run: | mkdir -p ~/.config/openstack/ @@ -67,8 +63,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $APPLIANCES_ENVIRONMENT_ROOT/terraform - terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars" + ansible-playbook -v ansible/infra.yml -e terraform_autoapprove=true - name: Delete infrastructure if provisioning failed run: | diff --git a/ansible/.gitignore b/ansible/.gitignore index 47a79a28a..120fc43a1 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -55,3 +55,5 @@ roles/* !roles/persist_hostkeys/ !roles/persist_hostkeys/** !roles/requirements.yml +!roles/terraform/ +!roles/terraform/** diff --git a/ansible/infra.yml b/ansible/infra.yml new file mode 100644 index 000000000..ad6a0626a --- /dev/null +++ b/ansible/infra.yml @@ -0,0 +1,6 @@ +- hosts: localhost + become: no + gather_facts: no + tasks: + - import_role: + name: terraform diff --git a/ansible/roles/terraform/defaults/main.yml b/ansible/roles/terraform/defaults/main.yml new file mode 100644 index 000000000..63ae56d81 --- /dev/null +++ b/ansible/roles/terraform/defaults/main.yml @@ -0,0 +1,7 @@ +terraform_templates: [main.tf.j2] +terraform_project_path: +terraform_autoapprove: false +terraform_binary_path: +terraform_backend_config: {} +terraform_variables: {} +terraform_state: present diff --git 
a/ansible/roles/terraform/filter_plugins/terraform.py b/ansible/roles/terraform/filter_plugins/terraform.py new file mode 100644 index 000000000..e4793eb36 --- /dev/null +++ b/ansible/roles/terraform/filter_plugins/terraform.py @@ -0,0 +1,17 @@ +import re + +def expand_hostlist(hostlist): + match = re.search(r'(\w+)-\[(\d+)-(\d+)\]', hostlist) + if match: + prefix = match.groups()[0] + start, end = [int(v) for v in match.groups()[1:]] + hosts = [f'{prefix}-{n}' for n in range(start, end+1)] + return hosts + else: + return [hostlist,] + +class FilterModule(object): + def filters(self): + return { + 'expand_hostlist': expand_hostlist, + } diff --git a/ansible/roles/terraform/tasks/apply.yml b/ansible/roles/terraform/tasks/apply.yml new file mode 100644 index 000000000..3590d19d0 --- /dev/null +++ b/ansible/roles/terraform/tasks/apply.yml @@ -0,0 +1,44 @@ +- name: Create Terraform plan + community.general.terraform: + binary_path: "{{ terraform_binary_path or omit }}" + project_path: "{{ terraform_project_path }}" + state: planned + backend_config: "{{ terraform_backend_config }}" + plan_file: terraform.plan + force_init: yes + init_reconfigure: yes + variables: "{{ terraform_variables }}" + register: _tf_plan + +- name: Show Terraform plan + debug: + msg: "{{ _tf_plan.stdout }}" + +- name: Prompt to approve Terraform plan execution + pause: + prompt: "Do you want to execute this plan? (Only 'yes' executes)" + register: _tf_approve_plan + when: + - "'No changes. Your infrastructure matches the configuration.' 
not in _tf_plan.stdout" + - 'not terraform_autoapprove | bool' + +- name: End host if Terraform plan is not approved + ansible.builtin.meta: end_host + when: "not (( terraform_autoapprove | bool ) or ( _tf_approve_plan.user_input | default(false) | bool ))" + +- name: Provision infrastructure using Terraform + community.general.terraform: + binary_path: "{{ terraform_binary_path or omit }}" + project_path: "{{ terraform_project_path }}" + state: "{{ terraform_state }}" + backend_config: "{{ terraform_backend_config }}" + force_init: yes + init_reconfigure: yes + variables: "{{ terraform_variables }}" + plan_file: terraform.plan + register: terraform_provision + +- name: Show Terraform provision output + debug: + msg: "{{ terraform_provision.stdout }}" + when: "'stdout' in terraform_provision" diff --git a/ansible/roles/terraform/tasks/main.yml b/ansible/roles/terraform/tasks/main.yml new file mode 100644 index 000000000..8ba99e91e --- /dev/null +++ b/ansible/roles/terraform/tasks/main.yml @@ -0,0 +1,7 @@ +- name: Template Terraform configurations + template: + src: "{{ item }}" + dest: "{{ terraform_project_path }}/{{ (item | splitext | first) if item.endswith('.j2') else item }}" + loop: "{{ terraform_templates }}" + +- include_tasks: apply.yml diff --git a/ansible/roles/terraform/templates/main.tf.j2 b/ansible/roles/terraform/templates/main.tf.j2 new file mode 100644 index 000000000..fcb53f99f --- /dev/null +++ b/ansible/roles/terraform/templates/main.tf.j2 @@ -0,0 +1,148 @@ +#jinja2:lstrip_blocks: True +terraform { + required_version = ">= 0.14" + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + } + } +} + +# --- volumes --- +{% for volume_name, volume in cluster_volumes.items() %} +resource "openstack_blockstorage_volume_v3" "{{ volume_name }}" { + name = "{{ cluster_name }}-{{ volume_name }}" + description = "{{ volume.description }}" + size = "{{ volume.size }}" +} +{% endfor %} + +{% for instance_hostlist, _instance 
in cluster_instances.items() %} +{% set hostgroup = instance_hostlist.split('-')[0] %}{# NB: assumes prefix- format #} +### --- hostgroup {{ hostgroup }} --- +{% set instance = cluster_instance_defaults | combine(_instance) %} + +{# NB: Currently secgroups apply to all ports on each instance #} +data "openstack_networking_secgroup_v2" "{{ hostgroup }}" { + for_each = toset({{ instance.secgroup_names | to_json }}) + + name = each.key +} + +{% for port in instance.ports %} +{% set port_tf_name = (hostgroup, port.network_name) | join('_') | replace('-', '_') %} + +data "openstack_networking_network_v2" "{{ port_tf_name }}" { + name = "{{ port.network_name }}" +} + +{% if 'subnet_name' in port %} +data "openstack_networking_subnet_v2" "{{ port_tf_name }}" { + name = "{{ port.subnet_name }}" +} +{% endif %} + +resource "openstack_networking_port_v2" "{{ port_tf_name }}" { + for_each = toset({{ instance_hostlist | expand_hostlist | to_json }}) + + name = "{{ cluster_name }}-${each.key}-{{ port.network_name }}" + network_id = data.openstack_networking_network_v2.{{ port_tf_name }}.id + + {% if 'subnet_name' in port %} + fixed_ip { + subnet_id = data.openstack_networking_subnet_v2.{{ port_tf_name }}.id + } + {% endif %} + + security_group_ids = [for sg in data.openstack_networking_secgroup_v2.{{ hostgroup }}: sg.id] + + binding { + vnic_type = "{{ port.vnic_type | default('normal') }}" + profile = {{ port.binding_profile | to_json if 'binding_profile' in port else 'null' }} + } +} +{% endfor %}{# instance.ports #} + +data "openstack_images_image_v2" "{{ hostgroup }}" { + name = "{{ instance.image_name }}" +} + +resource "openstack_compute_instance_v2" "{{ hostgroup }}" { + for_each = toset({{ instance_hostlist | expand_hostlist | to_json }}) + + name = "{{ cluster_name }}-${each.key}" + image_name = data.openstack_images_image_v2.{{ hostgroup }}.name + {% if instance and 'flavor_name' in instance %} + flavor_name = "{{ instance.flavor_name }}" + {% else %} + flavor_id = 
"{{ instance.flavor_id }}" + {% endif %} + key_pair = "{{ instance.key_pair }}" + {% if instance.volumes | default([]) | length or instance.root_volume_size | default(None) %} + # root disk: + block_device { + uuid = data.openstack_images_image_v2.{{ hostgroup }}.id + source_type = "image" + destination_type = "{{ 'volume' if instance.root_volume_size | default(None) else 'local' }}" + volume_size = {{ instance.root_volume_size if instance.root_volume_size | default(None) else 'null' }} + boot_index = 0 + delete_on_termination = true + } + {% for volume_name in instance.volumes | default([]) %} + block_device { + destination_type = "volume" + source_type = "volume" + boot_index = -1 + uuid = openstack_blockstorage_volume_v3.{{ volume_name }}.id + } + {% endfor %} + {% endif %} + {% for port in instance.ports %} + {% set port_tf_name = (hostgroup, port.network_name) | join('_') | replace('-', '_') %} + network { + port = openstack_networking_port_v2.{{ port_tf_name }}[each.key].id + } + {% endfor %} + + metadata = { + environment_root = "{{ appliances_environment_root }}" + } + + user_data = <<-EOF + #cloud-config + fs_setup: + {% for volume_name, volume in instance.get('volumes', {}).items() %} + - label: {{ volume_name }} + filesystem: ext4 + device: {{ volume.device_path }} + partition: auto + {% endfor %} + mounts: + {% for volume_name, volume in instance.get('volumes', {}).items() %} + - [LABEL={{ volume_name }}, {{ volume.mount_point }}, auto, "{{ volume.mount_options | default('') }}" ] + {% endfor %} + EOF + +} + +{% endfor %}{# cluster_instances.items() #} + +resource "local_file" "hosts" { + content = <<-EOF + {% for instance_hostlist, _instance in cluster_instances.items() %} + {% set hostgroup = instance_hostlist.split('-')[0] %} + [{{ hostgroup }}] + {% for hostkey in instance_hostlist | expand_hostlist %} + {% set inventory_hostname = cluster_name + '-' + hostkey %} + {{ inventory_hostname }} ansible_host=${openstack_compute_instance_v2.{{ hostgroup 
}}["{{ hostkey }}"].network[0].fixed_ip_v4} + {% endfor %} + + {% for extra_group in _instance.extra_groups | default([]) %} + [{{ extra_group }}:children] + {{ hostgroup }} + {% endfor %} + + {% endfor %} + EOF + filename = "../inventory/hosts" +} diff --git a/environments/.stackhpc/inventory/group_vars/all/cluster.yml b/environments/.stackhpc/inventory/group_vars/all/cluster.yml new file mode 100644 index 000000000..6166af28b --- /dev/null +++ b/environments/.stackhpc/inventory/group_vars/all/cluster.yml @@ -0,0 +1,36 @@ +ci_cluster_name: "{{ lookup('env', 'CI_CLUSTER_NAME') }}" +dev_cluster_name: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') | basename | split('-') | last }}" # dev directories use slurm-app-$FEATURE for directories +cluster_name: "{{ ci_cluster_name | default(dev_cluster_name, true) }}" # true means use default if value is empty string + +cluster_volumes: # can reduce the size a lot for dev/CI + state: + description: Persistent state + size: 10 # GB + home: + description: User home directories + size: 10 # GB + +# TODO: support SMS-labs here too +cluster_instance_defaults: + image_name: openhpc-230926-1343-e3d3e307 # https://github.com/stackhpc/ansible-slurm-appliance/pull/314 + flavor_name: vm.ska.cpu.general.small + key_pair: slurm-app-ci + ports: + - network_name: portal-internal # required str (tf standard: network_id) + secgroup_names: + - default + +cluster_instances: + login: + secgroup_names: [default, SSH] + compute-[0-3]: + control: + flavor_name: vm.ska.cpu.general.quarter + volumes: + state: + device_path: /dev/sdb + mount_point: /var/lib/state + home: + device_path: /dev/sdc + mount_point: /exports/home + mount_options: x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf deleted file mode 100644 index 8772189bb..000000000 --- a/environments/.stackhpc/terraform/main.tf +++ /dev/null @@ -1,81 +0,0 @@ -# This terraform 
configuration uses the "skeleton" terraform, so that is checked by CI. - -variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" -} - -variable "cluster_name" { - type = string - description = "Name for cluster, used as prefix for resources - set by environment var in CI" -} - -variable "cluster_image" { - description = "single image for all cluster nodes - a convenience for CI" - type = string - default = "openhpc-231206-1648-9d6aa4e4" # https://github.com/stackhpc/ansible-slurm-appliance/pull/340 - # default = "Rocky-8-GenericCloud-Base-8.8-20230518.0.x86_64.qcow2" -} - -variable "cluster_net" {} - -variable "cluster_subnet" {} - -variable "vnic_type" {} - -variable "control_node_flavor" {} - -variable "other_node_flavor" {} - -variable "volume_backed_instances" { - default = false -} - -variable "state_volume_device_path" {} - -variable "home_volume_device_path" {} - -module "cluster" { - source = "../../skeleton/{{cookiecutter.environment}}/terraform/" - - cluster_name = var.cluster_name - cluster_net = var.cluster_net - cluster_subnet = var.cluster_subnet - vnic_type = var.vnic_type - key_pair = "slurm-app-ci" - control_node = { - flavor: var.control_node_flavor - image: var.cluster_image - } - login_nodes = { - login-0: { - flavor: var.other_node_flavor - image: var.cluster_image - } - } - compute_types = { - small: { - flavor: var.other_node_flavor - image: var.cluster_image - } - extra: { - flavor: var.other_node_flavor - image: var.cluster_image - } - } - compute_nodes = { - compute-0: "small" - compute-1: "small" - compute-2: "extra" - compute-3: "extra" - } - volume_backed_instances = var.volume_backed_instances - - environment_root = var.environment_root - # Can reduce volume size a lot for short-lived CI clusters: - state_volume_size = 10 - home_volume_size = 20 - - state_volume_device_path = var.state_volume_device_path - home_volume_device_path = var.home_volume_device_path -} 
diff --git a/environments/common/inventory/group_vars/all/cluster.yml b/environments/common/inventory/group_vars/all/cluster.yml new file mode 100644 index 000000000..a43c7206c --- /dev/null +++ b/environments/common/inventory/group_vars/all/cluster.yml @@ -0,0 +1,38 @@ +cluster_name: "{{ undef(hint='cluster_name must be defined in environment') }}" + +cluster_volumes: + state: + description: Persistent state + size: 150 # GB + home: + description: User home directories + size: 100 # GB + +cluster_instance_defaults: # overridden by entries in cluster_instances + # image_name: required str + # flavor_name: required str + # key_pair: required str + # ports: required list of maps: + # - network_name: # required str (tf standard: network_id) + # subnet_name: # optional str (tf standard: subnet_id) + # vnic_type: optional str + # binding_profile: optional yaml + # root_volume_size: # optional int, size in GB to use volume-backed instances. Missing or null to use local root disk + # volumes: [] # optional list of maps, volumes to attach. TODO: describe these + # extra_groups: [] # optional list of str, names of extra inventory groups for this host + secgroup_names: + - default + +cluster_instances: # each key below becomes a TF for_each set + login: + secgroup_names: [default, SSH] + compute-[0-1]: # TODO: this is problematic for cases where we want to define a count. 
Consider making the key the primary group, and adding a hostexpression/count parameter + control: + volumes: + state: + device_path: /dev/sdb + mount_point: /var/lib/state + home: + device_path: /dev/sdc + mount_point: /exports/home + mount_options: x-systemd.required-by=nfs-server.service,x-systemd.before=nfs-server.service diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 1cb963657..c469c1a05 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -2,7 +2,7 @@ # See: https://github.com/stackhpc/ansible-role-openhpc # for variable definitions - +openhpc_cluster_name: "{{ cluster_name }}" openhpc_enable: control: "{{ inventory_hostname in groups['control'] }}" batch: "{{ inventory_hostname in groups['compute'] }}" diff --git a/environments/common/inventory/group_vars/all/terraform.yml b/environments/common/inventory/group_vars/all/terraform.yml new file mode 100644 index 000000000..50564ead7 --- /dev/null +++ b/environments/common/inventory/group_vars/all/terraform.yml @@ -0,0 +1 @@ +terraform_project_path: "{{ appliances_environment_root }}/terraform"