diff --git a/ansible/linux-slurmcompute.yml b/ansible/linux-slurmcompute.yml
new file mode 100644
index 00000000..ca4557b3
--- /dev/null
+++ b/ansible/linux-slurmcompute.yml
@@ -0,0 +1,6 @@
+---
+
+- hosts: all
+  become: yes
+  roles:
+    - linux-slurmcompute
diff --git a/ansible/roles/linux-slurmcompute/defaults/main.yml b/ansible/roles/linux-slurmcompute/defaults/main.yml
new file mode 100644
index 00000000..d4ac518a
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/defaults/main.yml
@@ -0,0 +1,9 @@
+openhpc_slurm_packages:
+  - ohpc-base-compute
+  - ohpc-slurm-client
+  - munge
+openhpc_extra_packages:
+  - lmod-ohpc
+  - slurm-libpmi-ohpc # to allow Intel MPI to work properly
+  - ohpc-gnu9-openmpi4-perf-tools # for hpctests
+  - openblas-gnu9-ohpc # for hpctests (HPL)
diff --git a/ansible/roles/linux-slurmcompute/tasks/bootstrap.yml b/ansible/roles/linux-slurmcompute/tasks/bootstrap.yml
new file mode 100644
index 00000000..53924cab
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/bootstrap.yml
@@ -0,0 +1,32 @@
+- name: Move rocky user
+  ansible.builtin.user:
+    name: rocky
+    home: /var/lib/rocky
+    move_home: true
+    local: true
+  become_method: "sudo"
+  # Need to change working directory, otherwise we try to switch back to a non-existent directory.
+  become_flags: '-i'
+
+- name: Reset ssh connection to allow user changes to affect ansible_user
+  meta: reset_connection
+
+- name: Set SELinux state and policy
+  ansible.posix.selinux:
+    state: permissive
+    policy: targeted
+
+- name: Update base image packages
+  ansible.builtin.dnf:
+    name: '*'
+    state: 'latest'
+  async: "{{ 30 * 60 }}" # wait for up to 30 minutes
+  poll: 15 # check every 15 seconds
+
+- name: Reboot to cope with package updates and SELinux changes
+  ansible.builtin.reboot:
+    post_reboot_delay: 30
+
+- name: Wait for hosts to be reachable
+  ansible.builtin.wait_for_connection:
+    sleep: 15
diff --git a/ansible/roles/linux-slurmcompute/tasks/main.yml b/ansible/roles/linux-slurmcompute/tasks/main.yml
new file mode 100644
index 00000000..06bcc105
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/main.yml
@@ -0,0 +1,25 @@
+- name: Get builder commit
+  ansible.builtin.shell:
+    cmd: git describe --all --long --dirty
+  register: builder_commit
+  delegate_to: localhost
+
+- name: Write builder commit to /var/lib/misc/build.txt
+  ansible.builtin.copy:
+    dest: /var/lib/misc/build.txt
+    content: "{{ builder_commit.stdout }}"
+
+- include_role:
+    name: linux-ansible-init
+
+- import_tasks: bootstrap.yml
+#- import_tasks: reimage.yml # TODO: if required to support slurm-driven rebuild from compute nodes
+- import_tasks: slurm.yml
+- import_tasks: ood_vnc.yml
+- import_tasks: monitoring.yml
+
+- name: Delete /etc/resolv.conf
+  # required because if cloud-init (rather than NetworkManager) controls this on next boot, it won't be entirely overwritten
+  ansible.builtin.file:
+    path: /etc/resolv.conf
+    state: absent
diff --git a/ansible/roles/linux-slurmcompute/tasks/monitoring.yml b/ansible/roles/linux-slurmcompute/tasks/monitoring.yml
new file mode 100644
index 00000000..bf933dca
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/monitoring.yml
@@ -0,0 +1,3 @@
+- name: Deploy node_exporter
+  import_role:
+    name: cloudalchemy.node_exporter
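+
+# A hedged sketch, not part of this change: if the exporter needs tuning for
+# this image, the cloudalchemy.node_exporter role can take vars on the import
+# (check that role's documented defaults before relying on these names):
+#
+# - name: Deploy node_exporter
+#   import_role:
+#     name: cloudalchemy.node_exporter
+#   vars:
+#     node_exporter_version: 1.3.1
+#     node_exporter_web_listen_address: "0.0.0.0:9100"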
diff --git a/ansible/roles/linux-slurmcompute/tasks/ood_jupyter.yml b/ansible/roles/linux-slurmcompute/tasks/ood_jupyter.yml
new file mode 100644
index 00000000..842a5a3c
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/ood_jupyter.yml
@@ -0,0 +1,16 @@
+# See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html
+# - Will already have openssl and lmod
+
+- name: Install jupyter venv
+  # Requires a separate step so that the upgraded pip is used to install packages
+  ansible.builtin.pip:
+    name: pip
+    state: latest
+    virtualenv: /opt/jupyter
+    virtualenv_command: python3 -m venv
+
+- name: Install jupyter package in venv
+  ansible.builtin.pip:
+    name: jupyter
+    virtualenv: /opt/jupyter
+    virtualenv_command: python3 -m venv
diff --git a/ansible/roles/linux-slurmcompute/tasks/ood_vnc.yml b/ansible/roles/linux-slurmcompute/tasks/ood_vnc.yml
new file mode 100644
index 00000000..22f6f57a
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/ood_vnc.yml
@@ -0,0 +1,45 @@
+- name: Enable TurboVNC repo
+  ansible.builtin.get_url:
+    url: https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo
+    dest: /etc/yum.repos.d/TurboVNC.repo
+
+- name: Install EPEL
+  ansible.builtin.dnf:
+    name: epel-release
+
+- name: Install VNC-related packages
+  ansible.builtin.dnf:
+    name:
+      - turbovnc
+      - nmap-ncat
+      - python3
+
+- name: Install Xfce desktop
+  ansible.builtin.dnf:
+    name: '@Xfce'
+
+- name: Install websockify venv
+  # Requires a separate step so that the upgraded pip is used to install packages
+  ansible.builtin.pip:
+    name: pip
+    state: latest
+    virtualenv: /opt/websockify
+    virtualenv_command: python3 -m venv
+  tags: install
+
+- name: Install websockify package in venv
+  ansible.builtin.pip:
+    name: websockify
+    virtualenv: /opt/websockify
+    virtualenv_command: python3 -m venv
+  tags: install
+
+- name: Symlink websockify to where Open OnDemand expects it
+  ansible.builtin.file:
+    src: /opt/websockify/bin/websockify
+    dest: /opt/websockify/run
+    state: link
+
+- name: Disable screensaver # as users might not have passwords
+  ansible.builtin.dnf:
+    name: xfce4-screensaver
+    state: absent
diff --git a/ansible/roles/linux-slurmcompute/tasks/slurm.yml b/ansible/roles/linux-slurmcompute/tasks/slurm.yml
new file mode 100644
index 00000000..5fca55ec
--- /dev/null
+++ b/ansible/roles/linux-slurmcompute/tasks/slurm.yml
@@ -0,0 +1,23 @@
+- name: Install ohpc-release package
+  yum:
+    name: "http://repos.openhpc.community/OpenHPC/2/CentOS_8/x86_64/ohpc-release-2-1.el8.x86_64.rpm"
+    state: present
+    disable_gpg_check: True
+
+- name: Enable PowerTools repo
+  community.general.ini_file:
+    path: /etc/yum.repos.d/CentOS-PowerTools.repo
+    section: powertools # NB: repo section name is lower-case from CentOS 8.3
+    create: false
+    option: enabled
+    value: 1
+
+- name: Install packages
+  ansible.builtin.dnf:
+    name: "{{ openhpc_slurm_packages + openhpc_extra_packages }}"
+
+# - munge key
+# - JobComp logfile exists?
+# - name: Set slurmctld location for configless operation
+#- name: Ensure Slurm service state
+# make sure munge and slurm wait for cloud-init
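+
+# A hedged sketch for the configless-operation item above (not part of this
+# change, untested; `slurmctld_host` is a hypothetical variable that would
+# need to be supplied, e.g. by ansible-init at first boot):
+#
+# - name: Set slurmctld location for configless operation
+#   ansible.builtin.lineinfile:
+#     path: /etc/sysconfig/slurmd
+#     line: "SLURMD_OPTIONS='--conf-server {{ slurmctld_host }}'"
+#     create: true
+#
+# - name: Ensure Slurm service state
+#   ansible.builtin.systemd:
+#     name: slurmd
+#     enabled: true
+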
diff --git a/packer/linux-slurmcompute.pkr.hcl b/packer/linux-slurmcompute.pkr.hcl
new file mode 100644
index 00000000..fd55b2c4
--- /dev/null
+++ b/packer/linux-slurmcompute.pkr.hcl
@@ -0,0 +1,82 @@
+# Use like:
+#   $ PACKER_LOG=1 packer build --on-error=ask -var-file=.pkrvars.hcl
+
+# "timestamp" template function replacement:
+locals { timestamp = formatdate("YYMMDD-hhmm", timestamp()) }
+
+variable "source_image_name" {
+  type = string
+}
+
+variable "network" {
+  type = string
+}
+
+variable "floating_ip_network" {
+  type = string
+}
+
+variable "flavor" {
+  type = string
+}
+
+variable "security_groups" {
+  type = list(string)
+}
+
+variable "volume_size" {
+  type = number
+  default = 10
+}
+
+variable "disk_format" {
+  type = string
+  default = "qcow2"
+}
+
+variable "distro_name" {
+  type = string
+}
+
+variable "ssh_username" {
+  type = string
+}
+
+source "openstack" "linux-slurmcompute" {
+  image_name = "${var.distro_name}-slurmcompute-${local.timestamp}"
+  image_visibility = "private"
+  image_disk_format = "${var.disk_format}"
+
+  source_image_name = "${var.source_image_name}"
+  flavor = "${var.flavor}"
+  networks = ["${var.network}"]
+  security_groups = "${var.security_groups}"
+  floating_ip_network = "${var.floating_ip_network}"
+
+  use_blockstorage_volume = true
+  volume_size = "${var.volume_size}"
+
+  communicator = "ssh"
+  ssh_username = "${var.ssh_username}"
+  ssh_clear_authorized_keys = true
+}
+
+build {
+  source "source.openstack.linux-slurmcompute" { }
+
+  provisioner "ansible" {
+    galaxy_file = "${path.root}/../requirements.yml"
+    playbook_file = "${path.root}/../ansible/linux-slurmcompute.yml"
+    use_proxy = false
+    extra_arguments = [
+      "-v",
+    ]
+    ansible_env_vars = ["ANSIBLE_SSH_RETRIES=10"]
+  }
+
+  post-processor "manifest" {
+    custom_data = {
+      source = "${source.name}"
+    }
+  }
+}
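+
+# A possible .pkrvars.hcl for the "Use like" invocation at the top of this
+# file; all values here are illustrative assumptions for an OpenStack cloud,
+# not tested defaults:
+#
+#   source_image_name   = "Rocky-8-GenericCloud"
+#   distro_name         = "rockylinux-8"
+#   ssh_username        = "rocky"
+#   flavor              = "general.v1.small"
+#   network             = "build-network"
+#   floating_ip_network = "external"
+#   security_groups     = ["default"]
+#   volume_size         = 15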