
Commit b2843c4 (parent: 6c9652e)
Squash Commit of changes for v2.11.1

100 files changed: +5576 -1688 lines


README.md (+22 -15)

@@ -27,7 +27,9 @@ Allow dynamic-group instance_principal to manage dns in compartment compartmentName
 ```
 or:
 
-`Allow dynamic-group instance_principal to manage all-resources in compartment compartmentName`
+```
+Allow dynamic-group instance_principal to manage all-resources in compartment compartmentName
+```
 
 
 ## Supported OS:
@@ -67,21 +69,24 @@ The resize.sh is deployed on the controller node as part of the HPC cluster Stack
 usage: resize.sh [-h] [--compartment_ocid COMPARTMENT_OCID]
                  [--cluster_name CLUSTER_NAME] [--nodes NODES [NODES ...]]
                  [--no_reconfigure] [--user_logging] [--force] [--remove_unreachable]
-                 [{add,remove,list,reconfigure}] [number]
-
+                 [{add,remove,remove_unreachable,list,reconfigure}] [number] [--quiet]
 
 Script to resize the CN
 
 positional arguments:
   {add,remove,remove_unreachable,list,reconfigure}
-                        Mode type. add/remove node options, implicitly
-                        configures newly added nodes. Also implicitly
-                        reconfigure/restart services like Slurm to recognize
-                        new nodes. Similarly for remove option, terminates
-                        nodes and implicitly reconfigure/restart services like
-                        Slurm on rest of the cluster nodes to remove reference
-                        to deleted nodes.
-  number                Number of nodes to add or delete if a list of
-                        hostnames is not defined
+                        Mode type. add/remove node options, implicitly
+                        configures newly added nodes. Also implicitly
+                        reconfigure/restart services like Slurm to recognize
+                        new nodes. Similarly for remove option, terminates
+                        nodes and implicitly reconfigure/restart services like
+                        Slurm on rest of the cluster nodes to remove reference
+                        to deleted nodes. IMPORTANT: remove or remove_unreachable
+                        means delete the node from the cluster which means terminate
+                        the node. remove_unreachable should be used to remove specific
+                        nodes which are no longer reachable via ssh. It gives you control
+                        on which nodes will be terminated by passing the --nodes parameter.
+  number                Number of nodes to add or delete if a list of
+                        hostnames is not defined.
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -102,9 +107,11 @@ optional arguments:
   --ansible_crucial     If present during reconfiguration, only crucial
                         ansible playbooks will be executed on the live nodes.
                         Non live nodes will be removed
-  --remove_unreachable  If present, nodes that are not sshable will be terminated
-                        before running the action that was requested
-                        (Example Adding a node)
+  --remove_unreachable  If present, ALL nodes that are not sshable will be terminated
+                        before running the action that was requested (Example Adding a node).
+                        CAUTION: Use this only if you want to remove ALL nodes that
+                        are unreachable. Instead, remove specific nodes that are
+                        unreachable by using positional argument remove_unreachable.
   --quiet               If present, the script will not prompt for a response when
                         removing nodes and will not give a reminder to save data
                         from nodes that are being removed
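The distinction the new help text draws is the heart of this change: the positional remove_unreachable mode terminates only the nodes named with --nodes, while the --remove_unreachable flag sweeps away every unreachable node in the cluster. A minimal sketch of the targeted form, run from the controller node; the hostnames are placeholders:

```
# Terminate only these two unreachable nodes (hostnames are hypothetical);
# --quiet suppresses the confirmation prompt and the data-save reminder.
./resize.sh remove_unreachable --nodes compute-1-node-1 compute-1-node-2 --quiet
```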

autoscaling/crontab/autoscale_slurm.sh (+6 -1)

@@ -214,7 +214,12 @@ def getstatus_slurm():
 
             nodes=int(new_line[3])
             jobID=int(new_line[1])
-            cluster_to_build.append([nodes,instanceType,queue,jobID,user])
+            if isPermanent(config,queue,instanceType) is None :
+                continue
+            elif isPermanent(config,queue,instanceType):
+                continue
+            else:
+                cluster_to_build.append([nodes,instanceType,queue,jobID,user])
 
     cluster_to_destroy=[]
     current_nodes={}

autoscaling/tf_init/cluster-network-configuration.tf (+4 -1)

@@ -11,7 +11,10 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configuration" {
   compartment_id = var.targetCompartment
   create_vnic_details {
   }
-  display_name = local.cluster_name
+  freeform_tags = {
+    "cluster_name" = local.cluster_name
+    "parent_cluster" = local.cluster_name
+  }
   metadata = {
     # TODO: add user key to the authorized_keys
     ssh_authorized_keys = file("/home/${var.controller_username}/.ssh/id_rsa.pub")
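With display_name replaced by freeform_tags (here and in the identical change to instance-pool-configuration.tf below), the cluster an instance belongs to is carried by its tags rather than its name. One way to check the tags afterwards with the OCI CLI, as a sketch only: the compartment OCID and cluster name are placeholders, and the JMESPath --query filter is illustrative, not something this repo ships:

```
# List display names of instances whose freeform tag cluster_name is "mycluster"
oci compute instance list \
  --compartment-id ocid1.compartment.oc1..exampleuniqueID \
  --query "data[?\"freeform-tags\".\"cluster_name\"=='mycluster'].\"display-name\""
```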

autoscaling/tf_init/controller_update.tf (+3 -1)

@@ -33,6 +33,7 @@ resource "local_file" "inventory" {
     rdma_netmask = cidrnetmask(var.rdma_subnet),
     zone_name = var.zone_name,
     dns_entries = var.dns_entries,
+    vcn_compartment = var.vcn_compartment,
     nfs = var.use_scratch_nfs ? local.cluster_instances_names[0] : "",
     scratch_nfs = var.use_scratch_nfs,
     cluster_nfs = var.use_cluster_nfs,
@@ -82,7 +83,8 @@ resource "local_file" "inventory" {
     use_compute_agent=var.use_compute_agent,
     healthchecks=var.healthchecks,
     change_hostname=var.change_hostname,
-    hostname_convention=var.hostname_convention
+    hostname_convention=var.hostname_convention,
+    ons_topic_ocid=var.ons_topic_ocid
   })
   filename = "${local.controller_path}/inventory"
 }

autoscaling/tf_init/data.tf (+2 -2)

@@ -55,13 +55,13 @@ data "oci_core_vcn" "vcn" {
 }
 
 data "oci_dns_views" "dns_views" {
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   scope = "PRIVATE"
   display_name = data.oci_core_vcn.vcn.display_name
 }
 
 data "oci_dns_zones" "dns_zones" {
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   name = "${var.zone_name}"
   zone_type = "PRIMARY"
   scope = "PRIVATE"
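These data sources now look up the private DNS view and zone in the VCN's compartment instead of the deployment's target compartment. A quick way to confirm the view is actually visible there, assuming the OCI CLI's dns view list command is available in your CLI version; the OCID is a placeholder:

```
# Confirm the private DNS view for the VCN exists in the VCN compartment
oci dns view list \
  --compartment-id ocid1.compartment.oc1..exampleuniqueID \
  --scope PRIVATE
```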

autoscaling/tf_init/instance-pool-configuration.tf (+4 -1)

@@ -11,7 +11,10 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" {
   compartment_id = var.targetCompartment
   create_vnic_details {
   }
-  display_name = local.cluster_name
+  freeform_tags = {
+    "cluster_name" = local.cluster_name
+    "parent_cluster" = local.cluster_name
+  }
   metadata = {
     # TODO: add user key to the authorized_keys
     ssh_authorized_keys = file("/home/${var.controller_username}/.ssh/id_rsa.pub")

autoscaling/tf_init/inventory.tpl (+3 -1)

@@ -76,6 +76,8 @@ sacct_limits=${sacct_limits}
 use_compute_agent=${use_compute_agent}
 zone_name=${zone_name}
 dns_entries=${dns_entries}
+vcn_compartment=${vcn_compartment}
 healthchecks=${healthchecks}
 change_hostname=${change_hostname}
-hostname_convention=${hostname_convention}
+hostname_convention=${hostname_convention}
+ons_topic_ocid=${ons_topic_ocid}

autoscaling/tf_init/network.tf (+10 -10)

@@ -1,15 +1,15 @@
 resource "oci_core_vcn" "vcn" {
   count = var.use_existing_vcn ? 0 : 1
   cidr_block = var.vcn_subnet
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   display_name = "${local.cluster_name}_VCN"
   dns_label = "cluster"
 }
 
 resource "oci_core_security_list" "internal-security-list" {
   count = var.use_existing_vcn ? 0 : 1
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
 
   ingress_security_rules {
     protocol = "all"
@@ -41,7 +41,7 @@ resource "oci_core_security_list" "internal-security-list" {
 resource "oci_core_security_list" "public-security-list" {
   count = var.use_existing_vcn ? 0 : 1
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
 
   ingress_security_rules {
     protocol = "all"
@@ -83,22 +83,22 @@ resource "oci_core_security_list" "public-security-list" {
 resource "oci_core_internet_gateway" "ig1" {
   count = var.use_existing_vcn ? 0 : 1
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   display_name = "${local.cluster_name}_internet-gateway"
 }
 
 resource "oci_core_nat_gateway" "ng1" {
   count = var.use_existing_vcn ? 0 : 1
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   display_name = "${local.cluster_name}_nat-gateway"
 }
 
 
 resource "oci_core_service_gateway" "sg1" {
   count = var.use_existing_vcn ? 0 : 1
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   display_name = "${local.cluster_name}_service-gateway"
 
   services {
@@ -108,7 +108,7 @@ resource "oci_core_service_gateway" "sg1" {
 
 resource "oci_core_route_table" "public_route_table" {
   count = var.use_existing_vcn ? 0 : 1
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   vcn_id = oci_core_vcn.vcn[0].id
   display_name = "${local.cluster_name}_public_route_table"
 
@@ -122,7 +122,7 @@ resource "oci_core_route_table" "public_route_table" {
 resource "oci_core_route_table" "private_route_table" {
   count = var.use_existing_vcn ? 0 : 1
   display_name = "${local.cluster_name}_private_route_table"
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   vcn_id = oci_core_vcn.vcn[0].id
 
   route_rules {
@@ -142,7 +142,7 @@ resource "oci_core_subnet" "public-subnet" {
   count = var.use_existing_vcn ? 0 : 1
   # availability_domain = var.ad
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   cidr_block = trimspace(var.public_subnet)
   security_list_ids = [oci_core_security_list.public-security-list[0].id]
   dns_label = "public"
@@ -154,7 +154,7 @@ resource "oci_core_subnet" "private-subnet" {
   count = var.use_existing_vcn ? 0 : 1
   # availability_domain = var.ad
   vcn_id = oci_core_vcn.vcn[0].id
-  compartment_id = var.targetCompartment
+  compartment_id = var.vcn_compartment
   cidr_block = trimspace(var.private_subnet)
   security_list_ids = [oci_core_security_list.internal-security-list[0].id]
   dns_label = "private"
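All of the VCN-level resources above now take their compartment from var.vcn_compartment instead of var.targetCompartment, so the network can live in a different compartment than the cluster's compute resources. A sketch of exercising that, assuming vcn_compartment is declared as a plain string input of this module; the OCID is a placeholder:

```
# Plan with the VCN compartment overridden
terraform plan -var "vcn_compartment=ocid1.compartment.oc1..exampleuniqueID"
```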

bin/controller.sh (+8 -3)

@@ -36,17 +36,22 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then
     sudo yum-config-manager --save --setopt=ol7_oci_included.skip_if_unavailable=true
     sudo yum makecache --enablerepo=$repo
     sudo yum install --enablerepo=$repo -y ansible python-netaddr
+    sudo yum install -y https://archive.releases.hashicorp.com/RHEL/7.9/x86_64/stable/terraform-1.6.6-1.x86_64.rpm
   elif [ $vid == 8 ] ; then
     sudo yum makecache --enablerepo=$repo
     sudo yum install --enablerepo=$repo -y python38.x86_64
     sudo python3.8 -m pip install ansible cryptography netaddr > /dev/null
     sudo mkdir /etc/ansible
     sudo ln -s /usr/local/bin/ansible-playbook /bin/ansible-playbook
     sudo ln -s /usr/local/bin/ansible /bin/ansible
+    sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo
+    sudo sed -i 's/$releasever/'"${vid}"'/g' /etc/yum.repos.d/hashicorp.repo
+    sudo yum install -y terraform
   fi
-  sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo
-  sudo sed -i 's/$releasever/'"${vid}"'/g' /etc/yum.repos.d/hashicorp.repo
-  sudo yum install -y terraform
+  # sudo yum-config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo
+  # sudo sed -i 's/$releasever/'"${vid}"'/g' /etc/yum.repos.d/hashicorp.repo
+  # wget https://archive.releases.hashicorp.com/RHEL/7.9/x86_64/stable/terraform-1.6.6-1.x86_64.rpm
+  # sudo yum install -y terraform
   sudo python3 -m pip install -U pip > /dev/null
   sudo python3 -m pip install netaddr --upgrade > /dev/null
   sudo python3 -m pip install setuptools_rust --upgrade > /dev/null
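The net effect is that Terraform is now installed per OS branch: Oracle Linux 7 gets a pinned 1.6.6 RPM from the HashiCorp archive, which makes the installed version deterministic across runs, while version 8 keeps the rolling HashiCorp repo. A quick post-install sanity check, assuming the RPM puts terraform on the PATH:

```
# On an OL7 controller, the pinned install should report v1.6.6
terraform version
```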

bin/create_cluster.sh (+2 -0)

@@ -100,6 +100,7 @@ do
   if [ $status -eq 0 ]
   then
     echo "Successfully created $2 in $runtime seconds"
+    sleep 60 # Give the jobs some time to start
     rm currently_building
     if [ -f $monitoring_folder/activated ]
     then
@@ -161,6 +162,7 @@ do
     mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; INSERT INTO cluster_log.errors_timeserie (cluster_id,state,error_log,error_type,nodes,created_on_m,class_name) VALUES ('$2_${date}','creation','$logs_folder/create_$2_${date}.log','$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages','$1','$end_timestamp','$4');" >> $logs_folder/create_$2_${date}.log 2>&1
     mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET state='deleting',creation_error='`tail $logs_folder/create_$2_${date}.log | grep Error`' WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1
   fi
+  sleep 60 # Give the jobs some time to start
   rm currently_building
 fi
done
