diff --git a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/environment.yml b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/environment.yml index 16375d6..763e3b7 100644 --- a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/environment.yml +++ b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/environment.yml @@ -7,13 +7,13 @@ test: IAM_JENKINS_RESTRICTED_SLAVE_ROLE: arn:aws:iam::REDACTED:role/jenkins_restricted_slave_role SECRETS_MANAGER_ARN: arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED JENKINS_PRIV_TUNNEL: jenkins-priv.mxnet-ci-dev.amazon-ml.com:48593 - LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-06a15945813ad44f2","version":"14"},"restricted-mxnetlinux-cpu":{"id":"lt-0dc74292f7d647ac6","version":"9"},"mxnetlinux-gpu":{"id":"lt-0c22f238c0edb58ab","version":"19"},"mxnetlinux-gpu-p3":{"id":"lt-00c83ee5d7aeaf4ab","version":"12"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0f893d7f3f2660c1c","version":"3"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-0277305ae5f49782b","version":"6"},"mxnetwindows-cpu":{"id":"lt-09dff2fff6b5586f0","version":"11"},"mxnetwindows-gpu":{"id":"lt-0ce229129d0d3be27","version":"15"},"utility":{"id":"lt-028ee0bc3cef79942","version":"3"},"restricted-utility":{"id":"lt-05d66be1f50c9b3fc","version":"1"}, "restricted-mxnetlinux-gpu":{"id":"lt-0c246487c1570d396","version":"5"}}' - EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":3,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-p3":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30, "restricted-mxnetlinux-gpu": 1}' - WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"restricted-mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"utility":1,"restricted-utility":1, "restricted-mxnetlinux-gpu": 0}' - MINIMUM_QUEUE_TIMES_SEC: 
'{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3, "restricted-mxnetlinux-gpu": 30}' - CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE", "restricted-mxnetlinux-gpu": "NONE"}' - MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300, "restricted-mxnetlinux-gpu":300}' - MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "restricted-mxnetlinux-gpu", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]' + LAUNCH_TEMPLATES: 
'{"mxnetlinux-cpu":{"id":"lt-06a15945813ad44f2","version":"14"},"restricted-mxnetlinux-cpu":{"id":"lt-0dc74292f7d647ac6","version":"9"},"mxnetlinux-gpu":{"id":"lt-0c22f238c0edb58ab","version":"19"},"mxnetlinux-gpu-g4":{"id":"lt-0f830794cba5041e2","version":"1"},"mxnetlinux-gpu-p3":{"id":"lt-00c83ee5d7aeaf4ab","version":"12"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0f893d7f3f2660c1c","version":"3"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-0277305ae5f49782b","version":"6"},"mxnetwindows-cpu":{"id":"lt-09dff2fff6b5586f0","version":"11"},"mxnetwindows-gpu":{"id":"lt-0ce229129d0d3be27","version":"15"},"utility":{"id":"lt-028ee0bc3cef79942","version":"3"},"restricted-utility":{"id":"lt-05d66be1f50c9b3fc","version":"1"}, "restricted-mxnetlinux-gpu":{"id":"lt-0c246487c1570d396","version":"5"}}' + EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":2,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-g4":1,"mxnetlinux-gpu-p3":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30, "restricted-mxnetlinux-gpu": 1}' + WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-g4":0,"mxnetlinux-gpu-p3":0,"restricted-mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"utility":1,"restricted-utility":1, "restricted-mxnetlinux-gpu": 0}' + MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-g4":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3, "restricted-mxnetlinux-gpu": 30}' + CCACHE_EFS_DNS: 
'{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-g4":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE", "restricted-mxnetlinux-gpu": "NONE"}' + MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-g4":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300, "restricted-mxnetlinux-gpu":300}' + MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "mxnetlinux-gpu-g4", "restricted-mxnetlinux-gpu", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]' IGNORED_JENKINS_NODE_LABELS: '["mxnetlinux","mxnetwindows","master"]' IGNORED_JENKINS_NODE_NAMES: '["master"]' LOGGING_LEVEL: DEBUG @@ -31,13 +31,13 @@ prod: IAM_JENKINS_RESTRICTED_SLAVE_ROLE: arn:aws:iam::REDACTED:role/jenkins_restricted_slave_role SECRETS_MANAGER_ARN: arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED JENKINS_PRIV_TUNNEL: jenkins-priv.mxnet-ci.amazon-ml.com:48593 - LAUNCH_TEMPLATES: 
'{"mxnetlinux-cpu":{"id":"lt-059ca0af3b73fdd43","version":"6"},"restricted-mxnetlinux-cpu":{"id":"lt-0752e01a2f18939a4","version":"4"},"mxnetlinux-gpu":{"id":"lt-083414b180618edd0","version":"8"},"mxnetlinux-gpu-p3":{"id":"lt-09e887362d145072b","version":"8"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-03458db28b362cf92","version":"5"},"mxnetwindows-cpu":{"id":"lt-02d3dfef15faf1298","version":"6"},"mxnetwindows-gpu":{"id":"lt-0eb15ef80c9a69ef1","version":"6"},"utility":{"id":"lt-0b64c7b2c90e53235","version":"1"},"restricted-utility":{"id":"lt-0b8c0a9d4ee3ea089","version":"1"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0a98711a162486aa5","version":"1"}}' - EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":3,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"restricted-mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"restricted-mxnetlinux-gpu-p3":1,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30}' - WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"restricted-mxnetlinux-gpu-p3":0,"utility":1,"restricted-utility":1}' - MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3}' - CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE"}' - MAXIMUM_STARTUP_TIME_SEC: 
'{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300}' - MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu","mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]' + LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-059ca0af3b73fdd43","version":"15"},"restricted-mxnetlinux-cpu":{"id":"lt-0752e01a2f18939a4","version":"10"},"mxnetlinux-gpu":{"id":"lt-083414b180618edd0","version":"14"},"mxnetlinux-gpu-g4":{"id":"lt-0ebf575cc5a56ebf4","version":"1"},"restricted-mxnetlinux-gpu":{"id":"lt-091e6f84d25af91d2","version":"5"},"mxnetlinux-gpu-p3":{"id":"lt-09e887362d145072b","version":"12"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-03458db28b362cf92","version":"5"},"mxnetwindows-cpu":{"id":"lt-02d3dfef15faf1298","version":"22"},"mxnetwindows-gpu":{"id":"lt-0eb15ef80c9a69ef1","version":"30"},"utility":{"id":"lt-0b64c7b2c90e53235","version":"1"},"restricted-utility":{"id":"lt-0b8c0a9d4ee3ea089","version":"7"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0a98711a162486aa5","version":"5"}}' + EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":2,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-g4":1,"restricted-mxnetlinux-gpu":1,"mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30}' + WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-g4":0,"restricted-mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"restricted-mxnetlinux-gpu-p3":0,"utility":1,"restricted-utility":1}' + MINIMUM_QUEUE_TIMES_SEC: 
'{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-g4":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu":30,"mxnetlinux-gpu-p3-8xlarge":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3}' + CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-g4":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE"}' + MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-g4":300,"restricted-mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300}' + MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "mxnetlinux-gpu-g4", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]' IGNORED_JENKINS_NODE_LABELS: '["mxnetlinux","mxnetwindows","master"]' IGNORED_JENKINS_NODE_NAMES: '["master"]' LOGGING_LEVEL: DEBUG diff --git a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py index d48ed14..0201d0f 100755 --- a/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py +++ b/services/jenkins-autoscaling/lambda_mxnet_ci/autoscaling/handler.py @@ -434,7 +434,13 @@ def _unconnected_instances(nodes: list, instance_uptime: Dict[str, int], ec2_res dict_starting_nodes[label].append(tags['Name']) else: # pragma: no 
cover logging.error("Managed slave instance %s does not have tag label", instance.id) - + elif not target_node: + logging.error("Found orphaned / zombie instance: '%s'", instance.id) + if 'label' in tags: + label = tags['label'] + dict_starting_nodes[label].append(tags['Name']) + else: # pragma: no cover + logging.error("Managed slave instance %s does not have tag label", instance.id) return dict_starting_nodes @@ -611,7 +617,7 @@ def _instance_uptime(ec2_resource) -> Dict[str, int]: instances = list(ec2_resource.instances.filter( Filters=[ {'Name': 'tag:AutoScaledSlave', 'Values': ['True']} # Ensure only listing instances managed by auto scaling - , {'Name': 'instance-state-name', 'Values': ['starting', 'running']} + , {'Name': 'instance-state-name', 'Values': ['pending', 'running']} ] )) @@ -828,6 +834,7 @@ def format_linux(label, target_instance_name): linux_types = ['mxnetlinux-cpu', 'restricted-mxnetlinux-cpu', 'mxnetlinux-gpu', + 'mxnetlinux-gpu-g4', 'mxnetlinux-gpu-p3', 'restricted-mxnetlinux-gpu-p3', 'restricted-mxnetlinux-gpu', @@ -1072,7 +1079,7 @@ def _get_jenkins_handle() -> jenkinsapi.jenkins.Jenkins: # pragma: no cover except HTTPError as e: logging.exception('Error initializing Jenkins API.') if e.response.status_code == 500: - logging.error('Did you properly set up the API token? https://REDACTEDI/MXBLN-376') + logging.error('Did you properly set up the API token? 
https://REDACTED/MXBLN-376') logging.error('HTML response - use an HTML beautifier to view it properly: %s', e.response.content) raise Exception('Error initializing Jenkins API', e) @@ -1261,6 +1268,15 @@ def _get_slave_configuration(): 'tunnel': _get_jenkins_private_tunnel_address(), 'job_name_restriction_regex': '^(?!restricted-).+' # Run only unrestricted jobs }, + 'mxnetlinux-gpu-g4': { + 'num_executors': _get_nb_executors_per_label()['mxnetlinux-gpu-g4'], # Number of executors + 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 18.04 on a g4dn.4xlarge', + 'remote_fs': '/home/jenkins_slave', # Remote workspace location + 'labels': 'mxnetlinux-gpu-g4', # Space separated labels string + 'exclusive': True, # Only run jobs assigned to it + 'tunnel': _get_jenkins_private_tunnel_address(), + 'job_name_restriction_regex': '^(?!restricted-).+' # Run only unrestricted jobs + }, 'restricted-mxnetlinux-gpu': { 'num_executors': _get_nb_executors_per_label()['restricted-mxnetlinux-gpu'], # Number of executors 'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 16.04 on a g3.8xlarge', diff --git a/tools/jenkins-slave-creation-unix/README.md b/tools/jenkins-slave-creation-unix/README.md index 6b11f58..95ba30a 100644 --- a/tools/jenkins-slave-creation-unix/README.md +++ b/tools/jenkins-slave-creation-unix/README.md @@ -15,4 +15,66 @@ -This Terraform setup will spawn an instance that is ready to be saved into an AMI to create a Jenkins slave. \ No newline at end of file +This Terraform setup will spawn an instance that is ready to be saved into an AMI to create a Jenkins slave. + +# Steps +## Setup Terraform +### Fetch Terraform and unzip the binary + +``` +wget https://releases.hashicorp.com/terraform/0.12.24/terraform_0.12.24_linux_amd64.zip +sudo apt install unzip +unzip terraform_0.12.24_linux_amd64.zip +``` + +### Add to path +Add the binary to the environment variable 'PATH'. 
+For example + +``` +sudo mv terraform /usr/local/bin/ +mkdir /home/ubuntu/bin +mv /usr/local/bin/terraform /home/ubuntu/bin/terraform +``` + +### Verify +Check whether the terraform binary is in the PATH variable + +``` +echo $PATH +``` + +Verify terraform is properly installed + +``` +$ terraform --version +Terraform v0.12.24 +$ which terraform +/home/ubuntu/bin/terraform +``` + +## Python package requirements +Install the terraform python package + +``` +pip3 install python_terraform +``` + +## Fill the redacted information +- infrastructure.tf [Security groups] +- infrastructure.tfvars [`key_name`, `key_path`, `secret_manager_docker_hub_arn`] +- `~/.aws/config` [Isengard account profile] + +## Run the AMI creation script + +``` +./create_slave.sh +``` + +- Enter the desired directory + +## Create an AMI +- Log in to the AWS Console +- An instance will be created with the name used in `infrastructure.tfvars.instance_name` +- Wait until the instance's state is "Stopped". [Note: Don't manually stop the instance. Manually stopping the instance can cause the AMI to get corrupted. If the state never changes to "Stopped", there was likely an issue during AMI generation. Please refer to /var/log/cloud-init-output.log for further debugging] +- Once the instance is stopped, select Instance -> Actions -> Image -> Create Image \ No newline at end of file diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/shell-variables.sh b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/shell-variables.sh deleted file mode 100644 index bda8818..0000000 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/shell-variables.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -export S3_CONFIG_BUCKET="mxnet-ci-slave-dev" -export S3_CONFIG_FILE="ubuntu-gpu-g3-config.tar.bz2" diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure.tfvars b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure.tfvars deleted file mode 100644 index edbe6b7..0000000 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure.tfvars +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -key_name = "REDACTED" -key_path = "~/.ssh/REDACTED" -instance_type = "p3.2xlarge" - -s3_config_bucket = "mxnet-ci-slave-dev" -s3_config_filename = "ubuntu-gpu-p3-config.tar.bz2" -slave_install_script = "conf-ubuntu-gpu-p3/install.sh" -shell_variables_file = "conf-ubuntu-gpu-p3/shell-variables.sh" -ami = "ami-bd8f33c5" # ftp://64.50.236.216/pub/ubuntu-cloud-images/query/xenial/server/released.txt -instance_name = "Slave-base_Ubuntu-GPU-P3" -aws_region = "us-west-2" -secret_manager_docker_hub_arn = "arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED" diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure_backend.tfvars b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure_backend.tfvars deleted file mode 100644 index 3e07798..0000000 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/infrastructure_backend.tfvars +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Terraform backend configuration. See See https://www.terraform.io/docs/backends/config.html for -# more details. 
Reason being that config can not be interpolated during backend init - -bucket = "mxnet-ci-slave-dev-tfstate" -region = "us-west-2" diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/install.sh b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/install.sh deleted file mode 100644 index 2184b6e..0000000 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-p3/install.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set -e -set -x - -# Load variables -source /var/lib/cloud/instance/scripts/part-001 - -#Setup jenkins user -sudo useradd jenkins_slave - -#Prevent log in -sudo usermod -L jenkins_slave -sudo mkdir -p /home/jenkins_slave/remoting -sudo chown -R jenkins_slave:jenkins_slave /home/jenkins_slave - -#Remove preinstalled packaged -sudo apt-get -y purge openjdk* -sudo apt-get -y purge nvidia* -echo "Purged packages" - -#Install htop -sudo apt-get update -sudo apt-get -y install htop - -#Install java -sudo apt-get -y install openjdk-8-jre - -#Install git -sudo apt-get -y install git -sudo -H -S -u jenkins_slave git config --global user.email "mxnet-ci" -sudo -H -S -u jenkins_slave git config --global user.name "mxnet-ci" - -#Install python3, pip3 and dependencies for auto-connect.py -sudo apt-get -y install python3 python3-pip -sudo pip3 install boto3 python-jenkins joblib docker - -echo "Installed htop, java, git and python" - -#Install nvidia drivers -sudo apt-get -y install nvidia-418 - -# TODO: - Disabled nvidia updates @ /etc/apt/apt.conf.d/50unattended-upgrades -#Unattended-Upgrade::Package-Blacklist { -#"nvidia-384"; -#"nvidia-opencl-icd-384"; - -echo "Installed nvidia drivers" - -#Install docker engine -sudo apt-get install -y apt-transport-https ca-certificates curl software-properties-common -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - -sudo apt-key fingerprint 0EBFCD88 -sudo add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable" -sudo apt-get update -sudo apt-get install -y docker-ce -sudo usermod -aG docker jenkins_slave -sudo systemctl enable docker #Enable docker to start on startup -sudo service docker restart -echo "Installed docker engine" - -# Add nvidia-docker and nvidia-docker-plugin -curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ - sudo apt-key add - -distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \ - sudo tee /etc/apt/sources.list.d/nvidia-docker.list -sudo apt-get update - -# Install nvidia-docker2 and reload the Docker daemon configuration -sudo apt-get install -y nvidia-docker2 -sudo pkill -SIGHUP dockerd - -# Download additional scripts -sudo apt-get -y install awscli -tmpdir=$(mktemp -d) -aws s3 cp --quiet s3://$S3_CONFIG_BUCKET/$S3_CONFIG_FILE $tmpdir/scripts.tar.bz2 - -mkdir /home/jenkins_slave/scripts -tar -C /home/jenkins_slave/scripts -xjf $tmpdir/scripts.tar.bz2 -find /home/jenkins_slave/scripts -type f -exec chmod 744 {} \; -chown -R jenkins_slave.jenkins_slave /home/jenkins_slave - -# Set up swap of 50GB -fallocate -l 50G /swapfile -chown root:root /swapfile -chmod 0600 /swapfile -mkswap /swapfile -swapon /swapfile -echo /swapfile none swap sw 0 0 >> /etc/fstab - -# Add auto-connecting to slave to startup -touch /home/jenkins_slave/auto-connect.log -chown -R jenkins_slave:jenkins_slave /home/jenkins_slave/ -echo "@reboot jenkins_slave /home/jenkins_slave/scripts/launch-autoconnect.sh" > /etc/cron.d/jenkins-start-slave - -# Write instructions to home dir -readme="Please use the following command in your cloud-init-script to specify the jenkins master -address: -#!/bin/bash -echo 'http://jenkins.mxnet-ci.amazon-ml.com/' > /home/jenkins_slave/jenkins_master_url -echo 'http://jenkins-priv.mxnet-ci.amazon-ml.com/' > /home/jenkins_slave/jenkins_master_private_url - - -Optional: -echo 'mxnet-linux-cpu10' > /home/jenkins_slave/jenkins_slave_name -" -echo "$readme" > /home/ubuntu/readme.txt - -echo "Setup completed" - -# For testing use reboot, but lateron just turn off the instance to prepare AMI generation -# reboot -shutdown -h now diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/infrastructure.tfvars b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/infrastructure.tfvars similarity index 70% rename from 
tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/infrastructure.tfvars rename to tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/infrastructure.tfvars index e45e58c..9cadfd4 100644 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/infrastructure.tfvars +++ b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/infrastructure.tfvars @@ -17,13 +17,14 @@ key_name = "REDACTED" key_path = "~/.ssh/REDACTED" -instance_type = "g3.8xlarge" +instance_type = "g4dn.4xlarge" s3_config_bucket = "mxnet-ci-slave-dev" -s3_config_filename = "ubuntu-gpu-g3-config.tar.bz2" -slave_install_script = "conf-ubuntu-gpu-g3/install.sh" -shell_variables_file = "conf-ubuntu-gpu-g3/shell-variables.sh" -ami = "ami-bd8f33c5" # ftp://64.50.236.216/pub/ubuntu-cloud-images/query/xenial/server/released.txt -instance_name = "Slave-base_Ubuntu-GPU-G3" +s3_config_filename = "ubuntu-gpu-config.tar.bz2" +slave_install_script = "conf-ubuntu-gpu/install.sh" +shell_variables_file = "conf-ubuntu-gpu/shell-variables.sh" +# Base AMI, defines the OS of the slave instance [here: Ubuntu18.04 base image] +ami = "ami-0d1cd67c26f5fca19" # Ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20200112 +instance_name = "Slave-base_Ubuntu-GPU" aws_region = "us-west-2" secret_manager_docker_hub_arn = "arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED" diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/infrastructure_backend.tfvars b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/infrastructure_backend.tfvars similarity index 100% rename from tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/infrastructure_backend.tfvars rename to tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/infrastructure_backend.tfvars diff --git a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/install.sh b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/install.sh similarity index 69% rename from tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/install.sh rename to 
tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/install.sh index 60fc64a..7ed06b7 100644 --- a/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu-g3/install.sh +++ b/tools/jenkins-slave-creation-unix/conf-ubuntu-gpu/install.sh @@ -57,7 +57,14 @@ sudo pip3 install boto3 python-jenkins joblib docker echo "Installed htop, java, git and python" #Install nvidia drivers -sudo apt-get -y install nvidia-418 +#Chose the latest nvidia driver supported on Tesla driver for Ubuntu18.04 +#Refer : https://www.nvidia.com/Download/driverResults.aspx/158191/en-us +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin +sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 +sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub +sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" +sudo apt-get update +sudo apt-get -y install cuda-drivers # TODO: - Disabled nvidia updates @ /etc/apt/apt.conf.d/50unattended-upgrades #Unattended-Upgrade::Package-Blacklist { @@ -79,7 +86,12 @@ sudo apt-get install -y docker-ce sudo usermod -aG docker jenkins_slave sudo systemctl enable docker #Enable docker to start on startup sudo service docker restart -echo "Installed docker engine" +# Get latest docker-compose; Ubuntu 18.04 has latest docker in bionic-updates, but not docker-compose and rather ships v1.17 from 2017 +# See https://github.com/docker/compose/releases for latest release +# /usr/local/bin is not on the PATH in Jenkins, thus place binary in /usr/bin +sudo curl -L "https://github.com/docker/compose/releases/download/1.25.5/docker-compose-$(uname -s)-$(uname -m)" -o /usr/bin/docker-compose +sudo chmod +x /usr/bin/docker-compose +echo "Installed docker engine and docker-compose" # Add nvidia-docker and nvidia-docker-plugin curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ @@ -89,9 +101,22 @@ curl -s 
-L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.li sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update -# Install nvidia-docker2 and reload the Docker daemon configuration -sudo apt-get install -y nvidia-docker2 -sudo pkill -SIGHUP dockerd +# Install nvidia docker related packages and reload the Docker daemon configuration +# Install nvidia-container toolkit and reload the Docker daemon configuration +# Refer Nvidia Docker : https://github.com/NVIDIA/nvidia-docker +sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker + +# Install & add nvidia container runtime to the Docker daemon +# Refer https://github.com/nvidia/nvidia-container-runtime#docker-engine-setup +sudo apt-get install -y nvidia-container-runtime +sudo mkdir -p /etc/systemd/system/docker.service.d +sudo tee /etc/systemd/system/docker.service.d/override.conf <