Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ test:
IAM_JENKINS_RESTRICTED_SLAVE_ROLE: arn:aws:iam::REDACTED:role/jenkins_restricted_slave_role
SECRETS_MANAGER_ARN: arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED
JENKINS_PRIV_TUNNEL: jenkins-priv.mxnet-ci-dev.amazon-ml.com:48593
LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-06a15945813ad44f2","version":"14"},"restricted-mxnetlinux-cpu":{"id":"lt-0dc74292f7d647ac6","version":"9"},"mxnetlinux-gpu":{"id":"lt-0c22f238c0edb58ab","version":"19"},"mxnetlinux-gpu-p3":{"id":"lt-00c83ee5d7aeaf4ab","version":"12"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0f893d7f3f2660c1c","version":"3"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-0277305ae5f49782b","version":"6"},"mxnetwindows-cpu":{"id":"lt-09dff2fff6b5586f0","version":"11"},"mxnetwindows-gpu":{"id":"lt-0ce229129d0d3be27","version":"15"},"utility":{"id":"lt-028ee0bc3cef79942","version":"3"},"restricted-utility":{"id":"lt-05d66be1f50c9b3fc","version":"1"}, "restricted-mxnetlinux-gpu":{"id":"lt-0c246487c1570d396","version":"5"}}'
EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":3,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-p3":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30, "restricted-mxnetlinux-gpu": 1}'
WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"restricted-mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"utility":1,"restricted-utility":1, "restricted-mxnetlinux-gpu": 0}'
MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3, "restricted-mxnetlinux-gpu": 30}'
CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE", "restricted-mxnetlinux-gpu": "NONE"}'
MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300, "restricted-mxnetlinux-gpu":300}'
MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "restricted-mxnetlinux-gpu", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]'
LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-06a15945813ad44f2","version":"14"},"restricted-mxnetlinux-cpu":{"id":"lt-0dc74292f7d647ac6","version":"9"},"mxnetlinux-gpu":{"id":"lt-0c22f238c0edb58ab","version":"19"},"mxnetlinux-gpu-g4":{"id":"lt-0f830794cba5041e2","version":"1"},"mxnetlinux-gpu-p3":{"id":"lt-00c83ee5d7aeaf4ab","version":"12"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0f893d7f3f2660c1c","version":"3"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-0277305ae5f49782b","version":"6"},"mxnetwindows-cpu":{"id":"lt-09dff2fff6b5586f0","version":"11"},"mxnetwindows-gpu":{"id":"lt-0ce229129d0d3be27","version":"15"},"utility":{"id":"lt-028ee0bc3cef79942","version":"3"},"restricted-utility":{"id":"lt-05d66be1f50c9b3fc","version":"1"}, "restricted-mxnetlinux-gpu":{"id":"lt-0c246487c1570d396","version":"5"}}'
EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":2,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-g4":1,"mxnetlinux-gpu-p3":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30, "restricted-mxnetlinux-gpu": 1}'
WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-g4":0,"mxnetlinux-gpu-p3":0,"restricted-mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"utility":1,"restricted-utility":1, "restricted-mxnetlinux-gpu": 0}'
MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-g4":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3, "restricted-mxnetlinux-gpu": 30}'
CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-g4":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE", "restricted-mxnetlinux-gpu": "NONE"}'
MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-g4":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300, "restricted-mxnetlinux-gpu":300}'
MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "mxnetlinux-gpu-g4", "restricted-mxnetlinux-gpu", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]'
IGNORED_JENKINS_NODE_LABELS: '["mxnetlinux","mxnetwindows","master"]'
IGNORED_JENKINS_NODE_NAMES: '["master"]'
LOGGING_LEVEL: DEBUG
Expand All @@ -31,13 +31,13 @@ prod:
IAM_JENKINS_RESTRICTED_SLAVE_ROLE: arn:aws:iam::REDACTED:role/jenkins_restricted_slave_role
SECRETS_MANAGER_ARN: arn:aws:secretsmanager:us-west-2:REDACTED:secret:REDACTED
JENKINS_PRIV_TUNNEL: jenkins-priv.mxnet-ci.amazon-ml.com:48593
LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-059ca0af3b73fdd43","version":"6"},"restricted-mxnetlinux-cpu":{"id":"lt-0752e01a2f18939a4","version":"4"},"mxnetlinux-gpu":{"id":"lt-083414b180618edd0","version":"8"},"mxnetlinux-gpu-p3":{"id":"lt-09e887362d145072b","version":"8"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-03458db28b362cf92","version":"5"},"mxnetwindows-cpu":{"id":"lt-02d3dfef15faf1298","version":"6"},"mxnetwindows-gpu":{"id":"lt-0eb15ef80c9a69ef1","version":"6"},"utility":{"id":"lt-0b64c7b2c90e53235","version":"1"},"restricted-utility":{"id":"lt-0b8c0a9d4ee3ea089","version":"1"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0a98711a162486aa5","version":"1"}}'
EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":3,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"restricted-mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":4,"restricted-mxnetlinux-gpu-p3":1,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30}'
WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"restricted-mxnetlinux-gpu-p3":0,"utility":1,"restricted-utility":1}'
MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-p3":30,"mxnetlinux-gpu-p3-8xlarge":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3}'
CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE"}'
MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300}'
MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu","mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]'
LAUNCH_TEMPLATES: '{"mxnetlinux-cpu":{"id":"lt-059ca0af3b73fdd43","version":"15"},"restricted-mxnetlinux-cpu":{"id":"lt-0752e01a2f18939a4","version":"10"},"mxnetlinux-gpu":{"id":"lt-083414b180618edd0","version":"14"},"mxnetlinux-gpu-g4":{"id":"lt-0ebf575cc5a56ebf4","version":"1"},"restricted-mxnetlinux-gpu":{"id":"lt-091e6f84d25af91d2","version":"5"},"mxnetlinux-gpu-p3":{"id":"lt-09e887362d145072b","version":"12"},"mxnetlinux-gpu-p3-8xlarge":{"id":"lt-03458db28b362cf92","version":"5"},"mxnetwindows-cpu":{"id":"lt-02d3dfef15faf1298","version":"22"},"mxnetwindows-gpu":{"id":"lt-0eb15ef80c9a69ef1","version":"30"},"utility":{"id":"lt-0b64c7b2c90e53235","version":"1"},"restricted-utility":{"id":"lt-0b8c0a9d4ee3ea089","version":"7"},"restricted-mxnetlinux-gpu-p3":{"id":"lt-0a98711a162486aa5","version":"5"}}'
EXECUTORS_PER_LABEL: '{"mxnetlinux-cpu":2,"restricted-mxnetlinux-cpu":3,"mxnetlinux-gpu":1,"mxnetlinux-gpu-g4":1,"restricted-mxnetlinux-gpu":1,"mxnetlinux-gpu-p3":1,"mxnetlinux-gpu-p3-8xlarge":1,"mxnetwindows-cpu":1,"restricted-mxnetlinux-gpu-p3":1,"mxnetwindows-gpu":1,"utility":30,"restricted-utility":30}'
WARM_POOL_SIZE: '{"mxnetlinux-cpu":1,"restricted-mxnetlinux-cpu":0,"mxnetlinux-gpu":0,"mxnetlinux-gpu-g4":0,"restricted-mxnetlinux-gpu":0,"mxnetlinux-gpu-p3":0,"mxnetlinux-gpu-p3-8xlarge":0,"mxnetwindows-cpu":1,"mxnetwindows-gpu":0,"restricted-mxnetlinux-gpu-p3":0,"utility":1,"restricted-utility":1}'
MINIMUM_QUEUE_TIMES_SEC: '{"mxnetlinux-cpu":30,"restricted-mxnetlinux-cpu":30,"mxnetlinux-gpu":30,"mxnetlinux-gpu-g4":30,"mxnetlinux-gpu-p3":30,"restricted-mxnetlinux-gpu":30,"mxnetlinux-gpu-p3-8xlarge":30,"restricted-mxnetlinux-gpu-p3":30,"mxnetwindows-cpu":30,"mxnetwindows-gpu":30,"utility":3,"restricted-utility":3}'
CCACHE_EFS_DNS: '{"mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","restricted-mxnetlinux-cpu":"fs-REDACTED.efs.us-west-2.amazonaws.com","mxnetlinux-gpu":"NONE","mxnetlinux-gpu-g4":"NONE","mxnetlinux-gpu-p3":"NONE","restricted-mxnetlinux-gpu":"NONE","restricted-mxnetlinux-gpu-p3":"NONE","mxnetlinux-gpu-p3-8xlarge":"NONE","mxnetwindows-cpu":"NONE","mxnetwindows-gpu":"NONE","utility":"NONE","restricted-utility":"NONE"}'
MAXIMUM_STARTUP_TIME_SEC: '{"mxnetlinux-cpu":300,"restricted-mxnetlinux-cpu":300,"mxnetlinux-gpu":300,"mxnetlinux-gpu-g4":300,"restricted-mxnetlinux-gpu":300,"mxnetlinux-gpu-p3":300,"restricted-mxnetlinux-gpu-p3":300,"mxnetlinux-gpu-p3-8xlarge":300,"mxnetwindows-cpu":1800,"mxnetwindows-gpu":1800,"utility":300,"restricted-utility":300}'
MANAGED_JENKINS_NODE_LABELS: '["mxnetlinux-cpu","restricted-mxnetlinux-cpu","mxnetlinux-gpu", "mxnetlinux-gpu-g4", "mxnetwindows-cpu","mxnetwindows-gpu","mxnetlinux-gpu-p3","restricted-mxnetlinux-gpu","restricted-mxnetlinux-gpu-p3","mxnetlinux-gpu-p3-8xlarge","utility","restricted-utility"]'
IGNORED_JENKINS_NODE_LABELS: '["mxnetlinux","mxnetwindows","master"]'
IGNORED_JENKINS_NODE_NAMES: '["master"]'
LOGGING_LEVEL: DEBUG
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,13 @@ def _unconnected_instances(nodes: list, instance_uptime: Dict[str, int], ec2_res
dict_starting_nodes[label].append(tags['Name'])
else: # pragma: no cover
logging.error("Managed slave instance %s does not have tag label", instance.id)

elif not target_node:
logging.error("Found orphaned / zombie instance: '%s'", instance.id)
if 'label' in tags:
label = tags['label']
dict_starting_nodes[label].append(tags['Name'])
else: # pragma: no cover
logging.error("Managed slave instance %s does not have tag label", instance.id)
return dict_starting_nodes


Expand Down Expand Up @@ -611,7 +617,7 @@ def _instance_uptime(ec2_resource) -> Dict[str, int]:
instances = list(ec2_resource.instances.filter(
Filters=[
{'Name': 'tag:AutoScaledSlave', 'Values': ['True']} # Ensure only listing instances managed by auto scaling
, {'Name': 'instance-state-name', 'Values': ['starting', 'running']}
, {'Name': 'instance-state-name', 'Values': ['pending', 'running']}
Comment thread
ChaiBapchya marked this conversation as resolved.
]
))

Expand Down Expand Up @@ -828,6 +834,7 @@ def format_linux(label, target_instance_name):
linux_types = ['mxnetlinux-cpu',
'restricted-mxnetlinux-cpu',
'mxnetlinux-gpu',
'mxnetlinux-gpu-g4',
'mxnetlinux-gpu-p3',
'restricted-mxnetlinux-gpu-p3',
'restricted-mxnetlinux-gpu',
Expand Down Expand Up @@ -1072,7 +1079,7 @@ def _get_jenkins_handle() -> jenkinsapi.jenkins.Jenkins: # pragma: no cover
except HTTPError as e:
logging.exception('Error initializing Jenkins API.')
if e.response.status_code == 500:
logging.error('Did you properly set up the API token? https://REDACTEDI/MXBLN-376')
logging.error('Did you properly set up the API token? https://REDACTED/MXBLN-376')

logging.error('HTML response - use an HTML beautifier to view it properly: %s', e.response.content)
raise Exception('Error initializing Jenkins API', e)
Expand Down Expand Up @@ -1261,6 +1268,15 @@ def _get_slave_configuration():
'tunnel': _get_jenkins_private_tunnel_address(),
'job_name_restriction_regex': '^(?!restricted-).+' # Run only unrestricted jobs
},
'mxnetlinux-gpu-g4': {
'num_executors': _get_nb_executors_per_label()['mxnetlinux-gpu-g4'], # Number of executors
'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 18.04 on a g4dn.4xlarge',
'remote_fs': '/home/jenkins_slave', # Remote workspace location
'labels': 'mxnetlinux-gpu-g4', # Space separated labels string
'exclusive': True, # Only run jobs assigned to it
'tunnel': _get_jenkins_private_tunnel_address(),
'job_name_restriction_regex': '^(?!restricted-).+' # Run only unrestricted jobs
},
'restricted-mxnetlinux-gpu': {
'num_executors': _get_nb_executors_per_label()['restricted-mxnetlinux-gpu'], # Number of executors
'node_description': '[AUTOSCALING] MXNet slave running Ubuntu 16.04 on a g3.8xlarge',
Expand Down
64 changes: 63 additions & 1 deletion tools/jenkins-slave-creation-unix/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,66 @@
<!--- specific language governing permissions and limitations -->
<!--- under the License. -->

This Terraform setup will spawn an instance that is ready to be saved into an AMI to create a Jenkins slave.
This Terraform setup will spawn an instance that is ready to be saved into an AMI to create a Jenkins slave.

# Steps
## Setup Terraform
### Fetch Terraform and unzip the binary

```
wget https://releases.hashicorp.com/terraform/0.12.24/terraform_0.12.24_linux_amd64.zip
sudo apt install unzip
unzip terraform_0.12.24_linux_amd64.zip
```

### Add to path
Move the binary into a directory that is listed in the `PATH` environment variable.
For example:

```
sudo mv terraform /usr/local/bin/
mkdir /home/ubuntu/bin
mv /usr/local/bin/terraform /home/ubuntu/bin/terraform
```

### Verify
Check that the directory containing the terraform binary is listed in the `PATH` variable

```
echo $PATH
```

Verify terraform is properly installed

```
$ terraform --version
Terraform v0.12.24
$ which terraform
/home/ubuntu/bin/terraform
```

## Python package requirements
Install the terraform python package

```
pip3 install python_terraform
```

## Fill in the redacted information
- infrastructure.tf [Security groups]
- infrastructure.tfvars [`key_name`, `key_path`, `secret_manager_docker_hub_arn`]
- `~/.aws/config` [Isengard account profile]

## Run the AMI creation script

```
./create_slave.sh
```

- Enter the desired directory

## Create an AMI
- Log in to the AWS Console
- An instance will be created with the name set as `instance_name` in `infrastructure.tfvars`
- Wait until the instance's state is "Stopped". [Note: Don't stop the instance manually; doing so can corrupt the AMI. If the instance never changes to the stopped state, there was likely an issue during AMI generation. Please refer to /var/log/cloud-init-output.log for further debugging.]
- Once the instance is stopped, select the instance, then choose Actions -> Image -> Create Image

This file was deleted.

This file was deleted.

This file was deleted.

Loading