Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update aws-cdk dependencies #119

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,17 @@ classifiers = [
]
dependencies = [
"autogluon.common >=1, <2",
"aws-cdk-lib >=2.0.0,<3",
"aws-cdk.aws-batch-alpha >=2.0.0a1,<2.70.0a0",
"aws-cdk-lib >=2.0.0,<2.179",
"awscliv2 >=2.2,<2.4",
"boto3 >=1.26.0,<2",
"constructs >=10.0.0,<10.5",
"fsspec >=2023.5.0,<2024.6",
"matplotlib >=3.4,<3.10",
"fsspec >=2023.5.0,<2025.3",
"matplotlib >=3.4,<3.11",
"pandas >=2.0.0,<3",
"pyarrow >=19.0.0,<20",
"pyyaml >=5.4,<7",
"ray[default] >=2.6.3,<3",
"s3fs >=2023.5.0,<2024.6",
"s3fs >=2023.5.0,<2025.3",
"tqdm >=4.64.0,<=5",
"typer >=0.9.0,<1.0.0",
"wheel >0.38.0,<0.46",
Expand Down Expand Up @@ -94,8 +94,8 @@ legacy_tox_ini = """

[testenv:lint]
deps =
black>=23.1.9, <=23.7.0
isort>=5.11.0, <=5.13.2
black>=23.1.9, <=25.1.0
isort>=5.11.0, <=6.0.0
commands =
black --check --diff src/ tests/
isort --check --diff src/ tests/
Expand Down
27 changes: 15 additions & 12 deletions src/autogluon/bench/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,26 @@ FROM $AG_BENCH_BASE_IMAGE
ENV DEBIAN_FRONTEND=noninteractive
ENV RUNNING_IN_DOCKER=true

# Install essential packages and Python 3.9
# Update package list and install necessary dependencies
RUN apt-get update && \
apt-get install -y software-properties-common build-essential && \
apt-get install -y --no-install-recommends \
software-properties-common build-essential curl unzip git pciutils && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y python3.9 python3.9-dev python3.9-distutils python3.9-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1
apt-get install -y --no-install-recommends \
python3.11 python3.11-dev python3.11-distutils python3.11-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
update-alternatives --set python3 /usr/bin/python3.11 && \
rm -rf /var/lib/apt/lists/*

# Install utilities and AWS CLI
RUN apt-get install -y python3-pip unzip curl git pciutils && \
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
# Install pip
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 && \
rm -rf /root/.cache

# Install AWS CLI
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
unzip -q awscliv2.zip && \
./aws/install && \
rm awscliv2.zip && \
python3 -m pip install --upgrade pip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /usr/local/aws
rm -rf awscliv2.zip aws /var/lib/apt/lists/*

# Application-specific steps
ARG AG_BENCH_VERSION
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os

import aws_cdk as core
import aws_cdk as cdk
from aws_cdk import aws_iam as iam
from aws_cdk import aws_lambda as _lambda
from constructs import Construct
Expand Down Expand Up @@ -35,7 +35,7 @@ def __init__(

self._lambda_function_role = iam.Role(
self,
"lambda-function-role",
f"{prefix}-lambda-role",
assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"),
managed_policies=[
iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSLambdaBasicExecutionRole")
Expand All @@ -62,13 +62,13 @@ def __init__(

self._lambda_function = _lambda.Function(
self,
id,
f"{prefix}-lambda",
function_name=function_name,
runtime=_lambda.Runtime.PYTHON_3_8,
environment=environment,
code=_lambda.Code.from_asset(
code_path,
bundling=core.BundlingOptions(
bundling=cdk.BundlingOptions(
image=_lambda.Runtime.PYTHON_3_8.bundling_image,
command=[
"bash",
Expand All @@ -78,6 +78,6 @@ def __init__(
),
),
handler="lambda_function.handler",
timeout=core.Duration.seconds(timeout),
timeout=cdk.Duration.seconds(timeout),
role=self._lambda_function_role,
)

This file was deleted.

127 changes: 70 additions & 57 deletions src/autogluon/bench/cloud/aws/batch_stack/stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,14 @@

import aws_cdk as core
import boto3
from aws_cdk import aws_batch_alpha as batch
from aws_cdk import aws_batch as batch
from aws_cdk import aws_ec2 as ec2
from aws_cdk import aws_ecr_assets as ecr_assets
from aws_cdk import aws_ecs as ecs
from aws_cdk import aws_iam as iam
from aws_cdk import aws_s3 as s3
from constructs import Construct

from autogluon.bench.cloud.aws.batch_stack.constructs.batch_lambda_function import BatchLambdaFunction
from autogluon.bench.cloud.aws.batch_stack.constructs.instance_profile import InstanceProfile

"""
Sample CDK code for creating the required infrastructure for running an AWS Batch job.
Expand Down Expand Up @@ -140,17 +138,13 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
block_device_volume = self.node.try_get_context("BLOCK_DEVICE_VOLUME")
lambda_function_name = self.node.try_get_context("LAMBDA_FUNCTION_NAME") + "-" + prefix

instances = []
for instance in instance_types:
instances.append(ec2.InstanceType(instance))

vpc = static_stack.vpc

if vpc is None:
vpc = ec2.Vpc(
self,
f"{prefix}-vpc",
max_azs=2, # You can increase this number for high availability
max_azs=2, # This number can be increased for high availability
nat_gateways=1,
subnet_configuration=[
ec2.SubnetConfiguration(
Expand Down Expand Up @@ -181,9 +175,10 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
# TODO: use https://github.com/cdklabs/cdk-docker-image-deployment

logger.info(f"Building Dockerfile at {docker_path} with context at {project_root}")
image_name = f"{prefix}-ecr-docker-image-asset"
docker_image_asset = ecr_assets.DockerImageAsset(
self,
f"{prefix}-ecr-docker-image-asset",
image_name,
directory=project_root,
file=docker_path,
follow_symlinks=core.SymlinkFollowMode.ALWAYS,
Expand All @@ -200,41 +195,45 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
},
)

docker_container_image = ecs.ContainerImage.from_docker_image_asset(docker_image_asset)

container = batch.JobDefinitionContainer(
image=docker_container_image,
gpu_count=container_gpu,
vcpus=container_vcpu,
memory_limit_mib=container_memory,
# Known bug: this parameter is not rendered in the CF stack under cdk.out
# https://github.com/aws/aws-cdk/issues/13023
linux_params=ecs.LinuxParameters(self, f"{prefix}-linux_params", shared_memory_size=container_memory),
container_properties = batch.CfnJobDefinition.ContainerPropertiesProperty(
image=docker_image_asset.image_uri,
resource_requirements=[
batch.CfnJobDefinition.ResourceRequirementProperty(type="GPU", value=str(container_gpu)),
batch.CfnJobDefinition.ResourceRequirementProperty(type="VCPU", value=str(container_vcpu)),
batch.CfnJobDefinition.ResourceRequirementProperty(type="MEMORY", value=str(container_memory)),
],
linux_parameters=batch.CfnJobDefinition.LinuxParametersProperty(shared_memory_size=container_memory),
)

job_definition = batch.JobDefinition(
job_definition_name = f"{prefix}-job-definition"
job_definition = batch.CfnJobDefinition(
self,
"job-definition",
container=container,
retry_attempts=3,
timeout=core.Duration.minutes(time_limit),
job_definition_name,
type="container",
container_properties=container_properties,
job_definition_name=job_definition_name,
retry_strategy=batch.CfnJobDefinition.RetryStrategyProperty(attempts=3),
timeout=batch.CfnJobDefinition.TimeoutProperty(attempt_duration_seconds=time_limit),
)

# LaunchTemplate.launch_template_name returns Null https://github.com/aws/aws-cdk/issues/19405
# so we are defining the name here instead of tagging
batch_launch_template_name = f"{prefix}-launch-template"
launch_template = ec2.LaunchTemplate(
launch_template = ec2.CfnLaunchTemplate(
self,
f"{prefix}-launch-template",
batch_launch_template_name,
launch_template_name=batch_launch_template_name,
block_devices=[
ec2.BlockDevice(
device_name="/dev/xvda",
volume=ec2.BlockDeviceVolume.ebs(block_device_volume),
)
],
http_tokens=ec2.LaunchTemplateHttpTokens.REQUIRED,
require_imdsv2=True,
launch_template_data={
"blockDeviceMappings": [
{
"deviceName": "/dev/xvda",
"ebs": {
"volumeSize": block_device_volume, # Ensure block_device_volume is defined
"volumeType": "gp3",
"deleteOnTermination": True,
},
}
],
"metadataOptions": {"httpTokens": "required", "httpEndpoint": "enabled"},
},
)

cloudwatch_policy = iam.Policy(
Expand All @@ -249,6 +248,14 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
)
],
)

batch_service_role = iam.Role(
self,
f"{prefix}-batch-service-role",
assumed_by=iam.ServicePrincipal("batch.amazonaws.com"),
managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AWSBatchServiceRole")],
)

batch_instance_role = iam.Role(
self,
f"{prefix}-instance-role",
Expand All @@ -269,34 +276,40 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
data_bucket.grant_read(batch_instance_role)
metrics_bucket.grant_read_write(batch_instance_role)

batch_instance_profile = InstanceProfile(self, f"{prefix}-instance-profile", prefix=prefix)
batch_instance_profile.attach_role(batch_instance_role)
batch_instance_profile = iam.CfnInstanceProfile(
self, f"{prefix}-instance-profile", roles=[batch_instance_role.role_name]
)

compute_environment = batch.ComputeEnvironment(
compute_environment = batch.CfnComputeEnvironment(
self,
f"{prefix}-compute-environment",
compute_resources=batch.ComputeResources(
allocation_strategy=batch.AllocationStrategy.BEST_FIT_PROGRESSIVE,
vpc=vpc,
vpc_subnets=ec2.SubnetSelection(subnets=vpc.private_subnets),
# vpc_subnets=ec2.SubnetSelection(subnets=vpc.public_subnets), # use public subnet for ssh
type="MANAGED",
service_role=batch_service_role.role_arn,
compute_resources=batch.CfnComputeEnvironment.ComputeResourcesProperty(
type="EC2",
maxv_cpus=compute_env_maxv_cpus,
instance_role=batch_instance_profile.profile_arn,
instance_types=instances,
security_groups=[sg],
type=batch.ComputeResourceType.ON_DEMAND,
minv_cpus=0,
subnets=[subnet.subnet_id for subnet in vpc.private_subnets],
# subnets=[subnet.subnet_id for subnet in vpc.public_subnets], # use public subnet for ssh
# ec2_key_pair=f"{prefix}-perm-key", # set this if you need ssh into instance
launch_template=batch.LaunchTemplateSpecification(
launch_template_name=batch_launch_template_name # LaunchTemplate.launch_template_name returns None
allocation_strategy="BEST_FIT_PROGRESSIVE",
instance_role=batch_instance_profile.attr_arn,
instance_types=instance_types,
security_group_ids=[sg.security_group_id],
launch_template=batch.CfnComputeEnvironment.LaunchTemplateSpecificationProperty(
launch_template_name=batch_launch_template_name,
),
),
)

job_queue = batch.JobQueue(
job_queue = batch.CfnJobQueue(
self,
f"{prefix}-job-queue",
priority=1,
compute_environments=[batch.JobQueueComputeEnvironment(compute_environment=compute_environment, order=1)],
compute_environment_order=[
batch.CfnJobQueue.ComputeEnvironmentOrderProperty(compute_environment=compute_environment.ref, order=1)
],
state="ENABLED",
)

lambda_function = BatchLambdaFunction(
Expand All @@ -307,8 +320,8 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
function_name=lambda_function_name,
code_path=lambda_script_dir,
environment={
"BATCH_JOB_QUEUE": job_queue.job_queue_name,
"BATCH_JOB_DEFINITION": job_definition.job_definition_name,
"BATCH_JOB_QUEUE": job_queue.ref,
"BATCH_JOB_DEFINITION": job_definition.ref,
"METRICS_BUCKET": metrics_bucket.bucket_name,
"STACK_NAME_PREFIX": prefix,
},
Expand All @@ -319,10 +332,10 @@ def __init__(self, scope: Construct, id: str, static_stack: StaticResourceStack,
core.CfnOutput(
self,
"ComputeEnvironmentARN",
value=compute_environment.compute_environment_arn,
value=compute_environment.ref,
)
core.CfnOutput(self, "JobQueueARN", value=job_queue.job_queue_arn)
core.CfnOutput(self, "JobDefinitionARN", value=job_definition.job_definition_arn)
core.CfnOutput(self, "JobQueueARN", value=job_queue.ref)
core.CfnOutput(self, "JobDefinitionARN", value=job_definition.ref)
core.CfnOutput(
self,
"EcrRepositoryName",
Expand Down
Loading