Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move aws functions. #4134

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions contrib/admin/mypy-with-ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,14 @@ def main():
'src/toil/lib/encryption/conftest.py',
'src/toil/lib/encryption/__init__.py',
'src/toil/lib/aws/__init__.py',
'src/toil/lib/aws/ec2.py',
'src/toil/lib/aws/s3.py',
'src/toil/lib/aws/iam.py',
'src/toil/lib/aws/config.py',
'src/toil/lib/aws/utils.py',
'src/toil/lib/aws/util.py',
'src/toil/lib/checksum.py',
'src/toil/lib/pipes.py',
'src/toil/server/utils.py',
'src/toil/utils/toilStats.py'
]]
Expand Down
2 changes: 1 addition & 1 deletion src/toil/batchSystems/awsBatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
from toil.bus import MessageBus, MessageOutbox, JobAnnotationMessage
from toil.common import Config, Toil
from toil.job import JobDescription
from toil.lib.aws import get_current_aws_region, zone_to_region
from toil.lib.aws.util import get_current_aws_region, zone_to_region
from toil.lib.aws.session import establish_boto3_session
from toil.lib.conversions import b_to_mib, mib_to_b
from toil.lib.misc import slow_down, utc_now, unix_now_ms
Expand Down
2 changes: 1 addition & 1 deletion src/toil/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
ClusterSizeMessage,
ClusterDesiredSizeMessage)
from toil.fileStores import FileID
from toil.lib.aws import zone_to_region
from toil.lib.aws.util import zone_to_region
from toil.lib.compatibility import deprecated
from toil.lib.conversions import bytes2human, human2bytes
from toil.lib.retry import retry
Expand Down
34 changes: 8 additions & 26 deletions src/toil/jobStores/aws/jobStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
retry_s3,
retryable_s3_errors
)
from toil.lib.aws.s3 import list_multipart_uploads, delete_bucket
from toil.lib.compatibility import compat_bytes
from toil.lib.aws.session import establish_boto3_session
from toil.lib.ec2nodes import EC2Regions
Expand Down Expand Up @@ -1211,14 +1212,15 @@ def readFrom(self, readable):
parts = []
logger.debug('Multipart upload started as %s', uploadId)


for attempt in retry_s3():
with attempt:
for i in range(CONSISTENCY_TICKS):
# Sometimes we can create a multipart upload and not see it. Wait around for it.
response = client.list_multipart_uploads(Bucket=bucket_name,
MaxUploads=1,
Prefix=compat_bytes(info.fileID))
# Sometimes we can create a multipart upload and can not see it. Wait around for it.
response = list_multipart_uploads(
s3_resource=store.s3_resource,
bucket=bucket_name,
prefix=compat_bytes(info.fileID)
)
if len(response['Uploads']) != 0 and response['Uploads'][0]['UploadId'] == uploadId:
logger.debug('Multipart upload visible as %s', uploadId)
break
Expand Down Expand Up @@ -1627,27 +1629,7 @@ def _delete_domain(self, domain):

@staticmethod
def _delete_bucket(bucket):
"""
:param bucket: S3.Bucket
"""
for attempt in retry_s3():
with attempt:
try:
uploads = s3_boto3_client.list_multipart_uploads(Bucket=bucket.name).get('Uploads')
if uploads:
for u in uploads:
s3_boto3_client.abort_multipart_upload(Bucket=bucket.name,
Key=u["Key"],
UploadId=u["UploadId"])

bucket.objects.all().delete()
bucket.object_versions.delete()
bucket.delete()
except s3_boto3_client.exceptions.NoSuchBucket:
pass
except ClientError as e:
if get_error_status(e) != 404:
raise
delete_bucket(s3_boto3_resource, bucket.name)


aRepr = reprlib.Repr()
Expand Down
2 changes: 1 addition & 1 deletion src/toil/jobStores/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ def generate_locator(
from toil.jobStores.aws.jobStore import AWSJobStore # noqa

# Find a region
from toil.lib.aws import get_current_aws_region
from toil.lib.aws.util import get_current_aws_region

region = get_current_aws_region()

Expand Down
155 changes: 0 additions & 155 deletions src/toil/lib/aws/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,158 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import inspect
import json
import logging
import os
import re
import socket
import threading
from functools import lru_cache
from urllib.request import urlopen
from urllib.error import URLError

from typing import Any, Callable, Dict, Iterable, List, Optional, TypeVar, Union

logger = logging.getLogger(__name__)

# This file isn't allowed to import anything that depends on Boto or Boto3,
# which may not be installed, because it has to be importable everywhere.

def get_current_aws_region() -> Optional[str]:
"""
Return the AWS region that the currently configured AWS zone (see
get_current_aws_zone()) is in.
"""
# Try to derive it from the zone.
aws_zone = get_current_aws_zone()
return zone_to_region(aws_zone) if aws_zone else None

def get_aws_zone_from_environment() -> Optional[str]:
"""
Get the AWS zone from TOIL_AWS_ZONE if set.
"""
return os.environ.get('TOIL_AWS_ZONE', None)

def get_aws_zone_from_metadata() -> Optional[str]:
"""
Get the AWS zone from instance metadata, if on EC2 and the boto module is
available. Otherwise, gets the AWS zone from ECS task metadata, if on ECS.
"""

# When running on ECS, we also appear to be running on EC2, but the EC2
# metadata service doesn't seem to be contactable. So we check ECS first.

if running_on_ecs():
# Use the ECS metadata service
logger.debug("Fetch AZ from ECS metadata")
try:
resp = json.load(urlopen(os.environ['ECS_CONTAINER_METADATA_URI_V4'] + '/task', timeout=1))
logger.debug("ECS metadata: %s", resp)
if isinstance(resp, dict):
# We found something. Go with that.
return resp.get('AvailabilityZone')
except (json.decoder.JSONDecodeError, KeyError, URLError) as e:
# We're on ECS but can't get the metadata. That's odd.
logger.warning("Skipping ECS metadata due to error: %s", e)
if running_on_ec2():
# On EC2 alone, or on ECS but we couldn't get ahold of the ECS
# metadata.
try:
# Use the EC2 metadata service
import boto
from boto.utils import get_instance_metadata
logger.debug("Fetch AZ from EC2 metadata")
return get_instance_metadata()['placement']['availability-zone']
except ImportError:
# This is expected to happen a lot
logger.debug("No boto to fetch ECS metadata")
except (KeyError, URLError) as e:
# We're on EC2 but can't get the metadata. That's odd.
logger.warning("Skipping EC2 metadata due to error: %s", e)
return None

def get_aws_zone_from_boto() -> Optional[str]:
"""
Get the AWS zone from the Boto config file, if it is configured and the
boto module is available.
"""
try:
import boto
zone = boto.config.get('Boto', 'ec2_region_name')
if zone is not None:
zone += 'a' # derive an availability zone in the region
return zone
except ImportError:
pass
return None

def get_aws_zone_from_environment_region() -> Optional[str]:
"""
Pick an AWS zone in the region defined by TOIL_AWS_REGION, if it is set.
"""
aws_region = os.environ.get('TOIL_AWS_REGION')
if aws_region is not None:
# If a region is specified, use the first zone in the region.
return aws_region + 'a'
# Otherwise, don't pick a region and let us fall back on the next method.
return None

def get_current_aws_zone() -> Optional[str]:
"""
Get the currently configured or occupied AWS zone to use.

Reports the TOIL_AWS_ZONE environment variable if set.

Otherwise, if we have boto and are running on EC2, or if we are on ECS,
reports the zone we are running in.

Otherwise, if we have the TOIL_AWS_REGION variable set, chooses a zone in
that region.

Finally, if we have boto2, and a default region is configured in Boto 2,
chooses a zone in that region.

Returns None if no method can produce a zone to use.
"""
return get_aws_zone_from_environment() or \
get_aws_zone_from_metadata() or \
get_aws_zone_from_environment_region() or \
get_aws_zone_from_boto()

def zone_to_region(zone: str) -> str:
"""Get a region (e.g. us-west-2) from a zone (e.g. us-west-1c)."""
# re.compile() caches the regex internally so we don't have to
availability_zone = re.compile(r'^([a-z]{2}-[a-z]+-[1-9][0-9]*)([a-z])$')
m = availability_zone.match(zone)
if not m:
raise ValueError(f"Can't extract region from availability zone '{zone}'")
return m.group(1)

def running_on_ec2() -> bool:
"""
Return True if we are currently running on EC2, and false otherwise.
"""
# TODO: Move this to toil.lib.ec2 and make toil.lib.ec2 importable without boto?
def file_begins_with(path, prefix):
with open(path) as f:
return f.read(len(prefix)) == prefix

hv_uuid_path = '/sys/hypervisor/uuid'
if os.path.exists(hv_uuid_path) and file_begins_with(hv_uuid_path, 'ec2'):
return True
# Some instances do not have the /sys/hypervisor/uuid file, so check the identity document instead.
# See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html
try:
urlopen('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=1)
return True
except (URLError, socket.timeout):
return False

def running_on_ecs() -> bool:
"""
Return True if we are currently running on Amazon ECS, and false otherwise.
"""
# We only care about relatively current ECS
return 'ECS_CONTAINER_METADATA_URI_V4' in os.environ
30 changes: 30 additions & 0 deletions src/toil/lib/aws/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import math

from toil.lib.units import MIB, TB

S3_PARALLELIZATION_FACTOR = 8
S3_PART_SIZE = 16 * MIB

# AWS Defined Limits
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
AWS_MAX_CHUNK_SIZE = 5 * TB
# Files must be larger than this before we consider multipart uploads.
AWS_MIN_CHUNK_SIZE = 64 * MIB
# Convenience variable for Boto3 TransferConfig(multipart_threshold=).
MULTIPART_THRESHOLD = AWS_MIN_CHUNK_SIZE + 1
# Maximum number of parts allowed in a multipart upload. This is a limitation imposed by S3.
AWS_MAX_MULTIPART_COUNT = 10000
# Note: There is no minimum size limit on the last part of a multipart upload.

# The chunk size we chose arbitrarily, but it must be consistent for etags
DEFAULT_AWS_CHUNK_SIZE = 128 * MIB
assert AWS_MAX_CHUNK_SIZE > DEFAULT_AWS_CHUNK_SIZE > AWS_MIN_CHUNK_SIZE


def get_s3_multipart_chunk_size(file_size: int) -> int:
if file_size >= AWS_MAX_CHUNK_SIZE * AWS_MAX_MULTIPART_COUNT:
return AWS_MAX_CHUNK_SIZE
elif file_size <= DEFAULT_AWS_CHUNK_SIZE * AWS_MAX_MULTIPART_COUNT:
return DEFAULT_AWS_CHUNK_SIZE
else:
return math.ceil(file_size / AWS_MAX_MULTIPART_COUNT)
39 changes: 39 additions & 0 deletions src/toil/lib/aws/ec2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (C) 2015-2021 Regents of the University of California
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AWS functions dealing with ec2."""
import os
import socket
from urllib.error import URLError
from urllib.request import urlopen


def running_on_ec2() -> bool:
"""
Return True if we are currently running on EC2, and false otherwise.
"""
# TODO: Move this to toil.lib.ec2 and make toil.lib.ec2 importable without boto?
def file_begins_with(path, prefix):
with open(path) as f:
return f.read(len(prefix)) == prefix

hv_uuid_path = '/sys/hypervisor/uuid'
if os.path.exists(hv_uuid_path) and file_begins_with(hv_uuid_path, 'ec2'):
return True
# Some instances do not have the /sys/hypervisor/uuid file, so check the identity document instead.
# See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html
try:
urlopen('http://169.254.169.254/latest/dynamic/instance-identity/document', timeout=1)
return True
except (URLError, socket.timeout):
return False
9 changes: 9 additions & 0 deletions src/toil/lib/aws/ecs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import os


def running_on_ecs() -> bool:
"""
Return True if we are currently running on Amazon ECS, and false otherwise.
"""
# We only care about relatively current ECS
return 'ECS_CONTAINER_METADATA_URI_V4' in os.environ
Loading