# Complete Guide to Amazon ECS: Container Orchestration at Scale

Amazon Elastic Container Service (ECS) is a fully managed container orchestration service that makes it easy to run, stop, and manage Docker containers on a cluster. ECS eliminates the need to install and operate your own container orchestration software, manage and scale a cluster of virtual machines, or schedule containers on those virtual machines.

## Overview

ECS supports two launch types. With AWS Fargate, your containers run on serverless infrastructure that AWS manages. With the EC2 launch type, your tasks and services run on a cluster of Amazon EC2 instances that you manage, which gives you more control over the infrastructure.

## Key Benefits

### 1. Fully Managed

- No control plane to manage
- Integrated with AWS services
- Automatic patching and scaling
- Built-in security and monitoring

### 2. Flexible

- EC2 and Fargate launch types
- Support for Docker containers
- Multiple scheduling strategies
- Service discovery and load balancing

### 3. Secure

- IAM integration for fine-grained access
- VPC networking isolation
- Secrets management integration
- Container image vulnerability scanning

### 4. Cost-Effective

- Pay only for resources used
- Spot instances support
- Right-sizing recommendations
- Reserved capacity pricing

## Core Concepts

### 1. Clusters

# ECS cluster with Container Insights enabled.
# Capacity providers are attached by ECSClusterCapacityProviders below:
# "EC2" is not a predefined capacity provider name (only FARGATE and
# FARGATE_SPOT are), and referencing the Auto Scaling group-backed provider
# directly from the cluster would create a circular dependency through the
# launch template's UserData.
ECSCluster:
  Type: AWS::ECS::Cluster
  Properties:
    ClusterName: my-ecs-cluster
    ClusterSettings:
      - Name: containerInsights
        Value: enabled
    Tags:
      - Key: Environment
        Value: Production

# Attaches the Fargate providers and the EC2 capacity provider
# (EC2CapacityProvider, defined in the "Cluster Auto Scaling" section)
# to the cluster and sets the default strategy.
ECSClusterCapacityProviders:
  Type: AWS::ECS::ClusterCapacityProviderAssociations
  Properties:
    Cluster: !Ref ECSCluster
    CapacityProviders:
      - FARGATE
      - FARGATE_SPOT
      - !Ref EC2CapacityProvider
    DefaultCapacityProviderStrategy:
      - CapacityProvider: FARGATE
        Weight: 1
        Base: 2

# Auto Scaling Group for EC2 capacity
ECSAutoScalingGroup:
  Type: AWS::AutoScaling::AutoScalingGroup
  Properties:
    VPCZoneIdentifier:
      - !Ref PrivateSubnet1
      - !Ref PrivateSubnet2
    LaunchTemplate:
      LaunchTemplateId: !Ref ECSLaunchTemplate
      Version: !GetAtt ECSLaunchTemplate.LatestVersionNumber
    MinSize: 1
    MaxSize: 10
    DesiredCapacity: 3
    # Required because EC2CapacityProvider sets
    # ManagedTerminationProtection: ENABLED.
    NewInstancesProtectedFromScaleIn: true
    # No TargetGroupARNs here: ApplicationTargetGroup uses TargetType "ip",
    # which cannot be attached to an Auto Scaling group. ECS registers task
    # IPs with the target group itself.
    Tags:
      - Key: Name
        Value: ECS-Instance
        PropagateAtLaunch: true

# Launch Template for ECS instances
ECSLaunchTemplate:
  Type: AWS::EC2::LaunchTemplate
  Properties:
    LaunchTemplateName: ecs-launch-template
    LaunchTemplateData:
      # Resolve the current ECS-optimized Amazon Linux 2 AMI for the stack's
      # region instead of pinning a single-region AMI ID that goes stale.
      ImageId: '{{resolve:ssm:/aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id}}'
      InstanceType: t3.medium
      SecurityGroupIds:
        - !Ref ECSSecurityGroup
      IamInstanceProfile:
        Arn: !GetAtt ECSInstanceProfile.Arn
      # Two short-form tags cannot be stacked on one node, so the outer
      # function uses the full "Fn::Base64" form.
      UserData:
        Fn::Base64: !Sub |
          #!/bin/bash
          echo ECS_CLUSTER=${ECSCluster} >> /etc/ecs/ecs.config
          echo ECS_ENABLE_CONTAINER_METADATA=true >> /etc/ecs/ecs.config
### 2. Task Definitions

# Fargate Task Definition: a single nginx container with CloudWatch logging,
# an injected database password and a container-level health check.
FargateTaskDefinition:
  Type: AWS::ECS::TaskDefinition
  Properties:
    Family: my-fargate-task
    NetworkMode: awsvpc
    RequiresCompatibilities:
      - FARGATE
    # Cpu and Memory are string properties; quoted to avoid implicit typing.
    Cpu: '512'
    Memory: '1024'
    ExecutionRoleArn: !GetAtt ECSExecutionRole.Arn
    TaskRoleArn: !GetAtt ECSTaskRole.Arn
    ContainerDefinitions:
      - Name: web-server
        Image: nginx:latest
        PortMappings:
          - ContainerPort: 80
            Protocol: tcp
        LogConfiguration:
          LogDriver: awslogs
          Options:
            awslogs-group: !Ref ECSLogGroup
            awslogs-region: !Ref AWS::Region
            awslogs-stream-prefix: ecs
        Environment:
          - Name: ENV
            Value: production
        Secrets:
          # DatabaseSecret stores JSON ({"username": ..., "password": ...}).
          # The ":password::" suffix injects only the password key; a bare
          # !Ref would inject the whole JSON document.
          - Name: DATABASE_PASSWORD
            ValueFrom: !Sub '${DatabaseSecret}:password::'
        HealthCheck:
          Command:
            - CMD-SHELL
            - 'curl -f http://localhost/ || exit 1'
          Interval: 30
          Timeout: 5
          Retries: 3
          StartPeriod: 60

# Multi-container Task Definition: web-app starts only after redis-cache
# reports HEALTHY.
MultiContainerTaskDefinition:
  Type: AWS::ECS::TaskDefinition
  Properties:
    Family: multi-container-task
    NetworkMode: awsvpc
    RequiresCompatibilities:
      - FARGATE
    Cpu: '1024'
    Memory: '2048'
    ExecutionRoleArn: !GetAtt ECSExecutionRole.Arn
    ContainerDefinitions:
      - Name: web-app
        Image: my-app:latest
        PortMappings:
          - ContainerPort: 8080
        DependsOn:
          - ContainerName: redis-cache
            Condition: HEALTHY
        # "Links" removed: it is only supported in bridge network mode.
        # In awsvpc mode the containers of a task share a network namespace,
        # so web-app reaches Redis at localhost:6379.
        LogConfiguration:
          LogDriver: awslogs
          Options:
            awslogs-group: !Ref ECSLogGroup
            awslogs-region: !Ref AWS::Region
            awslogs-stream-prefix: web-app
      - Name: redis-cache
        Image: redis:alpine
        PortMappings:
          - ContainerPort: 6379
        HealthCheck:
          Command:
            - CMD-SHELL
            - redis-cli ping
          Interval: 30
          Timeout: 5
          Retries: 3
        LogConfiguration:
          LogDriver: awslogs
          Options:
            awslogs-group: !Ref ECSLogGroup
            awslogs-region: !Ref AWS::Region
            awslogs-stream-prefix: redis

### 3. Services

# Fargate Service behind the Application Load Balancer
FargateService:
  Type: AWS::ECS::Service
  # The target group must be attached to a load balancer (via the listener)
  # before a service can register with it.
  DependsOn: ALBListener
  Properties:
    ServiceName: my-fargate-service
    Cluster: !Ref ECSCluster
    TaskDefinition: !Ref FargateTaskDefinition
    LaunchType: FARGATE
    DesiredCount: 3
    # Ignore load balancer health checks while the container starts up;
    # matches the 60 s StartPeriod of the container health check.
    HealthCheckGracePeriodSeconds: 60
    DeploymentConfiguration:
      MinimumHealthyPercent: 50
      MaximumPercent: 200
      DeploymentCircuitBreaker:
        Enable: true
        Rollback: true
    NetworkConfiguration:
      AwsvpcConfiguration:
        SecurityGroups:
          - !Ref ECSSecurityGroup
        Subnets:
          - !Ref PrivateSubnet1
          - !Ref PrivateSubnet2
        AssignPublicIp: DISABLED
    LoadBalancers:
      - TargetGroupArn: !Ref ApplicationTargetGroup
        ContainerName: web-server
        ContainerPort: 80
    ServiceRegistries:
      # A-record registry with awsvpc networking: the task ENI address is
      # registered, so no container name/port is needed.
      # NOTE(review): ContainerName was dropped here; confirm against the
      # ServiceRegistry API reference if SRV records are introduced.
      - RegistryArn: !GetAtt ServiceDiscoveryService.Arn
    EnableExecuteCommand: true  # Enable ECS Exec
    PropagateTags: SERVICE
    Tags:
      - Key: Environment
        Value: Production

# Service with Capacity Provider Strategy: 2 tasks always on FARGATE,
# the remainder split 1:4 between FARGATE and FARGATE_SPOT.
CapacityProviderService:
  Type: AWS::ECS::Service
  Properties:
    ServiceName: mixed-capacity-service
    Cluster: !Ref ECSCluster
    # "TaskDefinition" is not a resource defined in this guide; use the
    # Fargate task definition declared above.
    TaskDefinition: !Ref FargateTaskDefinition
    DesiredCount: 5
    CapacityProviderStrategy:
      - CapacityProvider: FARGATE
        Weight: 1
        Base: 2
      - CapacityProvider: FARGATE_SPOT
        Weight: 4
    # Required because the task definition uses awsvpc network mode.
    NetworkConfiguration:
      AwsvpcConfiguration:
        SecurityGroups:
          - !Ref ECSSecurityGroup
        Subnets:
          - !Ref PrivateSubnet1
          - !Ref PrivateSubnet2
        AssignPublicIp: DISABLED
    DeploymentConfiguration:
      MinimumHealthyPercent: 50
      MaximumPercent: 200

## Container Management

### 1. Container Images and ECR Integration

# ECR repository that holds the application's container images.
ECRRepository:
  Type: AWS::ECR::Repository
  Properties:
    RepositoryName: my-application
    # Tags stay mutable so a moving tag such as "latest" can be re-pushed.
    ImageTagMutability: MUTABLE
    # Scan every image for vulnerabilities as it is pushed.
    ImageScanningConfiguration:
      ScanOnPush: true
    # Rule 1 expires "prod"-prefixed images beyond the newest 10;
    # rule 2 expires untagged images beyond the newest 5.
    LifecyclePolicy:
      LifecyclePolicyText: |
        {
          "rules": [
            {
              "rulePriority": 1,
              "description": "Keep last 10 production images",
              "selection": {
                "tagStatus": "tagged",
                "tagPrefixList": ["prod"],
                "countType": "imageCountMoreThan",
                "countNumber": 10
              },
              "action": {
                "type": "expire"
              }
            },
            {
              "rulePriority": 2,
              "description": "Keep only 5 untagged images",
              "selection": {
                "tagStatus": "untagged",
                "countType": "imageCountMoreThan",
                "countNumber": 5
              },
              "action": {
                "type": "expire"
              }
            }
          ]
        }

#!/bin/bash
# Build and push Docker image to ECR.
# The shebang must be the first line of the script to take effect.
# Abort on the first failing command, on unset variables and on failures
# inside pipelines, so a failed login never leads to a push attempt.
set -euo pipefail

ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
REGION=us-east-1
REPOSITORY_NAME=my-application
IMAGE_TAG=v1.0.0

# Registry host and fully qualified repository, derived once and reused.
REGISTRY="${ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com"
IMAGE_URI="${REGISTRY}/${REPOSITORY_NAME}"

# Build Docker image
docker build -t "${REPOSITORY_NAME}:${IMAGE_TAG}" .

# Get ECR login token
aws ecr get-login-password --region "${REGION}" \
  | docker login --username AWS --password-stdin "${REGISTRY}"

# Tag and push image
docker tag "${REPOSITORY_NAME}:${IMAGE_TAG}" "${IMAGE_URI}:${IMAGE_TAG}"
docker push "${IMAGE_URI}:${IMAGE_TAG}"

# Also tag as latest
docker tag "${REPOSITORY_NAME}:${IMAGE_TAG}" "${IMAGE_URI}:latest"
docker push "${IMAGE_URI}:latest"

### 2. Service Discovery

# Cloud Map private DNS namespace, resolvable only inside the VPC.
ServiceDiscoveryNamespace:
  Type: AWS::ServiceDiscovery::PrivateDnsNamespace
  Properties:
    Vpc: !Ref VPC
    Name: my-app.local

# Cloud Map service: registers instances as A records under
# web-service.my-app.local with multivalue answer routing.
ServiceDiscoveryService:
  Type: AWS::ServiceDiscovery::Service
  Properties:
    Name: web-service
    NamespaceId: !Ref ServiceDiscoveryNamespace
    DnsConfig:
      RoutingPolicy: MULTIVALUE
      DnsRecords:
        - Type: A
          TTL: 300
    HealthCheckCustomConfig:
      FailureThreshold: 2

### 3. Load Balancing

# Internet-facing Application Load Balancer in the public subnets
ApplicationLoadBalancer:
  Type: AWS::ElasticLoadBalancingV2::LoadBalancer
  Properties:
    Name: ecs-alb
    Type: application
    Scheme: internet-facing
    IpAddressType: ipv4
    Subnets:
      - !Ref PublicSubnet1
      - !Ref PublicSubnet2
    SecurityGroups:
      - !Ref ALBSecurityGroup

# Target group that ECS registers task IP addresses with
ApplicationTargetGroup:
  Type: AWS::ElasticLoadBalancingV2::TargetGroup
  Properties:
    Name: ecs-targets
    Port: 80
    Protocol: HTTP
    TargetType: ip  # For Fargate
    VpcId: !Ref VPC
    # The web-server container is a stock nginx image that serves "/" and
    # would answer 404 on "/health". Use the same path as the container
    # health check so healthy tasks are not cycled by the load balancer.
    # Point this at a dedicated endpoint once the application exposes one.
    HealthCheckPath: /
    HealthCheckProtocol: HTTP
    HealthCheckIntervalSeconds: 30
    HealthCheckTimeoutSeconds: 5
    HealthyThresholdCount: 2
    UnhealthyThresholdCount: 5
    Matcher:
      # HttpCode is a string property (it also accepts ranges like "200-299").
      HttpCode: '200'

# HTTP listener forwarding all traffic to the target group
ALBListener:
  Type: AWS::ElasticLoadBalancingV2::Listener
  Properties:
    LoadBalancerArn: !Ref ApplicationLoadBalancer
    Port: 80
    Protocol: HTTP
    DefaultActions:
      - Type: forward
        TargetGroupArn: !Ref ApplicationTargetGroup

## Auto Scaling

### 1. Service Auto Scaling

# Service Auto Scaling Configuration
ServiceScalingTarget:
  Type: AWS::ApplicationAutoScaling::ScalableTarget
  Properties:
    ServiceNamespace: ecs
    # ResourceId needs the service *name*. !Ref / ${FargateService} returns
    # the service ARN, so the Name attribute is used instead.
    ResourceId: !Sub 'service/${ECSCluster}/${FargateService.Name}'
    ScalableDimension: ecs:service:DesiredCount
    MinCapacity: 2
    MaxCapacity: 20
    RoleARN: !GetAtt ECSAutoScalingRole.Arn

# CPU-based scaling policy
CPUScalingPolicy:
  Type: AWS::ApplicationAutoScaling::ScalingPolicy
  Properties:
    PolicyName: ECS-CPU-Scaling
    PolicyType: TargetTrackingScaling
    # ScalingTargetId replaces ResourceId/ScalableDimension/ServiceNamespace
    # and makes the policy wait for the scalable target to be registered.
    ScalingTargetId: !Ref ServiceScalingTarget
    TargetTrackingScalingPolicyConfiguration:
      TargetValue: 70.0
      PredefinedMetricSpecification:
        PredefinedMetricType: ECSServiceAverageCPUUtilization
      ScaleOutCooldown: 300
      ScaleInCooldown: 300

# Memory-based scaling policy
MemoryScalingPolicy:
  Type: AWS::ApplicationAutoScaling::ScalingPolicy
  Properties:
    PolicyName: ECS-Memory-Scaling
    PolicyType: TargetTrackingScaling
    ScalingTargetId: !Ref ServiceScalingTarget
    TargetTrackingScalingPolicyConfiguration:
      TargetValue: 80.0
      PredefinedMetricSpecification:
        PredefinedMetricType: ECSServiceAverageMemoryUtilization
      ScaleOutCooldown: 300
      ScaleInCooldown: 300

# ALB Request-based scaling
ALBScalingPolicy:
  Type: AWS::ApplicationAutoScaling::ScalingPolicy
  # The resource label is only valid once the target group is attached to
  # the load balancer, which happens when the listener is created.
  DependsOn: ALBListener
  Properties:
    PolicyName: ECS-ALB-Scaling
    PolicyType: TargetTrackingScaling
    ScalingTargetId: !Ref ServiceScalingTarget
    TargetTrackingScalingPolicyConfiguration:
      TargetValue: 1000.0
      PredefinedMetricSpecification:
        PredefinedMetricType: ALBRequestCountPerTarget
        # Format: app/<lb-name>/<lb-id>/targetgroup/<tg-name>/<tg-id>
        ResourceLabel: !Sub
          - '${LoadBalancerFullName}/${TargetGroupFullName}'
          - LoadBalancerFullName: !GetAtt ApplicationLoadBalancer.LoadBalancerFullName
            TargetGroupFullName: !GetAtt ApplicationTargetGroup.TargetGroupFullName

### 2. Cluster Auto Scaling

# Capacity provider backed by the ECS Auto Scaling group. With managed
# scaling enabled, ECS adjusts the group's size toward TargetCapacity
# (percent utilisation) in steps of 1 to 10 instances.
EC2CapacityProvider:
  Type: AWS::ECS::CapacityProvider
  Properties:
    Name: ec2-capacity-provider
    Tags:
      - Key: Name
        Value: EC2CapacityProvider
    AutoScalingGroupProvider:
      AutoScalingGroupArn: !Ref ECSAutoScalingGroup
      # NOTE(review): managed termination protection requires instance
      # scale-in protection on the Auto Scaling group - confirm the group
      # sets NewInstancesProtectedFromScaleIn.
      ManagedTerminationProtection: ENABLED
      ManagedScaling:
        Status: ENABLED
        TargetCapacity: 100
        MinimumScalingStepSize: 1
        MaximumScalingStepSize: 10

## Deployment Strategies

### 1. Blue/Green Deployment

1
import boto3
2
import json
3
import time
4

class ECSBlueGreenDeployment:
    """Simplified blue/green deployment for an ECS service.

    A "green" copy of the existing ("blue") service is created with a new
    task definition, load balancer traffic is pointed at it, and the blue
    service is then scaled to zero and deleted.
    """

    def __init__(self, cluster_name, service_name, region='us-east-1'):
        """Create ECS and ELBv2 clients for ``region`` and remember the target.

        Args:
            cluster_name: Name of the ECS cluster hosting the service.
            service_name: Name of the existing (blue) service.
            region: AWS region used for both clients.
        """
        self.ecs = boto3.client('ecs', region_name=region)
        self.elbv2 = boto3.client('elbv2', region_name=region)
        self.cluster_name = cluster_name
        self.service_name = service_name

    def deploy(self, new_task_definition_arn, target_group_arns,
               cleanup_delay_seconds=300):
        """
        Perform blue/green deployment

        Args:
            new_task_definition_arn: Task definition for the green service.
            target_group_arns: Target groups to attach to the green service.
            cleanup_delay_seconds: Pause between the traffic switch and the
                removal of the blue service (default 300, i.e. 5 minutes).

        Returns:
            True once the blue service has been deleted.

        Raises:
            IndexError: If the blue service does not exist in the cluster.
        """
        # Get current service configuration
        service = self.get_service_details()
        current_count = service['desiredCount']

        # Create new service with new task definition
        new_service_name = f"{self.service_name}-green"

        print(f"Creating green service: {new_service_name}")
        self.create_green_service(
            new_service_name,
            new_task_definition_arn,
            current_count,
            service
        )

        # Wait for green service to be stable
        print("Waiting for green service to stabilize...")
        self.wait_for_service_stable(new_service_name)

        # Switch traffic to green service
        print("Switching traffic to green service...")
        self.switch_traffic(target_group_arns, new_service_name)

        # Cleanup old (blue) service
        print("Cleaning up blue service...")
        time.sleep(cleanup_delay_seconds)
        self.cleanup_blue_service()

        return True

    def get_service_details(self):
        """
        Get current service configuration

        Returns:
            The service description dict returned by ``describe_services``.

        Raises:
            IndexError: If the service is not found. The exception type of
                the original unchecked ``[0]`` lookup is kept, but the
                message now names the missing service.
        """
        response = self.ecs.describe_services(
            cluster=self.cluster_name,
            services=[self.service_name]
        )
        services = response.get('services', [])
        if not services:
            raise IndexError(
                f"ECS service '{self.service_name}' not found in cluster "
                f"'{self.cluster_name}'"
            )
        return services[0]

    def create_green_service(self, green_service_name, task_def_arn, desired_count, blue_service):
        """
        Create green service with new task definition

        The green service copies the blue service's placement (capacity
        provider strategy or launch type), networking, service registries
        and deployment configuration. It starts without load balancers;
        they are attached by ``switch_traffic``.
        """
        service_config = {
            'serviceName': green_service_name,
            'cluster': self.cluster_name,
            'taskDefinition': task_def_arn,
            'desiredCount': desired_count,
            'loadBalancers': [],  # Will be added after traffic switch
            'serviceRegistries': blue_service.get('serviceRegistries', []),
            'deploymentConfiguration': blue_service.get('deploymentConfiguration', {})
        }

        # launchType and capacityProviderStrategy are mutually exclusive:
        # only fall back to a launch type when blue has no strategy.
        strategy = blue_service.get('capacityProviderStrategy')
        if strategy:
            service_config['capacityProviderStrategy'] = strategy
        else:
            service_config['launchType'] = blue_service.get('launchType', 'FARGATE')

        # Services whose tasks do not use awsvpc networking have no network
        # configuration; send the key only when blue actually has one.
        network_configuration = blue_service.get('networkConfiguration')
        if network_configuration:
            service_config['networkConfiguration'] = network_configuration

        self.ecs.create_service(**service_config)

    def wait_for_service_stable(self, service_name):
        """
        Wait for service to reach stable state

        Polls every 30 seconds for at most 20 attempts (10 minutes).
        """
        waiter = self.ecs.get_waiter('services_stable')
        waiter.wait(
            cluster=self.cluster_name,
            services=[service_name],
            WaiterConfig={
                'Delay': 30,
                'MaxAttempts': 20
            }
        )

    def switch_traffic(self, target_group_arns, green_service_name):
        """
        Switch load balancer traffic to green service

        All target groups are attached in a single ``update_service`` call.
        The ``loadBalancers`` list of an update replaces the previous list,
        so updating once per target group would leave only the last one
        attached. This is simplified - in practice, you'd gradually shift
        traffic.
        """
        arns = list(target_group_arns)
        if not arns:
            return

        self.update_service_load_balancers(green_service_name, arns)

        # Changing load balancers starts a new deployment of the green
        # service; wait for it before the caller retires the blue service.
        self.wait_for_service_stable(green_service_name)

    def update_service_load_balancers(self, service_name, target_group_arn,
                                      container_name='web-server',
                                      container_port=80):
        """
        Update service load balancer configuration

        Args:
            service_name: Service whose load balancers are replaced.
            target_group_arn: One target group ARN (str) or an iterable of
                ARNs; each becomes one load balancer entry.
            container_name: Container receiving the traffic.
            container_port: Port on that container.
        """
        if isinstance(target_group_arn, str):
            arns = [target_group_arn]
        else:
            arns = list(target_group_arn)

        self.ecs.update_service(
            cluster=self.cluster_name,
            service=service_name,
            loadBalancers=[
                {
                    'targetGroupArn': arn,
                    'containerName': container_name,
                    'containerPort': container_port
                }
                for arn in arns
            ]
        )

    def cleanup_blue_service(self):
        """
        Scale down and delete blue service
        """
        # Scale down blue service
        self.ecs.update_service(
            cluster=self.cluster_name,
            service=self.service_name,
            desiredCount=0
        )

        # Wait for scale down
        self.wait_for_service_stable(self.service_name)

        # Delete blue service
        self.ecs.delete_service(
            cluster=self.cluster_name,
            service=self.service_name
        )


# Usage example (guarded so importing this module does not start a deployment)
if __name__ == '__main__':
    deployer = ECSBlueGreenDeployment('my-cluster', 'my-service')
    deployer.deploy(
        'arn:aws:ecs:region:account:task-definition/my-app:2',
        ['arn:aws:elasticloadbalancing:region:account:targetgroup/my-targets/abc123']
    )

### 2. Rolling Updates

# Service with rolling update configuration
RollingUpdateService:
  Type: AWS::ECS::Service
  Properties:
    ServiceName: rolling-update-service
    Cluster: !Ref ECSCluster
    # "TaskDefinition" is not a resource defined in this guide; use the
    # Fargate task definition from the "Task Definitions" section.
    TaskDefinition: !Ref FargateTaskDefinition
    LaunchType: FARGATE
    DesiredCount: 4
    # Required because the task definition uses awsvpc network mode.
    NetworkConfiguration:
      AwsvpcConfiguration:
        SecurityGroups:
          - !Ref ECSSecurityGroup
        Subnets:
          - !Ref PrivateSubnet1
          - !Ref PrivateSubnet2
        AssignPublicIp: DISABLED
    DeploymentConfiguration:
      MinimumHealthyPercent: 50  # Keep at least 50% running during deployment
      MaximumPercent: 200        # Can scale up to 200% during deployment
      DeploymentCircuitBreaker:
        Enable: true
        Rollback: true           # Auto rollback on failure
    PropagateTags: SERVICE

## Monitoring and Logging

### 1. Container Insights

# Enable Container Insights on cluster
ClusterWithInsights:
  Type: AWS::ECS::Cluster
  Properties:
    ClusterName: monitored-cluster
    ClusterSettings:
      - Name: containerInsights
        Value: enabled

# CloudWatch Log Group used by the awslogs driver of the task definitions
ECSLogGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    LogGroupName: /ecs/my-application
    RetentionInDays: 30

# Custom CloudWatch Dashboard plotting service CPU and memory utilization.
# The ServiceName dimension must be the service *name*; ${FargateService}
# alone resolves to the service ARN, so the Name attribute is used.
ECSMonitoringDashboard:
  Type: AWS::CloudWatch::Dashboard
  Properties:
    DashboardName: ECS-Monitoring
    DashboardBody: !Sub |
      {
        "widgets": [
          {
            "type": "metric",
            "properties": {
              "metrics": [
                ["AWS/ECS", "CPUUtilization", "ServiceName", "${FargateService.Name}", "ClusterName", "${ECSCluster}"],
                ["AWS/ECS", "MemoryUtilization", "ServiceName", "${FargateService.Name}", "ClusterName", "${ECSCluster}"]
              ],
              "period": 300,
              "stat": "Average",
              "region": "${AWS::Region}",
              "title": "ECS Service Metrics"
            }
          }
        ]
      }

### 2. Health Monitoring

1
import boto3
2
import json
3
from datetime import datetime, timedelta
4

class ECSHealthMonitor:
    """Collects health information for an ECS cluster, its services and tasks."""

    # Maximum number of identifiers a single describe call accepts.
    _DESCRIBE_SERVICES_BATCH = 10
    _DESCRIBE_TASKS_BATCH = 100

    def __init__(self, cluster_name, region='us-east-1'):
        """Create ECS and CloudWatch clients for ``region``."""
        self.ecs = boto3.client('ecs', region_name=region)
        self.cloudwatch = boto3.client('cloudwatch', region_name=region)
        self.cluster_name = cluster_name

    @staticmethod
    def _chunks(items, size):
        """Yield consecutive slices of ``items`` with at most ``size`` elements."""
        for start in range(0, len(items), size):
            yield items[start:start + size]

    def get_cluster_health(self):
        """
        Get overall cluster health

        Returns:
            Dict with cluster status, task/service counters and one entry
            per service (see ``analyze_service_health``).

        Raises:
            IndexError: If the cluster cannot be described.
        """
        # Get cluster details
        cluster_response = self.ecs.describe_clusters(
            clusters=[self.cluster_name],
            include=['STATISTICS']
        )
        clusters = cluster_response.get('clusters', [])
        if not clusters:
            raise IndexError(f"ECS cluster '{self.cluster_name}' not found")
        cluster = clusters[0]

        # Get services. list_services is paginated, so a single call would
        # silently miss services beyond the first page.
        service_arns = []
        paginator = self.ecs.get_paginator('list_services')
        for page in paginator.paginate(cluster=self.cluster_name):
            service_arns.extend(page['serviceArns'])

        health_status = {
            'cluster_name': self.cluster_name,
            'cluster_status': cluster['status'],
            'active_services_count': cluster['activeServicesCount'],
            'running_tasks_count': cluster['runningTasksCount'],
            'pending_tasks_count': cluster['pendingTasksCount'],
            'services': []
        }

        # Check individual services in batches that describe_services accepts
        for batch in self._chunks(service_arns, self._DESCRIBE_SERVICES_BATCH):
            services_detail = self.ecs.describe_services(
                cluster=self.cluster_name,
                services=batch
            )
            for service in services_detail['services']:
                health_status['services'].append(self.analyze_service_health(service))

        return health_status

    def analyze_service_health(self, service):
        """
        Analyze individual service health

        Args:
            service: One service description from ``describe_services``.

        Returns:
            Dict with counts, ``health_percentage``, ``status``
            (HEALTHY / DEGRADED / UNHEALTHY), ``deployment_status``
            (STABLE / DEPLOYING / FAILED) and the task definition name.
        """
        service_name = service['serviceName']
        desired_count = service['desiredCount']
        running_count = service['runningCount']
        pending_count = service['pendingCount']

        # Calculate health percentage. A service deliberately scaled to zero
        # is meeting its target, so it counts as fully healthy rather than 0%.
        if desired_count > 0:
            health_percentage = running_count / desired_count * 100
        else:
            health_percentage = 100.0

        # Determine health status
        if health_percentage >= 100:
            status = 'HEALTHY'
        elif health_percentage >= 80:
            status = 'DEGRADED'
        else:
            status = 'UNHEALTHY'

        # Check for deployment issues
        deployments = service.get('deployments', [])
        active_deployment = next((d for d in deployments if d['status'] == 'PRIMARY'), None)

        deployment_status = 'STABLE'
        if active_deployment:
            # rolloutState may be absent from a deployment; treat that as stable.
            rollout_state = active_deployment.get('rolloutState')
            if rollout_state == 'IN_PROGRESS':
                deployment_status = 'DEPLOYING'
            elif rollout_state == 'FAILED':
                deployment_status = 'FAILED'

        return {
            'service_name': service_name,
            'desired_count': desired_count,
            'running_count': running_count,
            'pending_count': pending_count,
            'health_percentage': health_percentage,
            'status': status,
            'deployment_status': deployment_status,
            'task_definition': service['taskDefinition'].split('/')[-1]
        }

    def get_service_metrics(self, service_name, hours=24):
        """
        Get CloudWatch metrics for service

        Args:
            service_name: Value of the ServiceName dimension.
            hours: Size of the look-back window.

        Returns:
            Dict keyed by lower-cased metric name with ``average``,
            ``maximum`` and ``datapoints``; metrics without data are omitted.
        """
        from datetime import timezone  # local import: not in the file's import list

        # Timezone-aware replacement for the deprecated datetime.utcnow()
        end_time = datetime.now(timezone.utc)
        start_time = end_time - timedelta(hours=hours)

        metrics = ['CPUUtilization', 'MemoryUtilization']
        service_metrics = {}

        for metric in metrics:
            response = self.cloudwatch.get_metric_statistics(
                Namespace='AWS/ECS',
                MetricName=metric,
                Dimensions=[
                    {'Name': 'ServiceName', 'Value': service_name},
                    {'Name': 'ClusterName', 'Value': self.cluster_name}
                ],
                StartTime=start_time,
                EndTime=end_time,
                Period=3600,  # 1 hour intervals
                Statistics=['Average', 'Maximum']
            )

            datapoints = response['Datapoints']
            if datapoints:
                avg_value = sum(point['Average'] for point in datapoints) / len(datapoints)
                max_value = max(point['Maximum'] for point in datapoints)

                service_metrics[metric.lower()] = {
                    'average': round(avg_value, 2),
                    'maximum': round(max_value, 2),
                    'datapoints': len(datapoints)
                }

        return service_metrics

    def check_task_health(self, service_name):
        """
        Check health of individual tasks

        Returns:
            List with one dict per task of the service (empty when the
            service has no tasks).
        """
        # Get tasks for service (paginated)
        task_arns = []
        paginator = self.ecs.get_paginator('list_tasks')
        for page in paginator.paginate(cluster=self.cluster_name, serviceName=service_name):
            task_arns.extend(page['taskArns'])

        if not task_arns:
            return []

        task_health = []
        # Get task details in batches that describe_tasks accepts
        for batch in self._chunks(task_arns, self._DESCRIBE_TASKS_BATCH):
            tasks_detail = self.ecs.describe_tasks(
                cluster=self.cluster_name,
                tasks=batch
            )

            for task in tasks_detail['tasks']:
                containers = task.get('containers', [])

                healthy_containers = sum(1 for c in containers if c.get('healthStatus') == 'HEALTHY')
                total_containers = len(containers)

                task_health.append({
                    'task_arn': task['taskArn'].split('/')[-1],
                    'last_status': task['lastStatus'],
                    'desired_status': task['desiredStatus'],
                    'health_status': task.get('healthStatus', 'UNKNOWN'),
                    'healthy_containers': healthy_containers,
                    'total_containers': total_containers,
                    'cpu_utilization': None,  # Would need additional API calls
                    'memory_utilization': None
                })

        return task_health


# Usage example (guarded so importing this module makes no AWS calls)
if __name__ == '__main__':
    monitor = ECSHealthMonitor('my-cluster')
    cluster_health = monitor.get_cluster_health()
    print(json.dumps(cluster_health, indent=2))

    service_metrics = monitor.get_service_metrics('my-service')
    print(json.dumps(service_metrics, indent=2))

### 3. Alerting

# CloudWatch Alarms for ECS monitoring
# Fires when average service CPU stays above 80% for two 5-minute periods.
ServiceCPUAlarm:
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: ECS-High-CPU-Utilization
    AlarmDescription: ECS service CPU utilization is too high
    MetricName: CPUUtilization
    Namespace: AWS/ECS
    Statistic: Average
    Period: 300
    EvaluationPeriods: 2
    Threshold: 80
    ComparisonOperator: GreaterThanThreshold
    Dimensions:
      # The dimension value must be the service name; !Ref on an ECS
      # service returns its ARN, which would never match a metric.
      - Name: ServiceName
        Value: !GetAtt FargateService.Name
      - Name: ClusterName
        Value: !Ref ECSCluster
    AlarmActions:
      - !Ref SNSAlarmTopic

# Fires when fewer than 2 tasks are running for two 5-minute periods.
TaskCountAlarm:
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmName: ECS-Low-Running-Tasks
    AlarmDescription: ECS service has fewer running tasks than desired
    MetricName: RunningTaskCount
    # RunningTaskCount is a Container Insights metric; it is not published
    # in the AWS/ECS namespace. Container Insights is enabled on ECSCluster.
    Namespace: ECS/ContainerInsights
    Statistic: Average
    Period: 300
    EvaluationPeriods: 2
    Threshold: 2
    ComparisonOperator: LessThanThreshold
    # No datapoints means no tasks are reporting: treat it as a breach.
    TreatMissingData: breaching
    Dimensions:
      - Name: ServiceName
        Value: !GetAtt FargateService.Name
      - Name: ClusterName
        Value: !Ref ECSCluster
    AlarmActions:
      - !Ref SNSAlarmTopic

## Security Best Practices

### 1. IAM Roles and Policies

# ECS Task Execution Role: used by the ECS agent to pull images, write
# logs and fetch the secrets referenced by task definitions.
ECSExecutionRole:
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: ecs-tasks.amazonaws.com
          Action: sts:AssumeRole
    ManagedPolicyArns:
      - arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy
    Policies:
      - PolicyName: ECRAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - ecr:GetAuthorizationToken
                - ecr:BatchCheckLayerAvailability
                - ecr:GetDownloadUrlForLayer
                - ecr:BatchGetImage
              Resource: '*'
      - PolicyName: SecretsManagerAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - secretsmanager:GetSecretValue
              Resource: !Ref DatabaseSecret

# ECS Task Role (for application access)
ECSTaskRole:
  Type: AWS::IAM::Role
  Properties:
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: ecs-tasks.amazonaws.com
          Action: sts:AssumeRole
    Policies:
      - PolicyName: ApplicationAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - s3:GetObject
                - s3:PutObject
              # IAM resources must be ARNs. ${ApplicationBucket} resolves to
              # the bucket name, so the S3 ARN prefix is added explicitly.
              Resource: !Sub 'arn:${AWS::Partition}:s3:::${ApplicationBucket}/*'
            - Effect: Allow
              Action:
                - dynamodb:GetItem
                - dynamodb:PutItem
                - dynamodb:UpdateItem
                - dynamodb:DeleteItem
              Resource: !GetAtt ApplicationTable.Arn
      # FargateService sets EnableExecuteCommand: true; ECS Exec needs these
      # SSM Messages permissions on the task role.
      - PolicyName: ECSExecAccess
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Action:
                - ssmmessages:CreateControlChannel
                - ssmmessages:CreateDataChannel
                - ssmmessages:OpenControlChannel
                - ssmmessages:OpenDataChannel
              Resource: '*'

### 2. Network Security

# Security group attached to the ECS tasks: inbound only from the load
# balancer's security group, outbound only on HTTP/HTTPS.
ECSSecurityGroup:
  Type: AWS::EC2::SecurityGroup
  Properties:
    VpcId: !Ref VPC
    GroupDescription: Security group for ECS tasks
    SecurityGroupIngress:
      - Description: HTTP from ALB
        IpProtocol: tcp
        FromPort: 80
        ToPort: 80
        SourceSecurityGroupId: !Ref ALBSecurityGroup
      - Description: HTTPS from ALB
        IpProtocol: tcp
        FromPort: 443
        ToPort: 443
        SourceSecurityGroupId: !Ref ALBSecurityGroup
    SecurityGroupEgress:
      - Description: HTTPS outbound
        IpProtocol: tcp
        FromPort: 443
        ToPort: 443
        CidrIp: 0.0.0.0/0
      - Description: HTTP outbound
        IpProtocol: tcp
        FromPort: 80
        ToPort: 80
        CidrIp: 0.0.0.0/0

# Security group attached to the load balancer: HTTP and HTTPS from anywhere.
ALBSecurityGroup:
  Type: AWS::EC2::SecurityGroup
  Properties:
    VpcId: !Ref VPC
    GroupDescription: Security group for Application Load Balancer
    SecurityGroupIngress:
      - Description: HTTP from internet
        IpProtocol: tcp
        FromPort: 80
        ToPort: 80
        CidrIp: 0.0.0.0/0
      - Description: HTTPS from internet
        IpProtocol: tcp
        FromPort: 443
        ToPort: 443
        CidrIp: 0.0.0.0/0

### 3. Secrets Management

# Secrets Manager secret holding a JSON document with a fixed "username"
# and a generated 16-character "password".
DatabaseSecret:
  Type: AWS::SecretsManager::Secret
  Properties:
    Description: Database password for application
    GenerateSecretString:
      SecretStringTemplate: '{"username": "admin"}'
      GenerateStringKey: password
      PasswordLength: 16
      ExcludeCharacters: '"@/\'

# Task definition using secrets
SecureTaskDefinition:
  Type: AWS::ECS::TaskDefinition
  Properties:
    # The execution role is what fetches secrets at container start;
    # ECSExecutionRole grants secretsmanager:GetSecretValue on this secret.
    ExecutionRoleArn: !GetAtt ECSExecutionRole.Arn
    ContainerDefinitions:
      - Name: app
        Image: my-app:latest
        Secrets:
          # ":password::" selects only the "password" key of the JSON secret;
          # a bare !Ref would inject the whole JSON document.
          - Name: DB_PASSWORD
            ValueFrom: !Sub '${DatabaseSecret}:password::'
        Environment:
          - Name: DB_HOST
            Value: !GetAtt Database.Endpoint.Address

## Cost Optimization

### 1. Spot Instances and Fargate Spot

# Mixed capacity provider strategy: the first task always runs on regular
# Fargate (Base: 1); tasks beyond the base are split 1:4 by weight.
MixedCapacityService:
  Type: AWS::ECS::Service
  Properties:
    CapacityProviderStrategy:
      - CapacityProvider: FARGATE
        Base: 1
        Weight: 1
      - CapacityProvider: FARGATE_SPOT
        Weight: 4  # 4 of every 5 tasks beyond the base (80%) on Fargate Spot
    # Other service properties...

1
def optimize_ecs_costs(cluster_name, service_name):
    """
    Analyze an ECS service and suggest cost optimizations.

    Two things are inspected: the share of capacity-provider weight that is
    assigned to Spot providers, and the task-level CPU/memory of the
    service's task definition.

    Args:
        cluster_name: Cluster passed to describe_services.
        service_name: Service to analyze.

    Returns:
        dict with the keys
            'spot_percentage'   - percentage of strategy weight on Spot
                                  providers (0 when there is no weighted
                                  strategy),
            'cpu_allocation'    - task-level 'cpu' as int (0 if absent),
            'memory_allocation' - task-level 'memory' as int (0 if absent),
            'recommendations'   - list of human-readable suggestions.

    Raises:
        IndexError: if describe_services returns no matching service.
    """
    ecs = boto3.client('ecs')

    # Get service details
    matches = ecs.describe_services(
        cluster=cluster_name,
        services=[service_name]
    )['services']
    if not matches:
        # Same exception type the bare [0] lookup used to raise, but with a
        # message that says what is actually wrong.
        raise IndexError(
            f"Service {service_name!r} not found in cluster {cluster_name!r}"
        )
    service = matches[0]

    recommendations = []

    # Share of strategy weight placed on Spot capacity. The provider name is
    # matched case-insensitively so custom providers such as "ec2-spot" are
    # counted alongside FARGATE_SPOT.
    capacity_strategy = service.get('capacityProviderStrategy', [])
    total_weight = sum(cp.get('weight', 0) for cp in capacity_strategy)
    spot_weight = sum(
        cp.get('weight', 0)
        for cp in capacity_strategy
        if 'spot' in cp['capacityProvider'].lower()
    )

    spot_percentage = 0
    if total_weight > 0:
        spot_percentage = (spot_weight / total_weight) * 100
        if spot_percentage < 70:
            recommendations.append(f"Consider increasing Spot usage (currently {spot_percentage:.1f}%)")
    else:
        # No weighted strategy at all (for example a plain launch type)
        recommendations.append("Consider using Fargate Spot for cost savings")

    # Check task definition resource allocation
    task_def_arn = service['taskDefinition']
    task_def = ecs.describe_task_definition(taskDefinition=task_def_arn)['taskDefinition']

    # Task-level values come back as strings and may be missing entirely
    cpu = int(task_def.get('cpu') or 0)
    memory = int(task_def.get('memory') or 0)

    # Basic right-sizing recommendations
    if cpu >= 2048:  # 2 vCPUs
        recommendations.append("High CPU allocation - monitor utilization for right-sizing")

    if memory >= 4096:  # 4 GB
        recommendations.append("High memory allocation - monitor utilization for right-sizing")

    return {
        'spot_percentage': spot_percentage,
        'cpu_allocation': cpu,
        'memory_allocation': memory,
        'recommendations': recommendations
    }

2. Resource Right-Sizing#

1
def analyze_resource_utilization(cluster_name, service_name, days=7):
    """
    Analyze resource utilization for right-sizing recommendations.

    Fetches hourly Average/Maximum datapoints for the service's
    CPUUtilization and MemoryUtilization metrics (namespace AWS/ECS) over
    the last `days` days and compares them with fixed thresholds.

    Args:
        cluster_name: Value for the ClusterName metric dimension.
        service_name: Value for the ServiceName metric dimension.
        days: Length of the look-back window in days.
            NOTE(review): the period is fixed at one hour, so very long
            windows may exceed CloudWatch's per-request datapoint limit -
            confirm before using large values.

    Returns:
        dict with the keys
            'utilization_data' - per metric (lower-cased metric name) the
                                 'average' of the hourly averages and the
                                 overall 'maximum'; metrics without
                                 datapoints are omitted,
            'recommendations'  - list of human-readable suggestions.
    """
    cloudwatch = boto3.client('cloudwatch')
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(days=days)

    utilization_data = {}

    for metric in ('CPUUtilization', 'MemoryUtilization'):
        response = cloudwatch.get_metric_statistics(
            Namespace='AWS/ECS',
            MetricName=metric,
            Dimensions=[
                {'Name': 'ServiceName', 'Value': service_name},
                {'Name': 'ClusterName', 'Value': cluster_name}
            ],
            StartTime=start_time,
            EndTime=end_time,
            Period=3600,
            Statistics=['Average', 'Maximum']
        )

        datapoints = response['Datapoints']
        if datapoints:
            utilization_data[metric.lower()] = {
                'average': sum(p['Average'] for p in datapoints) / len(datapoints),
                'maximum': max(p['Maximum'] for p in datapoints)
            }

    # (result key, label, downsize if average below, and maximum below,
    #  upsize if average above)
    thresholds = (
        ('cpuutilization', 'CPU', 20, 50, 70),
        ('memoryutilization', 'Memory', 30, 60, 80),
    )

    # Generate right-sizing recommendations
    recommendations = []
    for key, label, low_avg, low_max, high_avg in thresholds:
        stats = utilization_data.get(key)
        if stats is None:
            # No datapoints for this metric: missing data must not be read
            # as 0% utilization, which would produce a bogus downsizing hint.
            continue

        if stats['average'] < low_avg and stats['maximum'] < low_max:
            recommendations.append(f"{label}: Consider downsizing - low utilization detected")
        elif stats['average'] > high_avg:
            recommendations.append(f"{label}: Consider upsizing - high utilization detected")

    return {
        'utilization_data': utilization_data,
        'recommendations': recommendations
    }

Troubleshooting#

1. Common Issues and Solutions#

1
def diagnose_ecs_issues(cluster_name, service_name):
    """
    Diagnose common ECS issues for one service.

    Checks the service status, compares running and desired task counts
    (inspecting recently stopped tasks when the service is short of tasks)
    and looks for a failed primary deployment.

    Args:
        cluster_name: Cluster that hosts the service.
        service_name: Service to diagnose.

    Returns:
        dict with the keys
            'issues'         - list of detected problems,
            'solutions'      - list of suggested fixes,
            'service_health' - desired/running/pending task counts.

    Raises:
        IndexError: if describe_services returns no matching service.
    """
    ecs = boto3.client('ecs')
    issues = []
    solutions = []

    # Get service details
    matches = ecs.describe_services(
        cluster=cluster_name,
        services=[service_name]
    )['services']
    if not matches:
        # Same exception type the bare [0] lookup used to raise, but with a
        # message that says what is actually wrong.
        raise IndexError(
            f"Service {service_name!r} not found in cluster {cluster_name!r}"
        )
    service = matches[0]

    # Check service status
    if service['status'] != 'ACTIVE':
        issues.append(f"Service status is {service['status']}")
        solutions.append("Check service events for error details")

    # Check task health
    running_count = service['runningCount']
    desired_count = service['desiredCount']

    if running_count < desired_count:
        issues.append(f"Running tasks ({running_count}) less than desired ({desired_count})")

        # list_tasks only returns RUNNING tasks unless a desired status is
        # given, so stopped tasks have to be requested explicitly.
        tasks = ecs.list_tasks(
            cluster=cluster_name,
            serviceName=service_name,
            desiredStatus='STOPPED'
        )

        if tasks['taskArns']:
            task_details = ecs.describe_tasks(
                cluster=cluster_name,
                tasks=tasks['taskArns'][:5]  # Inspect at most 5 of the listed tasks
            )

            for task in task_details['tasks']:
                if task['lastStatus'] == 'STOPPED':
                    stop_reason = task.get('stoppedReason', 'Unknown')
                    issues.append(f"Task stopped: {stop_reason}")

                    # Errors such as OutOfMemoryError are reported on the
                    # individual containers, so their reasons are searched
                    # together with the task-level reason. Matching is
                    # case-insensitive because health-check failures are
                    # worded "... health checks".
                    reasons = [stop_reason]
                    reasons.extend(
                        container.get('reason') or ''
                        for container in task.get('containers', [])
                    )
                    reason_text = ' '.join(reasons).lower()

                    if 'outofmemory' in reason_text:
                        solutions.append("Increase memory allocation in task definition")
                    elif 'cannotpullcontainer' in reason_text:
                        solutions.append("Check ECR permissions and image availability")
                    elif 'healthcheck' in reason_text or 'health check' in reason_text:
                        solutions.append("Review container health check configuration")

    # Check deployment status. rolloutState is read with .get because a
    # deployment entry is not guaranteed to carry it.
    for deployment in service.get('deployments', []):
        if deployment.get('status') == 'PRIMARY' and deployment.get('rolloutState') == 'FAILED':
            issues.append("Deployment failed")
            solutions.append("Check deployment events and task definition")

    return {
        'issues': issues,
        'solutions': solutions,
        'service_health': {
            'desired_count': desired_count,
            'running_count': running_count,
            'pending_count': service['pendingCount']
        }
    }
67

68
def get_ecs_events(cluster_name, service_name=None, max_events=20):
    """
    Return recent ECS service events for troubleshooting.

    Args:
        cluster_name: Cluster that hosts the service.
        service_name: Service whose events are wanted. When it is falsy an
            empty list is returned, because cluster-level events are not
            implemented.
        max_events: Upper bound on the number of events returned.

    Returns:
        List of {'timestamp': ISO-8601 string, 'message': str} dicts, in
        the order the service reports them.
    """
    client = boto3.client('ecs')

    # Cluster-level events would need additional implementation
    if not service_name:
        return []

    # Get service events
    response = client.describe_services(
        cluster=cluster_name,
        services=[service_name]
    )
    described = response['services'][0]

    formatted = []
    for entry in described.get('events', [])[:max_events]:
        formatted.append({
            'timestamp': entry['createdAt'].isoformat(),
            'message': entry['message']
        })
    return formatted