The Complete Guide to Amazon CloudWatch: Comprehensive Monitoring and Observability for AWS
Amazon CloudWatch is AWS’s comprehensive monitoring and observability service that provides data and actionable insights to monitor applications, respond to system-wide performance changes, and optimize resource utilization. This guide covers everything from basic metrics collection to advanced monitoring strategies.
Table of Contents
- [Introduction to CloudWatch](#introduction)
- [Core Components](#core-components)
- [CloudWatch Metrics](#cloudwatch-metrics)
- [CloudWatch Logs](#cloudwatch-logs)
- [CloudWatch Alarms](#cloudwatch-alarms)
- [CloudWatch Dashboards](#cloudwatch-dashboards)
- [CloudWatch Events/EventBridge](#cloudwatch-events)
- [Advanced Features](#advanced-features)
- [Best Practices](#best-practices)
- [Cost Optimization](#cost-optimization)
- [Security Considerations](#security)
- [Troubleshooting](#troubleshooting)
Introduction to CloudWatch {#introduction}
Amazon CloudWatch is a monitoring service for AWS cloud resources and applications. It provides real-time monitoring, custom metrics, log aggregation, and automated actions based on defined thresholds.
Key Benefits
- Unified Monitoring: Single platform for metrics, logs, and events
- Real-time Insights: Near real-time data collection and visualization
- Automated Actions: Trigger actions based on metric thresholds
- Cost-effective: Pay only for what you use
- Integration: Native integration with most AWS services, which publish metrics and logs to CloudWatch automatically
Core Components {#core-components}
1. Metrics
Quantitative data points collected over time intervals.
2. Logs
Text-based log data from applications and AWS services.
3. Alarms
Notifications and automated actions based on metric thresholds.
4. Dashboards
Customizable visualization of metrics and logs.
5. Events
System events from AWS services and custom applications.
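All of these components are exposed through the AWS SDKs and CLI. As a minimal sketch for the Python examples that follow, the boto3 clients used throughout this guide can be created like this (the variable names `cloudwatch`, `logs_client`, and `events_client` are simply the conventions this guide uses):

```python
import boto3

# Clients used throughout this guide; region and credentials come from your AWS configuration
cloudwatch = boto3.client('cloudwatch')   # metrics, alarms, dashboards, anomaly detectors
logs_client = boto3.client('logs')        # log groups, log streams, Logs Insights queries
events_client = boto3.client('events')    # CloudWatch Events / EventBridge rules and custom events
```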
CloudWatch Metrics {#cloudwatch-metrics}
Basic Metrics Collection
```python
import boto3
import time
from datetime import datetime, timedelta

# Initialize CloudWatch client
cloudwatch = boto3.client('cloudwatch')

def put_custom_metric(metric_name, value, unit='Count', namespace='MyApp'):
    """Send a custom metric to CloudWatch."""
    try:
        response = cloudwatch.put_metric_data(
            Namespace=namespace,
            MetricData=[
                {
                    'MetricName': metric_name,
                    'Value': value,
                    'Unit': unit,
                    'Timestamp': datetime.utcnow()
                }
            ]
        )
        print(f"Custom metric {metric_name} sent successfully")
        return response
    except Exception as e:
        print(f"Error sending metric: {e}")

# Example usage
put_custom_metric('UserLogins', 25, 'Count', 'WebApp')
put_custom_metric('ResponseTime', 120.5, 'Milliseconds', 'WebApp')
```
Advanced Metrics with Dimensions
```python
def put_metric_with_dimensions(metric_name, value, dimensions, namespace='MyApp'):
    """Send a metric with dimensions for better filtering and aggregation."""
    try:
        response = cloudwatch.put_metric_data(
            Namespace=namespace,
            MetricData=[
                {
                    'MetricName': metric_name,
                    'Value': value,
                    'Unit': 'Count',
                    'Dimensions': dimensions,
                    'Timestamp': datetime.utcnow()
                }
            ]
        )
        return response
    except Exception as e:
        print(f"Error sending metric with dimensions: {e}")

# Example with dimensions
dimensions = [
    {'Name': 'Environment', 'Value': 'Production'},
    {'Name': 'Region', 'Value': 'us-east-1'},
    {'Name': 'Service', 'Value': 'UserService'}
]

put_metric_with_dimensions('APIRequests', 100, dimensions)
```
Retrieving Metrics
```python
def get_metric_statistics(metric_name, namespace, start_time, end_time, period=300):
    """Retrieve metric statistics from CloudWatch."""
    try:
        response = cloudwatch.get_metric_statistics(
            Namespace=namespace,
            MetricName=metric_name,
            StartTime=start_time,
            EndTime=end_time,
            Period=period,
            Statistics=['Average', 'Maximum', 'Minimum', 'Sum', 'SampleCount']
        )

        datapoints = sorted(response['Datapoints'], key=lambda x: x['Timestamp'])

        for point in datapoints:
            print(f"Time: {point['Timestamp']}, Average: {point['Average']}")

        return datapoints
    except Exception as e:
        print(f"Error retrieving metrics: {e}")

# Get metrics for the last hour
end_time = datetime.utcnow()
start_time = end_time - timedelta(hours=1)

get_metric_statistics('CPUUtilization', 'AWS/EC2', start_time, end_time)
```
CloudWatch Logs {#cloudwatch-logs}
Log Groups and Streams Management
```python
import boto3
import json
from datetime import datetime

logs_client = boto3.client('logs')

def create_log_group(log_group_name):
    """Create a CloudWatch log group."""
    try:
        response = logs_client.create_log_group(
            logGroupName=log_group_name,
            tags={
                'Environment': 'Production',
                'Application': 'MyApp'
            }
        )
        print(f"Log group {log_group_name} created successfully")
        return response
    except logs_client.exceptions.ResourceAlreadyExistsException:
        print(f"Log group {log_group_name} already exists")
    except Exception as e:
        print(f"Error creating log group: {e}")

def create_log_stream(log_group_name, log_stream_name):
    """Create a log stream within a log group."""
    try:
        response = logs_client.create_log_stream(
            logGroupName=log_group_name,
            logStreamName=log_stream_name
        )
        print(f"Log stream {log_stream_name} created successfully")
        return response
    except Exception as e:
        print(f"Error creating log stream: {e}")

# Create log infrastructure
create_log_group('/aws/myapp/production')
create_log_stream('/aws/myapp/production', 'web-server-001')
```
Sending Logs
```python
def send_log_events(log_group_name, log_stream_name, log_messages):
    """Send log events to CloudWatch Logs."""
    try:
        # Look up the sequence token if the stream already exists.
        # Newer versions of the API no longer require sequence tokens,
        # but passing one is still accepted.
        try:
            response = logs_client.describe_log_streams(
                logGroupName=log_group_name,
                logStreamNamePrefix=log_stream_name
            )
            sequence_token = None
            if response['logStreams']:
                sequence_token = response['logStreams'][0].get('uploadSequenceToken')
        except Exception:
            sequence_token = None

        # Prepare log events
        log_events = []
        for message in log_messages:
            log_events.append({
                'timestamp': int(datetime.utcnow().timestamp() * 1000),
                'message': json.dumps(message) if isinstance(message, dict) else str(message)
            })

        # Send logs
        kwargs = {
            'logGroupName': log_group_name,
            'logStreamName': log_stream_name,
            'logEvents': log_events
        }

        if sequence_token:
            kwargs['sequenceToken'] = sequence_token

        response = logs_client.put_log_events(**kwargs)
        print("Log events sent successfully")
        return response

    except Exception as e:
        print(f"Error sending log events: {e}")

# Send structured logs
log_messages = [
    {
        'level': 'INFO',
        'message': 'User login successful',
        'user_id': 'user123',
        'ip_address': '192.168.1.100',
        'timestamp': datetime.utcnow().isoformat()
    },
    {
        'level': 'ERROR',
        'message': 'Database connection failed',
        'error_code': 'DB_CONN_001',
        'retry_count': 3
    }
]

send_log_events('/aws/myapp/production', 'web-server-001', log_messages)
```
Log Queries with CloudWatch Insights
```python
def run_log_insights_query(log_group_name, query_string, start_time, end_time):
    """Run a CloudWatch Logs Insights query and wait for the results."""
    try:
        # Start query
        response = logs_client.start_query(
            logGroupName=log_group_name,
            startTime=int(start_time.timestamp()),
            endTime=int(end_time.timestamp()),
            queryString=query_string
        )

        query_id = response['queryId']
        print(f"Query started with ID: {query_id}")

        # Poll for results
        import time
        while True:
            result = logs_client.get_query_results(queryId=query_id)

            if result['status'] == 'Complete':
                print("Query completed successfully")
                for record in result['results']:
                    print({field['field']: field['value'] for field in record})
                return result['results']
            elif result['status'] == 'Failed':
                print("Query failed")
                break
            else:
                print(f"Query status: {result['status']}")
                time.sleep(2)

    except Exception as e:
        print(f"Error running insights query: {e}")

# Example queries
end_time = datetime.utcnow()
start_time = end_time - timedelta(hours=24)

# Query for errors
error_query = """
fields @timestamp, @message
| filter @message like /ERROR/
| sort @timestamp desc
| limit 20
"""

run_log_insights_query('/aws/myapp/production', error_query, start_time, end_time)

# Query for average response time (assumes JSON logs that expose a response_time field)
performance_query = """
fields @timestamp, response_time
| filter ispresent(response_time)
| stats avg(response_time) by bin(5m)
"""

run_log_insights_query('/aws/myapp/production', performance_query, start_time, end_time)
```
CloudWatch Alarms {#cloudwatch-alarms}
Creating Metric Alarms
```python
def create_metric_alarm(alarm_name, metric_name, namespace, threshold,
                        comparison_operator='GreaterThanThreshold'):
    """Create a CloudWatch alarm for a specific metric."""
    try:
        response = cloudwatch.put_metric_alarm(
            AlarmName=alarm_name,
            ComparisonOperator=comparison_operator,
            EvaluationPeriods=2,
            MetricName=metric_name,
            Namespace=namespace,
            Period=300,
            Statistic='Average',
            Threshold=threshold,
            ActionsEnabled=True,
            AlarmActions=[
                'arn:aws:sns:us-east-1:123456789012:my-alarm-topic'
            ],
            AlarmDescription=f'Alarm for {metric_name}',
            Unit='Percent' if 'Utilization' in metric_name else 'Count'
        )

        print(f"Alarm {alarm_name} created successfully")
        return response

    except Exception as e:
        print(f"Error creating alarm: {e}")

# Create CPU utilization alarm
create_metric_alarm(
    'HighCPUUtilization',
    'CPUUtilization',
    'AWS/EC2',
    80.0,
    'GreaterThanThreshold'
)

# Create custom metric alarm
create_metric_alarm(
    'HighErrorRate',
    'ErrorCount',
    'MyApp',
    10.0,
    'GreaterThanThreshold'
)
```
Composite Alarms
```python
def create_composite_alarm(alarm_name, alarm_rule):
    """Create a composite alarm based on multiple conditions."""
    try:
        response = cloudwatch.put_composite_alarm(
            AlarmName=alarm_name,
            AlarmRule=alarm_rule,
            ActionsEnabled=True,
            AlarmActions=[
                'arn:aws:sns:us-east-1:123456789012:critical-alerts'
            ],
            AlarmDescription='Composite alarm for critical system health'
        )

        print(f"Composite alarm {alarm_name} created successfully")
        return response

    except Exception as e:
        print(f"Error creating composite alarm: {e}")

# Create composite alarm
alarm_rule = """
(ALARM("HighCPUUtilization") OR ALARM("HighMemoryUtilization"))
AND ALARM("HighErrorRate")
"""

create_composite_alarm('CriticalSystemHealth', alarm_rule)
```
Anomaly Detection
```python
def create_anomaly_detector(metric_name, namespace, dimensions=None):
    """Create an anomaly detector for a metric."""
    try:
        detector_config = {
            'Namespace': namespace,
            'MetricName': metric_name,
            'Stat': 'Average'
        }

        if dimensions:
            detector_config['Dimensions'] = dimensions

        response = cloudwatch.put_anomaly_detector(**detector_config)

        print(f"Anomaly detector created for {metric_name}")
        return response

    except Exception as e:
        print(f"Error creating anomaly detector: {e}")

def create_anomaly_alarm(alarm_name, metric_name, namespace, dimensions=None):
    """Create an alarm based on anomaly detection."""
    try:
        metric_config = {
            'Id': 'm1',
            'MetricStat': {
                'Metric': {
                    'Namespace': namespace,
                    'MetricName': metric_name
                },
                'Period': 300,
                'Stat': 'Average'
            },
            'ReturnData': True
        }

        if dimensions:
            metric_config['MetricStat']['Metric']['Dimensions'] = dimensions

        response = cloudwatch.put_metric_alarm(
            AlarmName=alarm_name,
            ComparisonOperator='LessThanLowerOrGreaterThanUpperThreshold',
            EvaluationPeriods=2,
            Metrics=[
                metric_config,
                {
                    # The anomaly detection band is expressed as a metric math
                    # expression; the alarm compares m1 against this band.
                    'Id': 'ad1',
                    'Expression': 'ANOMALY_DETECTION_BAND(m1, 2)',
                    'Label': f'{metric_name} (expected)',
                    'ReturnData': True
                }
            ],
            ThresholdMetricId='ad1',
            ActionsEnabled=True,
            AlarmActions=[
                'arn:aws:sns:us-east-1:123456789012:anomaly-alerts'
            ],
            AlarmDescription=f'Anomaly detection alarm for {metric_name}'
        )

        print(f"Anomaly alarm {alarm_name} created successfully")
        return response

    except Exception as e:
        print(f"Error creating anomaly alarm: {e}")

# Create anomaly detection
create_anomaly_detector('ResponseTime', 'MyApp')
create_anomaly_alarm('ResponseTimeAnomaly', 'ResponseTime', 'MyApp')
```
CloudWatch Dashboards {#cloudwatch-dashboards}
Creating Custom Dashboards
```python
import json

def create_dashboard(dashboard_name, dashboard_body):
    """Create a CloudWatch dashboard."""
    try:
        response = cloudwatch.put_dashboard(
            DashboardName=dashboard_name,
            DashboardBody=json.dumps(dashboard_body)
        )

        print(f"Dashboard {dashboard_name} created successfully")
        return response

    except Exception as e:
        print(f"Error creating dashboard: {e}")

# Define dashboard configuration
dashboard_config = {
    "widgets": [
        {
            "type": "metric",
            "x": 0,
            "y": 0,
            "width": 12,
            "height": 6,
            "properties": {
                "metrics": [
                    ["AWS/EC2", "CPUUtilization", "InstanceId", "i-1234567890abcdef0"],
                    ["AWS/EC2", "NetworkIn", "InstanceId", "i-1234567890abcdef0"],
                    ["AWS/EC2", "NetworkOut", "InstanceId", "i-1234567890abcdef0"]
                ],
                "period": 300,
                "stat": "Average",
                "region": "us-east-1",
                "title": "EC2 Instance Metrics",
                "yAxis": {
                    "left": {
                        "min": 0,
                        "max": 100
                    }
                }
            }
        },
        {
            "type": "log",
            "x": 0,
            "y": 6,
            "width": 24,
            "height": 6,
            "properties": {
                "query": "SOURCE '/aws/lambda/my-function'\n| fields @timestamp, @message\n| sort @timestamp desc\n| limit 20",
                "region": "us-east-1",
                "title": "Recent Lambda Logs",
                "view": "table"
            }
        },
        {
            "type": "metric",
            "x": 12,
            "y": 0,
            "width": 12,
            "height": 6,
            "properties": {
                "metrics": [
                    ["MyApp", "UserLogins", "Environment", "Production"],
                    ["MyApp", "ErrorCount", "Environment", "Production"],
                    ["MyApp", "ResponseTime", "Environment", "Production"]
                ],
                "period": 300,
                "stat": "Sum",
                "region": "us-east-1",
                "title": "Application Metrics"
            }
        }
    ]
}

create_dashboard('MyApplicationDashboard', dashboard_config)
```
CloudWatch Events/EventBridge {#cloudwatch-events}
Creating Event Rules
```python
import boto3
import json

events_client = boto3.client('events')

def create_event_rule(rule_name, event_pattern, targets):
    """Create a CloudWatch Events (EventBridge) rule and attach targets."""
    try:
        # Create the rule
        response = events_client.put_rule(
            Name=rule_name,
            EventPattern=json.dumps(event_pattern),
            State='ENABLED',
            Description=f'Event rule for {rule_name}'
        )

        rule_arn = response['RuleArn']
        print(f"Event rule {rule_name} created: {rule_arn}")

        # Add targets to the rule
        events_client.put_targets(
            Rule=rule_name,
            Targets=targets
        )

        print(f"Targets added to rule {rule_name}")
        return response

    except Exception as e:
        print(f"Error creating event rule: {e}")

# Create rule for EC2 instance state changes
ec2_event_pattern = {
    "source": ["aws.ec2"],
    "detail-type": ["EC2 Instance State-change Notification"],
    "detail": {
        "state": ["running", "stopped", "terminated"]
    }
}

ec2_targets = [
    {
        'Id': '1',
        'Arn': 'arn:aws:sns:us-east-1:123456789012:ec2-notifications',
        'InputTransformer': {
            'InputPathsMap': {
                'instance': '$.detail.instance-id',
                'state': '$.detail.state'
            },
            'InputTemplate': '{"instance": "<instance>", "state": "<state>"}'
        }
    }
]

create_event_rule('EC2StateChangeRule', ec2_event_pattern, ec2_targets)
```
Custom Application Events
```python
def send_custom_event(source, detail_type, detail):
    """Send a custom event to EventBridge."""
    try:
        response = events_client.put_events(
            Entries=[
                {
                    'Source': source,
                    'DetailType': detail_type,
                    'Detail': json.dumps(detail),
                    'Time': datetime.utcnow()
                }
            ]
        )

        print("Custom event sent successfully")
        return response

    except Exception as e:
        print(f"Error sending custom event: {e}")

# Send custom application event
custom_detail = {
    'user_id': 'user123',
    'action': 'purchase',
    'amount': 99.99,
    'product_id': 'prod456',
    'timestamp': datetime.utcnow().isoformat()
}

send_custom_event('myapp.orders', 'Order Completed', custom_detail)
```
Advanced Features {#advanced-features}
Cross-Account Monitoring
```python
def setup_cross_account_dashboard(dashboard_name, source_account_widgets):
    """Create a dashboard with metrics from multiple accounts."""
    dashboard_config = {
        "widgets": []
    }

    for widget in source_account_widgets:
        account_id = widget.pop('account_id')

        # In the dashboard metric syntax, rendering options such as the
        # source accountId go in a trailing options object on each metric.
        for metric in widget['properties']['metrics']:
            if len(metric) >= 2:
                metric.append({'accountId': account_id})

        dashboard_config['widgets'].append(widget)

    return create_dashboard(dashboard_name, dashboard_config)

# Example cross-account widget configuration
cross_account_widgets = [
    {
        'account_id': '123456789012',
        'type': 'metric',
        'x': 0,
        'y': 0,
        'width': 12,
        'height': 6,
        'properties': {
            'metrics': [
                ['AWS/EC2', 'CPUUtilization', 'InstanceId', 'i-1234567890abcdef0']
            ],
            'period': 300,
            'stat': 'Average',
            'region': 'us-east-1',
            'title': 'Cross-Account EC2 Metrics'
        }
    }
]

setup_cross_account_dashboard('CrossAccountDashboard', cross_account_widgets)
```
Custom Widgets with Lambda
```python
def create_custom_widget_lambda():
    """Return the source for a Lambda function backing a custom CloudWatch widget."""
    lambda_code = '''
import json
import boto3
from datetime import datetime, timedelta

def lambda_handler(event, context):
    # Extract widget parameters; widgetContext is passed as an object,
    # with the dashboard time range in epoch milliseconds
    widget_context = event.get('widgetContext', {})
    time_range = widget_context.get('timeRange', {})

    cloudwatch = boto3.client('cloudwatch')

    # Example: calculate a cost-efficiency metric over the widget time range
    now = datetime.utcnow()
    end_time = datetime.utcfromtimestamp(time_range['end'] / 1000) if 'end' in time_range else now
    start_time = datetime.utcfromtimestamp(time_range['start'] / 1000) if 'start' in time_range else now - timedelta(hours=1)

    # Get CPU utilization
    cpu_response = cloudwatch.get_metric_statistics(
        Namespace='AWS/EC2',
        MetricName='CPUUtilization',
        StartTime=start_time,
        EndTime=end_time,
        Period=3600,
        Statistics=['Average']
    )

    # Calculate efficiency score (optimal at 80% CPU)
    datapoints = cpu_response['Datapoints']
    avg_cpu = sum(point['Average'] for point in datapoints) / len(datapoints) if datapoints else 0
    efficiency_score = min(avg_cpu / 80 * 100, 100)

    # Return widget data
    return {
        'statusCode': 200,
        'body': json.dumps({
            'efficiency_score': efficiency_score,
            'timestamp': datetime.utcnow().isoformat(),
            'period': f"{start_time.isoformat()} to {end_time.isoformat()}"
        })
    }
'''

    return lambda_code

# Custom widget configuration
custom_widget_config = {
    "type": "custom",
    "x": 0,
    "y": 0,
    "width": 6,
    "height": 6,
    "properties": {
        "endpoint": "arn:aws:lambda:us-east-1:123456789012:function:custom-widget-function",
        "title": "Resource Efficiency Score",
        "updateOn": {
            "refresh": True,
            "resize": True,
            "timeRange": True
        }
    }
}
```
Best Practices {#best-practices}
Monitoring Strategy
```python
class CloudWatchMonitoringStrategy:
    def __init__(self):
        self.cloudwatch = boto3.client('cloudwatch')
        self.logs_client = boto3.client('logs')

    def implement_layered_monitoring(self):
        """Implement a layered monitoring strategy."""
        layers = {
            'infrastructure': self.setup_infrastructure_monitoring(),
            'application': self.setup_application_monitoring(),
            'business': self.setup_business_monitoring(),
            'user_experience': self.setup_ux_monitoring()
        }

        return layers

    def setup_ux_monitoring(self):
        """Placeholder for user-experience monitoring (e.g. synthetic canaries or RUM)."""
        return []

    def setup_infrastructure_monitoring(self):
        """Monitor infrastructure components."""
        infrastructure_metrics = [
            {'metric': 'CPUUtilization', 'threshold': 80, 'namespace': 'AWS/EC2'},
            {'metric': 'MemoryUtilization', 'threshold': 85, 'namespace': 'AWS/EC2'},
            {'metric': 'DiskSpaceUtilization', 'threshold': 90, 'namespace': 'AWS/EC2'},
            {'metric': 'NetworkPacketsIn', 'threshold': 10000, 'namespace': 'AWS/EC2'},
            {'metric': 'DatabaseConnections', 'threshold': 80, 'namespace': 'AWS/RDS'},
            {'metric': 'FreeStorageSpace', 'threshold': 2000000000, 'namespace': 'AWS/RDS',
             'comparison': 'LessThanThreshold'}
        ]

        for metric in infrastructure_metrics:
            alarm_name = f"Infrastructure-{metric['metric']}-Alert"
            comparison = metric.get('comparison', 'GreaterThanThreshold')

            self.cloudwatch.put_metric_alarm(
                AlarmName=alarm_name,
                ComparisonOperator=comparison,
                EvaluationPeriods=2,
                MetricName=metric['metric'],
                Namespace=metric['namespace'],
                Period=300,
                Statistic='Average',
                Threshold=metric['threshold'],
                ActionsEnabled=True,
                AlarmActions=[
                    'arn:aws:sns:us-east-1:123456789012:infrastructure-alerts'
                ],
                AlarmDescription=f'Infrastructure monitoring for {metric["metric"]}'
            )

        return infrastructure_metrics

    def setup_application_monitoring(self):
        """Monitor application-level metrics."""
        app_metrics = [
            {'metric': 'ResponseTime', 'threshold': 1000, 'unit': 'Milliseconds'},
            {'metric': 'ErrorRate', 'threshold': 5, 'unit': 'Percent'},
            {'metric': 'ThroughputTPS', 'threshold': 100, 'unit': 'Count/Second',
             'comparison': 'LessThanThreshold'},
            {'metric': 'MemoryLeaks', 'threshold': 1, 'unit': 'Count'},
            {'metric': 'FailedTransactions', 'threshold': 10, 'unit': 'Count'}
        ]

        for metric in app_metrics:
            alarm_name = f"Application-{metric['metric']}-Alert"
            comparison = metric.get('comparison', 'GreaterThanThreshold')

            self.cloudwatch.put_metric_alarm(
                AlarmName=alarm_name,
                ComparisonOperator=comparison,
                EvaluationPeriods=3,
                MetricName=metric['metric'],
                Namespace='MyApp',
                Period=60,
                Statistic='Average',
                Threshold=metric['threshold'],
                ActionsEnabled=True,
                AlarmActions=[
                    'arn:aws:sns:us-east-1:123456789012:application-alerts'
                ],
                AlarmDescription=f'Application monitoring for {metric["metric"]}'
            )

        return app_metrics

    def setup_business_monitoring(self):
        """Monitor business KPIs."""
        business_metrics = [
            {'metric': 'DailyActiveUsers', 'threshold': 1000, 'comparison': 'LessThanThreshold'},
            {'metric': 'ConversionRate', 'threshold': 2.5, 'unit': 'Percent', 'comparison': 'LessThanThreshold'},
            {'metric': 'RevenuePerHour', 'threshold': 500, 'comparison': 'LessThanThreshold'},
            {'metric': 'CustomerSatisfactionScore', 'threshold': 4.0, 'comparison': 'LessThanThreshold'},
            {'metric': 'ChurnRate', 'threshold': 5, 'unit': 'Percent'}
        ]

        for metric in business_metrics:
            alarm_name = f"Business-{metric['metric']}-Alert"
            comparison = metric.get('comparison', 'GreaterThanThreshold')

            self.cloudwatch.put_metric_alarm(
                AlarmName=alarm_name,
                ComparisonOperator=comparison,
                EvaluationPeriods=1,
                MetricName=metric['metric'],
                Namespace='Business/KPIs',
                Period=3600,  # Hourly evaluation
                Statistic='Average',
                Threshold=metric['threshold'],
                ActionsEnabled=True,
                AlarmActions=[
                    'arn:aws:sns:us-east-1:123456789012:business-alerts'
                ],
                AlarmDescription=f'Business KPI monitoring for {metric["metric"]}'
            )

        return business_metrics

# Initialize monitoring strategy
monitoring = CloudWatchMonitoringStrategy()
monitoring.implement_layered_monitoring()
```
Efficient Log Management
```python
class LogManagementBestPractices:
    def __init__(self):
        self.logs_client = boto3.client('logs')

    def setup_log_retention_policies(self, log_groups_config):
        """Set appropriate retention policies for different log types."""
        # Values must be one of the retention periods CloudWatch Logs accepts
        retention_policies = {
            'application_logs': 30,    # 30 days for application logs
            'access_logs': 90,         # 90 days for access logs
            'audit_logs': 2557,        # ~7 years for audit logs
            'debug_logs': 7,           # 7 days for debug logs
            'error_logs': 180,         # 6 months for error logs
            'security_logs': 1096      # ~3 years for security logs
        }

        for log_group, log_type in log_groups_config.items():
            if log_type in retention_policies:
                try:
                    self.logs_client.put_retention_policy(
                        logGroupName=log_group,
                        retentionInDays=retention_policies[log_type]
                    )
                    print(f"Retention policy set for {log_group}: "
                          f"{retention_policies[log_type]} days")
                except Exception as e:
                    print(f"Error setting retention policy for {log_group}: {e}")

    def setup_log_filters(self, log_group_name):
        """Create metric filters for important log patterns."""
        filters = [
            {
                'filter_name': 'ErrorFilter',
                'filter_pattern': '[timestamp, request_id, level="ERROR", ...]',
                'metric_name': 'ErrorCount',
                'metric_namespace': 'LogMetrics',
                'metric_value': '1'
            },
            {
                'filter_name': 'WarningFilter',
                'filter_pattern': '[timestamp, request_id, level="WARN", ...]',
                'metric_name': 'WarningCount',
                'metric_namespace': 'LogMetrics',
                'metric_value': '1'
            },
            {
                'filter_name': 'ResponseTimeFilter',
                'filter_pattern': '[timestamp, request_id, level, method, url, response_time]',
                'metric_name': 'ResponseTime',
                'metric_namespace': 'LogMetrics',
                'metric_value': '$response_time'
            }
        ]

        for filter_config in filters:
            try:
                self.logs_client.put_metric_filter(
                    logGroupName=log_group_name,
                    filterName=filter_config['filter_name'],
                    filterPattern=filter_config['filter_pattern'],
                    metricTransformations=[
                        {
                            'metricName': filter_config['metric_name'],
                            'metricNamespace': filter_config['metric_namespace'],
                            'metricValue': filter_config['metric_value']
                        }
                    ]
                )
                print(f"Metric filter {filter_config['filter_name']} created for {log_group_name}")
            except Exception as e:
                print(f"Error creating metric filter {filter_config['filter_name']}: {e}")

# Example usage
log_manager = LogManagementBestPractices()

# Set retention policies
log_groups_config = {
    '/aws/myapp/application': 'application_logs',
    '/aws/myapp/access': 'access_logs',
    '/aws/myapp/audit': 'audit_logs',
    '/aws/myapp/debug': 'debug_logs',
    '/aws/myapp/errors': 'error_logs',
    '/aws/myapp/security': 'security_logs'
}

log_manager.setup_log_retention_policies(log_groups_config)
log_manager.setup_log_filters('/aws/myapp/application')
```
Cost Optimization {#cost-optimization}
CloudWatch Cost Management
```python
class CloudWatchCostOptimization:
    def __init__(self):
        self.cloudwatch = boto3.client('cloudwatch')
        self.logs_client = boto3.client('logs')
        self.ce_client = boto3.client('ce')  # Cost Explorer

    def analyze_cloudwatch_costs(self, start_date, end_date):
        """Analyze CloudWatch costs and identify optimization opportunities."""
        try:
            response = self.ce_client.get_cost_and_usage(
                TimePeriod={
                    'Start': start_date.strftime('%Y-%m-%d'),
                    'End': end_date.strftime('%Y-%m-%d')
                },
                Granularity='MONTHLY',
                Metrics=['BlendedCost'],
                GroupBy=[
                    {
                        'Type': 'DIMENSION',
                        'Key': 'SERVICE'
                    }
                ],
                Filter={
                    'Dimensions': {
                        'Key': 'SERVICE',
                        'Values': ['Amazon CloudWatch', 'Amazon CloudWatch Logs']
                    }
                }
            )

            cost_analysis = {}
            for result in response['ResultsByTime']:
                for group in result['Groups']:
                    service = group['Keys'][0]
                    cost = float(group['Metrics']['BlendedCost']['Amount'])
                    cost_analysis[service] = cost_analysis.get(service, 0) + cost

            print("CloudWatch Cost Analysis:")
            for service, cost in cost_analysis.items():
                print(f"{service}: ${cost:.2f}")

            return cost_analysis

        except Exception as e:
            print(f"Error analyzing costs: {e}")
            return {}

    def optimize_log_groups(self):
        """Identify and optimize expensive log groups."""
        try:
            paginator = self.logs_client.get_paginator('describe_log_groups')

            optimization_recommendations = []

            for page in paginator.paginate():
                for log_group in page['logGroups']:
                    log_group_name = log_group['logGroupName']

                    # Check storage size
                    storage_bytes = log_group.get('storedBytes', 0)
                    storage_gb = storage_bytes / (1024**3)

                    # Check retention policy
                    retention_days = log_group.get('retentionInDays', 'Never expire')

                    recommendations = []

                    if storage_gb > 10:  # More than 10 GB
                        recommendations.append("Large storage size - consider retention policy")

                    if retention_days == 'Never expire':
                        recommendations.append("No retention policy - data stored indefinitely")

                    if isinstance(retention_days, int) and retention_days > 365:
                        recommendations.append("Long retention period - review necessity")

                    if recommendations:
                        optimization_recommendations.append({
                            'log_group': log_group_name,
                            'storage_gb': storage_gb,
                            'retention_days': retention_days,
                            'recommendations': recommendations
                        })

            # Sort by storage size (largest first)
            optimization_recommendations.sort(key=lambda x: x['storage_gb'], reverse=True)

            print("Log Group Optimization Recommendations:")
            for rec in optimization_recommendations[:10]:  # Top 10
                print(f"\nLog Group: {rec['log_group']}")
                print(f"Storage: {rec['storage_gb']:.2f} GB")
                print(f"Retention: {rec['retention_days']}")
                print("Recommendations:")
                for r in rec['recommendations']:
                    print(f"  - {r}")

            return optimization_recommendations

        except Exception as e:
            print(f"Error optimizing log groups: {e}")
            return []

    def optimize_metric_usage(self):
        """Analyze and optimize custom metric usage."""
        try:
            # Get list of custom metrics
            paginator = self.cloudwatch.get_paginator('list_metrics')

            metric_usage = {}
            total_custom_metrics = 0

            for page in paginator.paginate():
                for metric in page['Metrics']:
                    namespace = metric['Namespace']

                    # Focus on custom metrics (non-AWS namespaces)
                    if not namespace.startswith('AWS/'):
                        total_custom_metrics += 1
                        metric_usage[namespace] = metric_usage.get(namespace, 0) + 1

            print(f"Total Custom Metrics: {total_custom_metrics}")
            print("\nCustom Metrics by Namespace:")

            sorted_namespaces = sorted(metric_usage.items(), key=lambda x: x[1], reverse=True)
            for namespace, count in sorted_namespaces:
                estimated_cost = count * 0.30  # $0.30 per metric per month
                print(f"{namespace}: {count} metrics (Est. ${estimated_cost:.2f}/month)")

            # Recommendations
            recommendations = []
            if total_custom_metrics > 100:
                recommendations.append("High number of custom metrics - review necessity")

            for namespace, count in sorted_namespaces:
                if count > 50:
                    recommendations.append(
                        f"Namespace '{namespace}' has many metrics ({count}) - consider consolidation"
                    )

            if recommendations:
                print("\nOptimization Recommendations:")
                for rec in recommendations:
                    print(f"  - {rec}")

            return {
                'total_metrics': total_custom_metrics,
                'by_namespace': dict(sorted_namespaces),
                'recommendations': recommendations
            }

        except Exception as e:
            print(f"Error analyzing metric usage: {e}")
            return {}

# Cost optimization analysis
cost_optimizer = CloudWatchCostOptimization()

# Analyze costs for the last 3 months
end_date = datetime.utcnow()
start_date = end_date - timedelta(days=90)

cost_optimizer.analyze_cloudwatch_costs(start_date, end_date)
cost_optimizer.optimize_log_groups()
cost_optimizer.optimize_metric_usage()
```
Security Considerations {#security}
IAM Best Practices for CloudWatch
```yaml
# CloudWatch IAM Policy Template
Version: '2012-10-17'
Statement:
  # Read-only access to metrics and dashboards
  - Effect: Allow
    Action:
      - cloudwatch:GetMetricStatistics
      - cloudwatch:ListMetrics
      - cloudwatch:GetDashboard
      - cloudwatch:ListDashboards
      - cloudwatch:DescribeAlarms
      - cloudwatch:DescribeAlarmHistory
    Resource: '*'

  # Limited write access for custom metrics
  - Effect: Allow
    Action:
      - cloudwatch:PutMetricData
    Resource: '*'
    Condition:
      StringLike:
        'cloudwatch:namespace':
          - 'MyApp/*'
          - 'Custom/*'

  # Log access restrictions
  - Effect: Allow
    Action:
      - logs:CreateLogGroup
      - logs:CreateLogStream
      - logs:PutLogEvents
      - logs:DescribeLogGroups
      - logs:DescribeLogStreams
    Resource:
      - 'arn:aws:logs:*:*:log-group:/aws/myapp/*'
      - 'arn:aws:logs:*:*:log-group:/custom/*'

  # Alarm management, limited to alarms with a known name prefix
  - Effect: Allow
    Action:
      - cloudwatch:PutMetricAlarm
      - cloudwatch:DeleteAlarms
    Resource: 'arn:aws:cloudwatch:*:*:alarm:MyApp-*'
```
Secure Logging Practices
```python
import hashlib
import hmac
import json
from datetime import datetime

import boto3

class SecureCloudWatchLogging:
    def __init__(self, secret_key):
        self.secret_key = secret_key
        self.logs_client = boto3.client('logs')

    def sanitize_log_data(self, log_data):
        """Remove sensitive information from log data."""
        sensitive_fields = [
            'password', 'api_key', 'token', 'secret',
            'ssn', 'credit_card', 'email', 'phone'
        ]

        if isinstance(log_data, dict):
            sanitized = {}
            for key, value in log_data.items():
                if any(sensitive in key.lower() for sensitive in sensitive_fields):
                    sanitized[key] = '[REDACTED]'
                elif isinstance(value, dict):
                    sanitized[key] = self.sanitize_log_data(value)
                elif isinstance(value, list):
                    sanitized[key] = [
                        self.sanitize_log_data(item) if isinstance(item, dict) else item
                        for item in value
                    ]
                else:
                    sanitized[key] = value
            return sanitized

        return log_data

    def add_integrity_check(self, log_data):
        """Add an HMAC integrity hash to log data."""
        log_json = json.dumps(log_data, sort_keys=True)
        signature = hmac.new(
            self.secret_key.encode('utf-8'),
            log_json.encode('utf-8'),
            hashlib.sha256
        ).hexdigest()

        log_data['_integrity_hash'] = signature
        return log_data

    def secure_log(self, log_group_name, log_stream_name, log_data):
        """Send a secure log with sanitization and integrity check."""
        try:
            # Sanitize data
            sanitized_data = self.sanitize_log_data(log_data.copy())

            # Add metadata
            sanitized_data['_timestamp'] = datetime.utcnow().isoformat()
            sanitized_data['_log_level'] = log_data.get('level', 'INFO')

            # Add integrity check
            secure_data = self.add_integrity_check(sanitized_data)

            # Send to CloudWatch
            response = self.logs_client.put_log_events(
                logGroupName=log_group_name,
                logStreamName=log_stream_name,
                logEvents=[
                    {
                        'timestamp': int(datetime.utcnow().timestamp() * 1000),
                        'message': json.dumps(secure_data)
                    }
                ]
            )

            return response

        except Exception as e:
            print(f"Error in secure logging: {e}")

# Usage example
secure_logger = SecureCloudWatchLogging('your-secret-key-here')

# Example log with sensitive data
log_entry = {
    'user_id': 'user123',
    'action': 'login',
    'password': 'secret123',   # Will be redacted
    'api_key': 'abc123',       # Will be redacted
    'ip_address': '192.168.1.100',
    'timestamp': datetime.utcnow().isoformat()
}

secure_logger.secure_log('/aws/myapp/secure', 'auth-service', log_entry)
```
Troubleshooting {#troubleshooting}
Common Issues and Solutions
```python
class CloudWatchTroubleshooter:
    def __init__(self):
        self.cloudwatch = boto3.client('cloudwatch')
        self.logs_client = boto3.client('logs')

    def diagnose_metric_issues(self, namespace, metric_name, start_time, end_time):
        """Diagnose issues with metric collection."""
        issues = []

        try:
            # Check if metrics exist
            metrics = self.cloudwatch.list_metrics(
                Namespace=namespace,
                MetricName=metric_name
            )

            if not metrics['Metrics']:
                issues.append("No metrics found - check metric name and namespace")
                return issues

            # Check for data points
            response = self.cloudwatch.get_metric_statistics(
                Namespace=namespace,
                MetricName=metric_name,
                StartTime=start_time,
                EndTime=end_time,
                Period=300,
                Statistics=['Sum']
            )

            if not response['Datapoints']:
                issues.append("No data points found - check time range and metric publishing")

            # Check for gaps in data
            datapoints = sorted(response['Datapoints'], key=lambda x: x['Timestamp'])
            for i in range(1, len(datapoints)):
                time_diff = (datapoints[i]['Timestamp'] - datapoints[i - 1]['Timestamp']).total_seconds()
                if time_diff > 600:  # More than a 10-minute gap
                    issues.append(
                        f"Data gap detected between {datapoints[i - 1]['Timestamp']} "
                        f"and {datapoints[i]['Timestamp']}"
                    )

            # Check metric dimensions
            unique_dimensions = set()
            for metric in metrics['Metrics']:
                dimension_set = frozenset((d['Name'], d['Value']) for d in metric.get('Dimensions', []))
                unique_dimensions.add(dimension_set)

            if len(unique_dimensions) > 10:
                issues.append(
                    f"High cardinality detected: {len(unique_dimensions)} unique dimension combinations"
                )

        except Exception as e:
            issues.append(f"Error diagnosing metrics: {e}")

        return issues

    def diagnose_alarm_issues(self, alarm_name):
        """Diagnose alarm configuration issues."""
        issues = []

        try:
            response = self.cloudwatch.describe_alarms(
                AlarmNames=[alarm_name]
            )

            if not response['MetricAlarms']:
                issues.append("Alarm not found")
                return issues

            alarm = response['MetricAlarms'][0]

            # Check alarm state
            if alarm['StateValue'] == 'INSUFFICIENT_DATA':
                issues.append("Alarm has insufficient data - check metric availability")

            # Check evaluation periods and period
            if alarm['EvaluationPeriods'] * alarm['Period'] < 600:
                issues.append("Evaluation period too short - may cause false alarms")

            # Check if actions are enabled
            if not alarm['ActionsEnabled']:
                issues.append("Alarm actions are disabled")

            # Check if there are actions configured
            if not alarm.get('AlarmActions') and not alarm.get('OKActions'):
                issues.append("No actions configured for alarm")

            # Get alarm history
            history = self.cloudwatch.describe_alarm_history(
                AlarmName=alarm_name,
                MaxRecords=10
            )

            # Check for frequent state changes
            state_changes = [
                h for h in history['AlarmHistoryItems']
                if h['HistoryItemType'] == 'StateUpdate'
            ]
            if len(state_changes) > 5:
                issues.append("Alarm changing states frequently - review threshold and evaluation criteria")

        except Exception as e:
            issues.append(f"Error diagnosing alarm: {e}")

        return issues

    def diagnose_log_issues(self, log_group_name):
        """Diagnose log ingestion issues."""
        issues = []

        try:
            # Check if log group exists
            response = self.logs_client.describe_log_groups(
                logGroupNamePrefix=log_group_name
            )

            matching_groups = [
                lg for lg in response['logGroups']
                if lg['logGroupName'] == log_group_name
            ]
            if not matching_groups:
                issues.append("Log group does not exist")
                return issues

            log_group = matching_groups[0]

            # Check log streams
            streams_response = self.logs_client.describe_log_streams(
                logGroupName=log_group_name,
                orderBy='LastEventTime',
                descending=True,
                limit=10
            )

            if not streams_response['logStreams']:
                issues.append("No log streams found")
            else:
                # Check for recent activity (timestamps are epoch milliseconds, UTC)
                latest_stream = streams_response['logStreams'][0]
                if 'lastEventTime' in latest_stream:
                    last_event_time = datetime.utcfromtimestamp(latest_stream['lastEventTime'] / 1000)
                    time_since_last = datetime.utcnow() - last_event_time

                    if time_since_last.total_seconds() > 3600:  # More than 1 hour
                        issues.append(f"No recent log events (last event: {last_event_time})")

                # Check for stuck streams
                stuck_streams = 0
                for stream in streams_response['logStreams']:
                    if 'lastEventTime' in stream and 'lastIngestionTime' in stream:
                        event_time = stream['lastEventTime']
                        ingestion_time = stream['lastIngestionTime']
                        if ingestion_time - event_time > 300000:  # More than 5 minutes delay
                            stuck_streams += 1

                if stuck_streams > 0:
                    issues.append(f"{stuck_streams} log streams have ingestion delays")

            # Check retention policy
            if 'retentionInDays' not in log_group:
                issues.append("No retention policy set - logs will be stored indefinitely")

        except Exception as e:
            issues.append(f"Error diagnosing logs: {e}")

        return issues

    def run_comprehensive_diagnosis(self, resources):
        """Run a comprehensive diagnosis on multiple resources."""
        diagnosis_report = {
            'timestamp': datetime.utcnow().isoformat(),
            'resources': {}
        }

        for resource in resources:
            resource_type = resource['type']
            resource_name = resource['name']

            if resource_type == 'metric':
                issues = self.diagnose_metric_issues(
                    resource['namespace'],
                    resource['metric_name'],
                    resource['start_time'],
                    resource['end_time']
                )
            elif resource_type == 'alarm':
                issues = self.diagnose_alarm_issues(resource_name)
            elif resource_type == 'log_group':
                issues = self.diagnose_log_issues(resource_name)
            else:
                issues = [f"Unknown resource type: {resource_type}"]

            diagnosis_report['resources'][resource_name] = {
                'type': resource_type,
                'issues': issues,
                'status': 'healthy' if not issues else 'issues_detected'
            }

        return diagnosis_report

# Example usage
troubleshooter = CloudWatchTroubleshooter()

# Define resources to diagnose
resources_to_check = [
    {
        'type': 'metric',
        'name': 'CPUUtilization',
        'namespace': 'AWS/EC2',
        'metric_name': 'CPUUtilization',
        'start_time': datetime.utcnow() - timedelta(hours=2),
        'end_time': datetime.utcnow()
    },
    {
        'type': 'alarm',
        'name': 'HighCPUUtilization'
    },
    {
        'type': 'log_group',
        'name': '/aws/myapp/production'
    }
]

# Run diagnosis
diagnosis = troubleshooter.run_comprehensive_diagnosis(resources_to_check)

print("CloudWatch Diagnosis Report:")
print(json.dumps(diagnosis, indent=2, default=str))
```
Conclusion
Amazon CloudWatch provides comprehensive monitoring and observability capabilities for AWS infrastructure and applications. Key takeaways:
Essential Features:
- Metrics: Collect and monitor quantitative data from AWS services and custom applications
- Logs: Centralized log management with powerful query capabilities
- Alarms: Automated notifications and actions based on thresholds
- Dashboards: Visual monitoring and real-time insights
- Events: React to system changes and custom application events
Best Practices:
- Implement layered monitoring (infrastructure, application, business)
- Use appropriate log retention policies
- Set up meaningful alarms with proper thresholds
- Leverage anomaly detection for dynamic thresholds
- Optimize costs through metric and log management
Advanced Capabilities:
- Cross-account monitoring
- Custom widgets with Lambda
- Composite alarms for complex scenarios
- Log Insights for sophisticated log analysis
- Integration with EventBridge for event-driven architectures
CloudWatch forms the foundation of observability in AWS, enabling proactive monitoring, rapid troubleshooting, and data-driven decision making. Proper implementation ensures system reliability, performance optimization, and cost control while maintaining security and compliance standards.