The Complete Guide to Amazon Kinesis: Real-time Data Streaming and Analytics
Amazon Kinesis is AWS’s platform for streaming data, offering services for real-time data ingestion, processing, and analytics. This guide covers each Kinesis service along with advanced streaming architectures.
Table of Contents
- Introduction to Kinesis
- Kinesis Services Overview
- Kinesis Data Streams
- Kinesis Data Firehose
- Kinesis Data Analytics
- Real-time Processing Architectures
- Best Practices
- Cost Optimization
- Conclusion
Introduction to Kinesis {#introduction}
Amazon Kinesis is a platform for streaming data on AWS that makes it easy to collect, process, and analyze real-time streaming data. It enables you to get timely insights and react quickly to new information.
Key Benefits:
- Real-time Processing: Process data as it arrives
- Fully Managed: No infrastructure to manage
- Scalable: Handle any amount of streaming data
- Cost-effective: Pay only for what you use
- Integrated: Works with other AWS services
Use Cases:
- Real-time analytics and dashboards
- Log and event data collection
- IoT data ingestion
- Clickstream analysis
- Machine learning inference
- Video streaming and processing
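To make these use cases concrete, the following minimal sketch pushes a single JSON event into a stream with boto3. The stream name `clickstream-events` is a placeholder and the stream is assumed to already exist.

import json
from datetime import datetime, timezone

import boto3

kinesis = boto3.client('kinesis')

# A hypothetical clickstream event; any JSON-serializable payload works.
event = {
    'user_id': 'user123',
    'event_type': 'click',
    'timestamp': datetime.now(timezone.utc).isoformat(),
}

# The partition key decides which shard receives the record.
response = kinesis.put_record(
    StreamName='clickstream-events',  # placeholder stream name
    Data=json.dumps(event).encode('utf-8'),
    PartitionKey=event['user_id'],
)
print(response['ShardId'], response['SequenceNumber'])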
Kinesis Services Overview {#services-overview}
import json
from datetime import datetime, timedelta  # timedelta is used later for CloudWatch metric windows

import boto3
def kinesis_services_overview(): """ Overview of Kinesis services and their use cases """ services = { "kinesis_data_streams": { "description": "Real-time data streaming service", "use_cases": [ "Real-time data processing", "Log aggregation", "IoT data ingestion", "Clickstream processing" ], "key_features": [ "Multiple producers and consumers", "Configurable retention period (1-365 days)", "Automatic scaling with on-demand mode", "Integration with Lambda, KDF, KDA" ] }, "kinesis_data_firehose": { "description": "Data delivery service to AWS data stores", "use_cases": [ "Data lake ingestion", "Data warehouse loading", "Log delivery to S3", "Real-time ETL" ], "key_features": [ "Serverless and fully managed", "Built-in data transformation", "Compression and format conversion", "Direct delivery to S3, Redshift, OpenSearch" ] }, "kinesis_data_analytics": { "description": "Real-time analytics with SQL or Apache Flink", "use_cases": [ "Real-time dashboards", "Anomaly detection", "Real-time recommendations", "Complex event processing" ], "key_features": [ "Standard SQL for stream processing", "Apache Flink for advanced analytics", "Built-in windowing functions", "Machine learning integration" ] }, "kinesis_video_streams": { "description": "Video streaming service for analytics and ML", "use_cases": [ "Security camera streams", "Smart home devices", "Industrial IoT video", "Live video processing" ], "key_features": [ "Secure video ingestion", "WebRTC support", "Integration with ML services", "HLS streaming support" ] } }
return services
print("Amazon Kinesis Services Overview:")print(json.dumps(kinesis_services_overview(), indent=2))
Kinesis Data Streams {#data-streams}
Creating and Managing Data Streams
class KinesisDataStreamManager:
    def __init__(self):
        self.kinesis = boto3.client('kinesis')
        self.cloudwatch = boto3.client('cloudwatch')
def create_stream(self, stream_name, shard_count=1, mode='PROVISIONED'): """ Create a Kinesis Data Stream """ try: if mode == 'PROVISIONED': response = self.kinesis.create_stream( StreamName=stream_name, ShardCount=shard_count ) else: # ON_DEMAND mode response = self.kinesis.create_stream( StreamName=stream_name, StreamModeDetails={ 'StreamMode': 'ON_DEMAND' } )
print(f"Stream '{stream_name}' creation initiated")
# Wait for stream to become active self.wait_for_stream_active(stream_name)
return response
except Exception as e: print(f"Error creating stream: {e}") return None
def wait_for_stream_active(self, stream_name, timeout=300): """ Wait for stream to become active """ import time
print(f"Waiting for stream '{stream_name}' to become active...") start_time = time.time()
while time.time() - start_time < timeout: try: response = self.kinesis.describe_stream(StreamName=stream_name) status = response['StreamDescription']['StreamStatus']
if status == 'ACTIVE': print(f"Stream '{stream_name}' is now active!") return True elif status in ['DELETING', 'FAILED']: print(f"Stream '{stream_name}' is in {status} state") return False else: print(f"Stream status: {status}") time.sleep(10)
except Exception as e: print(f"Error checking stream status: {e}") time.sleep(10)
print(f"Timeout waiting for stream '{stream_name}' to become active") return False
def put_record(self, stream_name, data, partition_key): """ Put a single record to the stream """ try: response = self.kinesis.put_record( StreamName=stream_name, Data=json.dumps(data) if isinstance(data, dict) else data, PartitionKey=partition_key )
return { 'shard_id': response['ShardId'], 'sequence_number': response['SequenceNumber'] }
except Exception as e: print(f"Error putting record: {e}") return None
def put_records_batch(self, stream_name, records): """ Put multiple records in a batch """ try: kinesis_records = [] for record in records: kinesis_records.append({ 'Data': json.dumps(record['data']) if isinstance(record['data'], dict) else record['data'], 'PartitionKey': record['partition_key'] })
response = self.kinesis.put_records( Records=kinesis_records, StreamName=stream_name )
# Check for failed records failed_records = [] for i, record_result in enumerate(response['Records']): if 'ErrorCode' in record_result: failed_records.append({ 'index': i, 'error_code': record_result['ErrorCode'], 'error_message': record_result['ErrorMessage'] })
result = { 'failed_record_count': response['FailedRecordCount'], 'successful_records': len(kinesis_records) - response['FailedRecordCount'], 'failed_records': failed_records }
return result
except Exception as e: print(f"Error putting records batch: {e}") return None
def get_records(self, stream_name, shard_iterator_type='TRIM_HORIZON', sequence_number=None): """ Get records from stream """ try: # Get stream description to find shards stream_desc = self.kinesis.describe_stream(StreamName=stream_name) shards = stream_desc['StreamDescription']['Shards']
all_records = []
for shard in shards: shard_id = shard['ShardId']
# Get shard iterator iterator_request = { 'StreamName': stream_name, 'ShardId': shard_id, 'ShardIteratorType': shard_iterator_type }
if sequence_number: iterator_request['StartingSequenceNumber'] = sequence_number
iterator_response = self.kinesis.get_shard_iterator(**iterator_request) shard_iterator = iterator_response['ShardIterator']
# Get records if shard_iterator: records_response = self.kinesis.get_records(ShardIterator=shard_iterator)
for record in records_response['Records']: all_records.append({ 'sequence_number': record['SequenceNumber'], 'partition_key': record['PartitionKey'], 'data': record['Data'].decode('utf-8'), 'approximate_arrival_timestamp': record['ApproximateArrivalTimestamp'], 'shard_id': shard_id })
return all_records
except Exception as e: print(f"Error getting records: {e}") return []
def update_shard_count(self, stream_name, target_shard_count): """ Update shard count for provisioned stream """ try: response = self.kinesis.update_shard_count( StreamName=stream_name, TargetShardCount=target_shard_count, ScalingType='UNIFORM_SCALING' )
print(f"Stream '{stream_name}' shard count update initiated") return response
except Exception as e: print(f"Error updating shard count: {e}") return None
def get_stream_metrics(self, stream_name): """ Get stream metrics from CloudWatch """ try: end_time = datetime.utcnow() start_time = end_time - timedelta(hours=1)
metrics = {}
# Get incoming records metric incoming_response = self.cloudwatch.get_metric_statistics( Namespace='AWS/Kinesis', MetricName='IncomingRecords', Dimensions=[ { 'Name': 'StreamName', 'Value': stream_name } ], StartTime=start_time, EndTime=end_time, Period=300, Statistics=['Sum'] )
metrics['incoming_records'] = incoming_response['Datapoints']
# Get incoming bytes metric bytes_response = self.cloudwatch.get_metric_statistics( Namespace='AWS/Kinesis', MetricName='IncomingBytes', Dimensions=[ { 'Name': 'StreamName', 'Value': stream_name } ], StartTime=start_time, EndTime=end_time, Period=300, Statistics=['Sum'] )
metrics['incoming_bytes'] = bytes_response['Datapoints']
return metrics
except Exception as e: print(f"Error getting stream metrics: {e}") return {}
# Usage examples
stream_manager = KinesisDataStreamManager()

# Create a stream
stream_manager.create_stream('my-data-stream', shard_count=2)

# Put single record
record_result = stream_manager.put_record(
    'my-data-stream',
    {
        'user_id': 'user123',
        'event_type': 'click',
        'timestamp': datetime.utcnow().isoformat(),
        'page': 'homepage'
    },
    'user123'
)

print(f"Record put result: {record_result}")

# Put batch records
batch_records = [
    {
        'data': {
            'user_id': 'user124',
            'event_type': 'view',
            'timestamp': datetime.utcnow().isoformat(),
            'page': 'product'
        },
        'partition_key': 'user124'
    },
    {
        'data': {
            'user_id': 'user125',
            'event_type': 'purchase',
            'timestamp': datetime.utcnow().isoformat(),
            'amount': 99.99
        },
        'partition_key': 'user125'
    }
]

batch_result = stream_manager.put_records_batch('my-data-stream', batch_records)
print(f"Batch put result: {batch_result}")

# Get records
records = stream_manager.get_records('my-data-stream')
print(f"Retrieved {len(records)} records")

# Get stream metrics
metrics = stream_manager.get_stream_metrics('my-data-stream')
print(f"Stream metrics: {metrics}")
Kinesis Producer Library (KPL) Integration
import threading
import time
import random
from datetime import datetime
class KinesisProducer:
    def __init__(self, stream_name, region='us-east-1'):
        self.stream_name = stream_name
        self.kinesis = boto3.client('kinesis', region_name=region)
        self.buffer = []
        self.buffer_lock = threading.Lock()
        self.flush_interval = 5  # seconds
        self.max_buffer_size = 100
        self.running = False
def start(self): """ Start the producer with background flushing """ self.running = True self.flush_thread = threading.Thread(target=self._flush_periodically) self.flush_thread.daemon = True self.flush_thread.start()
def stop(self): """ Stop the producer and flush remaining records """ self.running = False if hasattr(self, 'flush_thread'): self.flush_thread.join() self._flush_buffer()
    def put_record_async(self, data, partition_key):
        """
        Add record to buffer for asynchronous processing
        """
        record = {
            'data': data,
            'partition_key': partition_key,
            'timestamp': datetime.utcnow()
        }

        with self.buffer_lock:
            self.buffer.append(record)
            should_flush = len(self.buffer) >= self.max_buffer_size

        # Flush outside the lock: _flush_buffer() re-acquires buffer_lock,
        # so flushing while still holding it would deadlock.
        if should_flush:
            self._flush_buffer()
def _flush_periodically(self): """ Flush buffer periodically """ while self.running: time.sleep(self.flush_interval) if self.buffer: self._flush_buffer()
def _flush_buffer(self): """ Flush buffer to Kinesis """ with self.buffer_lock: if not self.buffer: return
records_to_flush = self.buffer.copy() self.buffer.clear()
# Convert to Kinesis records format kinesis_records = [] for record in records_to_flush: kinesis_records.append({ 'Data': json.dumps(record['data']) if isinstance(record['data'], dict) else record['data'], 'PartitionKey': record['partition_key'] })
try: response = self.kinesis.put_records( Records=kinesis_records, StreamName=self.stream_name )
print(f"Flushed {len(kinesis_records)} records, " f"Failed: {response['FailedRecordCount']}")
# Handle failed records if response['FailedRecordCount'] > 0: self._handle_failed_records(kinesis_records, response['Records'])
except Exception as e: print(f"Error flushing buffer: {e}") # Re-add records to buffer for retry with self.buffer_lock: self.buffer.extend(records_to_flush)
def _handle_failed_records(self, original_records, response_records): """ Handle failed records with retry logic """ failed_records = []
for i, response_record in enumerate(response_records): if 'ErrorCode' in response_record: failed_records.append(original_records[i]) print(f"Failed record {i}: {response_record['ErrorCode']}")
# Implement retry logic here if failed_records: print(f"Retrying {len(failed_records)} failed records...") # Simple retry - add back to buffer with self.buffer_lock: self.buffer.extend([ { 'data': json.loads(record['Data']), 'partition_key': record['PartitionKey'], 'timestamp': datetime.utcnow() } for record in failed_records ])
# Enhanced producer with aggregation
class AggregatingKinesisProducer(KinesisProducer):
    def __init__(self, stream_name, region='us-east-1'):
        super().__init__(stream_name, region)
        self.aggregation_enabled = True
        self.max_aggregated_size = 1024 * 1024  # 1MB
def put_user_record(self, data, explicit_hash_key=None, partition_key=None): """ Put user record with KPL-style aggregation """ if not partition_key: partition_key = str(random.randint(0, 999999))
record = { 'data': data, 'partition_key': partition_key, 'explicit_hash_key': explicit_hash_key, 'timestamp': datetime.utcnow() }
if self.aggregation_enabled: self._add_to_aggregated_record(record) else: self.put_record_async(data, partition_key)
def _add_to_aggregated_record(self, record): """ Add record to aggregated record (simplified KPL aggregation) """ # In real KPL implementation, this would use protobuf aggregation # This is a simplified version
aggregated_data = { 'records': [record], 'aggregated': True, 'created_at': datetime.utcnow().isoformat() }
self.put_record_async(aggregated_data, record['partition_key'])
# Usage example
producer = AggregatingKinesisProducer('my-data-stream')
producer.start()

# Simulate real-time data ingestion
for i in range(1000):
    event_data = {
        'event_id': f'event_{i}',
        'user_id': f'user_{random.randint(1, 100)}',
        'event_type': random.choice(['click', 'view', 'purchase', 'scroll']),
        'timestamp': datetime.utcnow().isoformat(),
        'metadata': {
            'session_id': f'session_{random.randint(1, 20)}',
            'page': random.choice(['home', 'product', 'checkout', 'profile']),
            'device': random.choice(['mobile', 'desktop', 'tablet'])
        }
    }

    producer.put_user_record(event_data, partition_key=event_data['user_id'])

    # Simulate realistic timing
    time.sleep(0.01)

# Stop producer
producer.stop()
print("Producer stopped")
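Because put_record_async only buffers data in memory, anything still sitting in the buffer is lost if the process dies before the next flush. One defensive pattern (a sketch, not part of the producer above) is to register the shutdown hook with atexit so stop() runs even when the script exits early:

import atexit

producer = KinesisProducer('my-data-stream')  # stream name is a placeholder
producer.start()

# stop() joins the flush thread and drains the remaining buffer,
# so registering it with atexit guards against dropped records on exit.
atexit.register(producer.stop)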
Kinesis Data Firehose {#data-firehose}
Setting up Data Delivery Streams
class KinesisFirehoseManager:
    def __init__(self):
        self.firehose = boto3.client('firehose')
        self.s3 = boto3.client('s3')
def create_s3_delivery_stream(self, delivery_stream_name, s3_bucket, s3_prefix='', buffer_interval=60, buffer_size=5): """ Create Firehose delivery stream to S3 """ try: response = self.firehose.create_delivery_stream( DeliveryStreamName=delivery_stream_name, DeliveryStreamType='DirectPut', S3DestinationConfiguration={ 'RoleARN': 'arn:aws:iam::123456789012:role/firehose_delivery_role', 'BucketARN': f'arn:aws:s3:::{s3_bucket}', 'Prefix': s3_prefix, 'ErrorOutputPrefix': 'errors/', 'BufferingHints': { 'SizeInMBs': buffer_size, 'IntervalInSeconds': buffer_interval }, 'CompressionFormat': 'GZIP', 'EncryptionConfiguration': { 'NoEncryptionConfig': 'NoEncryption' }, 'CloudWatchLoggingOptions': { 'Enabled': True, 'LogGroupName': f'/aws/kinesisfirehose/{delivery_stream_name}' }, 'ProcessingConfiguration': { 'Enabled': False } } )
print(f"Delivery stream '{delivery_stream_name}' created successfully") return response
except Exception as e: print(f"Error creating delivery stream: {e}") return None
def create_redshift_delivery_stream(self, delivery_stream_name, s3_bucket, redshift_cluster_jdbc, redshift_table, redshift_username, redshift_password): """ Create Firehose delivery stream to Redshift """ try: response = self.firehose.create_delivery_stream( DeliveryStreamName=delivery_stream_name, DeliveryStreamType='DirectPut', RedshiftDestinationConfiguration={ 'RoleARN': 'arn:aws:iam::123456789012:role/firehose_delivery_role', 'ClusterJDBCURL': redshift_cluster_jdbc, 'CopyCommand': { 'DataTableName': redshift_table, 'DataTableColumns': 'user_id,event_type,timestamp,metadata', 'CopyOptions': "JSON 'auto' GZIP" }, 'Username': redshift_username, 'Password': redshift_password, 'RetryDuration': 3600, 'S3Configuration': { 'RoleARN': 'arn:aws:iam::123456789012:role/firehose_delivery_role', 'BucketARN': f'arn:aws:s3:::{s3_bucket}', 'Prefix': 'redshift-staging/', 'ErrorOutputPrefix': 'redshift-errors/', 'BufferingHints': { 'SizeInMBs': 128, 'IntervalInSeconds': 60 }, 'CompressionFormat': 'GZIP' }, 'ProcessingConfiguration': { 'Enabled': True, 'Processors': [ { 'Type': 'Lambda', 'Parameters': [ { 'ParameterName': 'LambdaArn', 'ParameterValue': 'arn:aws:lambda:us-east-1:123456789012:function:firehose-transform' } ] } ] } } )
print(f"Redshift delivery stream '{delivery_stream_name}' created successfully") return response
except Exception as e: print(f"Error creating Redshift delivery stream: {e}") return None
def create_opensearch_delivery_stream(self, delivery_stream_name, domain_arn, index_name, type_name='_doc'): """ Create Firehose delivery stream to OpenSearch """ try: response = self.firehose.create_delivery_stream( DeliveryStreamName=delivery_stream_name, DeliveryStreamType='DirectPut', AmazonOpenSearchServiceDestinationConfiguration={ 'RoleARN': 'arn:aws:iam::123456789012:role/firehose_delivery_role', 'DomainARN': domain_arn, 'IndexName': index_name, 'TypeName': type_name, 'IndexRotationPeriod': 'OneDay', 'BufferingHints': { 'SizeInMBs': 5, 'IntervalInSeconds': 60 }, 'RetryDuration': 300, 'S3BackupMode': 'AllDocuments', 'S3Configuration': { 'RoleARN': 'arn:aws:iam::123456789012:role/firehose_delivery_role', 'BucketARN': 'arn:aws:s3:::my-backup-bucket', 'Prefix': 'opensearch-backup/', 'ErrorOutputPrefix': 'opensearch-errors/', 'BufferingHints': { 'SizeInMBs': 10, 'IntervalInSeconds': 300 }, 'CompressionFormat': 'GZIP' } } )
print(f"OpenSearch delivery stream '{delivery_stream_name}' created successfully") return response
except Exception as e: print(f"Error creating OpenSearch delivery stream: {e}") return None
def put_record(self, delivery_stream_name, record_data): """ Put single record to Firehose delivery stream """ try: response = self.firehose.put_record( DeliveryStreamName=delivery_stream_name, Record={ 'Data': json.dumps(record_data) + '\n' if isinstance(record_data, dict) else record_data } )
return response['RecordId']
except Exception as e: print(f"Error putting record: {e}") return None
def put_record_batch(self, delivery_stream_name, records): """ Put batch of records to Firehose delivery stream """ try: firehose_records = [] for record in records: data = json.dumps(record) + '\n' if isinstance(record, dict) else record firehose_records.append({'Data': data})
response = self.firehose.put_record_batch( DeliveryStreamName=delivery_stream_name, Records=firehose_records )
result = { 'failed_put_count': response['FailedPutCount'], 'successful_records': len(firehose_records) - response['FailedPutCount'], 'request_responses': response['RequestResponses'] }
return result
except Exception as e: print(f"Error putting record batch: {e}") return None
    def create_data_transformation_lambda(self):
        """
        Example Lambda function for data transformation
        """
        lambda_function_code = '''import base64
import json
import gzip
from datetime import datetime
def lambda_handler(event, context): output = []
for record in event['records']: # Decode the data compressed_payload = base64.b64decode(record['data']) uncompressed_payload = gzip.decompress(compressed_payload) data = json.loads(uncompressed_payload.decode('utf-8'))
# Transform the data transformed_data = transform_record(data)
# Encode the transformed data output_record = { 'recordId': record['recordId'], 'result': 'Ok', 'data': base64.b64encode( (json.dumps(transformed_data) + '\\n').encode('utf-8') ).decode('utf-8') }
output.append(output_record)
return {'records': output}
def transform_record(data): """ Transform a single record """ # Add transformation timestamp data['processed_timestamp'] = datetime.utcnow().isoformat()
# Normalize event types if 'event_type' in data: data['event_type'] = data['event_type'].lower()
# Add derived fields if 'timestamp' in data: try: ts = datetime.fromisoformat(data['timestamp'].replace('Z', '+00:00')) data['hour_of_day'] = ts.hour data['day_of_week'] = ts.weekday() except: pass
# Filter out sensitive data sensitive_fields = ['password', 'ssn', 'credit_card'] for field in sensitive_fields: if field in data: data[field] = '[REDACTED]'
return data'''
return lambda_function_code
# Usage examples
firehose_manager = KinesisFirehoseManager()

# Create S3 delivery stream
s3_stream = firehose_manager.create_s3_delivery_stream(
    'my-s3-delivery-stream',
    'my-data-lake-bucket',
    s3_prefix='year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/',
    buffer_interval=60,
    buffer_size=5
)

# Create Redshift delivery stream
# redshift_stream = firehose_manager.create_redshift_delivery_stream(
#     'my-redshift-delivery-stream',
#     'my-staging-bucket',
#     'jdbc:redshift://mycluster.abc123.us-east-1.redshift.amazonaws.com:5439/mydb',
#     'events',
#     'username',
#     'password'
# )

# Put single record
record_id = firehose_manager.put_record(
    'my-s3-delivery-stream',
    {
        'user_id': 'user123',
        'event_type': 'page_view',
        'timestamp': datetime.utcnow().isoformat(),
        'page': 'homepage',
        'referrer': 'google.com'
    }
)

print(f"Record ID: {record_id}")

# Put batch records
batch_records = [
    {
        'user_id': 'user124',
        'event_type': 'click',
        'timestamp': datetime.utcnow().isoformat(),
        'element': 'buy_button'
    },
    {
        'user_id': 'user125',
        'event_type': 'purchase',
        'timestamp': datetime.utcnow().isoformat(),
        'amount': 99.99,
        'product_id': 'prod123'
    }
]

batch_result = firehose_manager.put_record_batch('my-s3-delivery-stream', batch_records)
print(f"Batch result: {batch_result}")

# Get transformation Lambda code
lambda_code = firehose_manager.create_data_transformation_lambda()
print("Data transformation Lambda function code:")
print(lambda_code)
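Unlike the Data Streams manager earlier, the Firehose helper does not wait for the delivery stream to become ACTIVE before records are sent, and records put while the stream is still CREATING will fail. A small polling helper can close that gap; this is a sketch using the DescribeDeliveryStream status field.

import time

def wait_for_delivery_stream_active(firehose_client, delivery_stream_name, timeout=300):
    """Poll DescribeDeliveryStream until the stream is ACTIVE (or give up)."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        desc = firehose_client.describe_delivery_stream(DeliveryStreamName=delivery_stream_name)
        status = desc['DeliveryStreamDescription']['DeliveryStreamStatus']
        if status == 'ACTIVE':
            return True
        time.sleep(10)  # CREATING typically takes a minute or two
    return False

# Example: wait before the first put_record call
wait_for_delivery_stream_active(firehose_manager.firehose, 'my-s3-delivery-stream')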
Kinesis Data Analytics {#data-analytics}
Real-time Analytics with SQL
class KinesisAnalyticsManager:
    def __init__(self):
        self.analytics = boto3.client('kinesisanalytics')
        self.analyticsv2 = boto3.client('kinesisanalyticsv2')
def create_sql_application(self, application_name, input_stream_arn, output_stream_arn): """ Create Kinesis Data Analytics SQL application (v1) """ sql_code = """-- Create input streamCREATE OR REPLACE STREAM "SOURCE_SQL_STREAM_001" ( user_id VARCHAR(32), event_type VARCHAR(16), timestamp TIMESTAMP, amount DOUBLE, metadata VARCHAR(1024));
-- Create output stream for aggregated dataCREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM_001" ( user_id VARCHAR(32), event_count INTEGER, total_amount DOUBLE, window_start TIMESTAMP, window_end TIMESTAMP);
-- Windowed aggregation queryCREATE OR REPLACE PUMP "STREAM_PUMP_001" AS INSERT INTO "DESTINATION_SQL_STREAM_001"SELECT STREAM user_id, COUNT(*) as event_count, SUM(amount) as total_amount, ROWTIME_TO_TIMESTAMP(MIN(ROWTIME)) as window_start, ROWTIME_TO_TIMESTAMP(MAX(ROWTIME)) as window_endFROM "SOURCE_SQL_STREAM_001"WHERE event_type = 'purchase'GROUP BY user_id, RANGE_INTERVAL '5' MINUTE;
-- Real-time anomaly detectionCREATE OR REPLACE STREAM "ANOMALY_STREAM_001" ( user_id VARCHAR(32), event_count INTEGER, avg_amount DOUBLE, anomaly_score DOUBLE, window_timestamp TIMESTAMP);
CREATE OR REPLACE PUMP "ANOMALY_PUMP_001" AS INSERT INTO "ANOMALY_STREAM_001"SELECT STREAM user_id, event_count, avg_amount, CASE WHEN event_count > 10 THEN event_count * 1.5 WHEN avg_amount > 1000 THEN avg_amount / 100 ELSE 0.0 END as anomaly_score, ROWTIME_TO_TIMESTAMP(ROWTIME) as window_timestampFROM ( SELECT STREAM user_id, COUNT(*) as event_count, AVG(amount) as avg_amount, ROWTIME FROM "SOURCE_SQL_STREAM_001" WHERE event_type = 'purchase' GROUP BY user_id, RANGE_INTERVAL '1' MINUTE)WHERE event_count > 5 OR avg_amount > 500;"""
try: response = self.analytics.create_application( ApplicationName=application_name, ApplicationDescription='Real-time analytics application', Inputs=[ { 'NamePrefix': 'SOURCE_SQL_STREAM', 'InputProcessingConfiguration': { 'InputLambdaProcessor': { 'ResourceARN': 'arn:aws:lambda:us-east-1:123456789012:function:analytics-processor', 'RoleARN': 'arn:aws:iam::123456789012:role/service-role/kinesis-analytics-role' } }, 'KinesisStreamsInput': { 'ResourceARN': input_stream_arn, 'RoleARN': 'arn:aws:iam::123456789012:role/service-role/kinesis-analytics-role' }, 'InputSchema': { 'RecordFormat': { 'RecordFormatType': 'JSON', 'MappingParameters': { 'JSONMappingParameters': { 'RecordRowPath': '$' } } }, 'RecordEncoding': 'UTF-8', 'RecordColumns': [ { 'Name': 'user_id', 'Mapping': '$.user_id', 'SqlType': 'VARCHAR(32)' }, { 'Name': 'event_type', 'Mapping': '$.event_type', 'SqlType': 'VARCHAR(16)' }, { 'Name': 'timestamp', 'Mapping': '$.timestamp', 'SqlType': 'TIMESTAMP' }, { 'Name': 'amount', 'Mapping': '$.amount', 'SqlType': 'DOUBLE' }, { 'Name': 'metadata', 'Mapping': '$.metadata', 'SqlType': 'VARCHAR(1024)' } ] } } ], Outputs=[ { 'Name': 'DESTINATION_SQL_STREAM', 'KinesisStreamsOutput': { 'ResourceARN': output_stream_arn, 'RoleARN': 'arn:aws:iam::123456789012:role/service-role/kinesis-analytics-role' }, 'DestinationSchema': { 'RecordFormatType': 'JSON' } } ], ApplicationCode=sql_code )
print(f"Analytics application '{application_name}' created successfully") return response
except Exception as e: print(f"Error creating analytics application: {e}") return None
def create_flink_application(self, application_name, s3_bucket, s3_key): """ Create Kinesis Data Analytics Flink application (v2) """ try: response = self.analyticsv2.create_application( ApplicationName=application_name, ApplicationDescription='Flink streaming application', RuntimeEnvironment='FLINK-1_13', ServiceExecutionRole='arn:aws:iam::123456789012:role/service-role/kinesis-analytics-role', ApplicationConfiguration={ 'ApplicationCodeConfiguration': { 'CodeContent': { 'S3ContentLocation': { 'BucketARN': f'arn:aws:s3:::{s3_bucket}', 'FileKey': s3_key } }, 'CodeContentType': 'ZIPFILE' }, 'EnvironmentProperties': { 'PropertyGroups': [ { 'PropertyGroupId': 'kinesis.analytics.flink.run.options', 'PropertyMap': { 'python': 'main.py', 'jarfile': 'flink-app.jar' } } ] }, 'FlinkApplicationConfiguration': { 'CheckpointConfiguration': { 'ConfigurationType': 'DEFAULT' }, 'MonitoringConfiguration': { 'ConfigurationType': 'CUSTOM', 'LogLevel': 'INFO', 'MetricsLevel': 'APPLICATION' }, 'ParallelismConfiguration': { 'ConfigurationType': 'CUSTOM', 'Parallelism': 2, 'ParallelismPerKPU': 1, 'AutoScalingEnabled': True } } } )
print(f"Flink application '{application_name}' created successfully") return response
except Exception as e: print(f"Error creating Flink application: {e}") return None
    def generate_flink_python_code(self):
        """
        Generate example Flink Python application code
        """
        flink_code = '''from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment
from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka
from pyflink.table.window import Tumble
import json
def main(): # Set up execution environment env = StreamExecutionEnvironment.get_execution_environment() t_env = StreamTableEnvironment.create(env)
# Configure checkpointing env.enable_checkpointing(60000) # checkpoint every 60 seconds
# Define source table (Kinesis Data Streams) source_ddl = """ CREATE TABLE source_table ( user_id STRING, event_type STRING, amount DOUBLE, event_timestamp TIMESTAMP(3), WATERMARK FOR event_timestamp AS event_timestamp - INTERVAL '5' SECOND ) WITH ( 'connector' = 'kinesis', 'stream' = 'my-input-stream', 'aws.region' = 'us-east-1', 'scan.stream.initpos' = 'LATEST', 'format' = 'json' ) """
# Define sink table (Kinesis Data Streams) sink_ddl = """ CREATE TABLE sink_table ( user_id STRING, event_count BIGINT, total_amount DOUBLE, window_start TIMESTAMP(3), window_end TIMESTAMP(3) ) WITH ( 'connector' = 'kinesis', 'stream' = 'my-output-stream', 'aws.region' = 'us-east-1', 'format' = 'json' ) """
t_env.execute_sql(source_ddl) t_env.execute_sql(sink_ddl)
# Define the processing logic result = t_env.sql_query(""" SELECT user_id, COUNT(*) as event_count, SUM(amount) as total_amount, TUMBLE_START(event_timestamp, INTERVAL '5' MINUTE) as window_start, TUMBLE_END(event_timestamp, INTERVAL '5' MINUTE) as window_end FROM source_table WHERE event_type = 'purchase' GROUP BY user_id, TUMBLE(event_timestamp, INTERVAL '5' MINUTE) """)
# Insert results into sink table result.execute_insert("sink_table")
if __name__ == '__main__': main()'''
return flink_code
def start_application(self, application_name, input_configurations=None): """ Start Kinesis Data Analytics application """ try: response = self.analytics.start_application( ApplicationName=application_name, InputConfigurations=input_configurations or [] )
print(f"Application '{application_name}' started successfully") return response
except Exception as e: print(f"Error starting application: {e}") return None
# Usage examples
analytics_manager = KinesisAnalyticsManager()

# Create SQL application
# sql_app = analytics_manager.create_sql_application(
#     'my-analytics-app',
#     'arn:aws:kinesis:us-east-1:123456789012:stream/my-input-stream',
#     'arn:aws:kinesis:us-east-1:123456789012:stream/my-output-stream'
# )

# Generate Flink application code
flink_code = analytics_manager.generate_flink_python_code()
print("Flink Python Application Code:")
print(flink_code)

# Create Flink application
# flink_app = analytics_manager.create_flink_application(
#     'my-flink-app',
#     'my-code-bucket',
#     'flink-apps/my-app.zip'
# )
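Note that the start_application helper above calls the v1 SQL client; a Flink application created through kinesisanalyticsv2 is started against that client instead. The sketch below is a minimal, assumed example: the application name is a placeholder and the RunConfiguration shown assumes a fresh start rather than restoring from a snapshot.

import boto3

analyticsv2 = boto3.client('kinesisanalyticsv2')

# Start a Flink (v2) application; AllowNonRestoredState only matters when
# restoring from a snapshot whose state the new job graph no longer uses.
analyticsv2.start_application(
    ApplicationName='my-flink-app',  # placeholder application name
    RunConfiguration={
        'FlinkRunConfiguration': {
            'AllowNonRestoredState': False
        }
    }
)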
Real-time Processing Architectures {#processing-architectures}
Lambda Integration for Stream Processing
import json
import base64
import gzip
from datetime import datetime
class KinesisLambdaProcessor:
    """
    Example Lambda function for processing Kinesis streams
    """
@staticmethod def lambda_handler(event, context): """ Lambda handler for Kinesis stream processing """ processed_records = 0 failed_records = 0
for record in event['Records']: try: # Decode and process the record payload = KinesisLambdaProcessor.decode_record(record) processed_data = KinesisLambdaProcessor.process_record(payload)
# Store processed data (example with DynamoDB) KinesisLambdaProcessor.store_processed_record(processed_data)
processed_records += 1
except Exception as e: print(f"Error processing record: {e}") failed_records += 1
return { 'statusCode': 200, 'body': json.dumps({ 'processed_records': processed_records, 'failed_records': failed_records }) }
@staticmethod def decode_record(record): """ Decode Kinesis record data """ # Decode base64 data = base64.b64decode(record['kinesis']['data'])
# Handle gzip compression if present try: data = gzip.decompress(data) except: pass # Not compressed
# Parse JSON return json.loads(data.decode('utf-8'))
@staticmethod def process_record(data): """ Process individual record with business logic """ processed_data = { 'original_data': data, 'processed_timestamp': datetime.utcnow().isoformat(), 'processing_metadata': {} }
# Example processing logic if 'event_type' in data: processed_data['event_category'] = KinesisLambdaProcessor.categorize_event(data['event_type'])
if 'amount' in data: processed_data['amount_tier'] = KinesisLambdaProcessor.categorize_amount(data['amount'])
# Add anomaly score processed_data['anomaly_score'] = KinesisLambdaProcessor.calculate_anomaly_score(data)
return processed_data
@staticmethod def categorize_event(event_type): """ Categorize event types """ categories = { 'click': 'interaction', 'view': 'interaction', 'scroll': 'interaction', 'purchase': 'transaction', 'payment': 'transaction', 'login': 'authentication', 'logout': 'authentication', 'signup': 'user_management' }
return categories.get(event_type.lower(), 'other')
@staticmethod def categorize_amount(amount): """ Categorize transaction amounts """ if amount < 10: return 'small' elif amount < 100: return 'medium' elif amount < 1000: return 'large' else: return 'enterprise'
@staticmethod def calculate_anomaly_score(data): """ Calculate anomaly score based on various factors """ score = 0.0
# Check for unusual timestamp try: timestamp = datetime.fromisoformat(data['timestamp'].replace('Z', '+00:00')) hour = timestamp.hour
# Higher score for unusual hours (2-6 AM) if 2 <= hour <= 6: score += 0.3
except: pass
# Check for high amounts if 'amount' in data and data['amount'] > 1000: score += 0.4
# Check for rapid events (would need session context) # This is a simplified example if 'session_id' in data and 'event_sequence' in data: if data['event_sequence'] > 10: # More than 10 events in session score += 0.2
return min(score, 1.0) # Cap at 1.0
@staticmethod def store_processed_record(processed_data): """ Store processed record in DynamoDB """ import boto3
dynamodb = boto3.resource('dynamodb') table = dynamodb.Table('ProcessedEvents')
# Create item for DynamoDB item = { 'event_id': processed_data.get('original_data', {}).get('event_id', 'unknown'), 'user_id': processed_data.get('original_data', {}).get('user_id', 'unknown'), 'processed_timestamp': processed_data['processed_timestamp'], 'event_category': processed_data.get('event_category', 'unknown'), 'anomaly_score': processed_data.get('anomaly_score', 0.0), 'original_data': json.dumps(processed_data['original_data']) }
try: table.put_item(Item=item) except Exception as e: print(f"Error storing record: {e}") raise
# Multi-consumer architecture example
class KinesisConsumerManager:
    def __init__(self):
        self.kinesis = boto3.client('kinesis')

    def create_consumer_application(self, stream_name, consumer_name):
        """
        Create a consumer application using enhanced fan-out
        """
        try:
            response = self.kinesis.register_stream_consumer(
                StreamARN=f'arn:aws:kinesis:us-east-1:123456789012:stream/{stream_name}',
                ConsumerName=consumer_name
            )
consumer_arn = response['Consumer']['ConsumerARN'] print(f"Consumer '{consumer_name}' registered with ARN: {consumer_arn}")
return consumer_arn
except Exception as e: print(f"Error registering consumer: {e}") return None
def subscribe_to_shard(self, consumer_arn, shard_id): """ Subscribe to shard with enhanced fan-out """ try: response = self.kinesis.subscribe_to_shard( ConsumerARN=consumer_arn, ShardId=shard_id, StartingPosition={ 'Type': 'LATEST' } )
# Process the event stream for event in response['EventStream']: if 'SubscribeToShardEvent' in event: records = event['SubscribeToShardEvent']['Records'] for record in records: self.process_enhanced_record(record)
except Exception as e: print(f"Error subscribing to shard: {e}")
def process_enhanced_record(self, record): """ Process record from enhanced fan-out consumer """ data = json.loads(record['Data'].decode('utf-8'))
print(f"Processing record: {record['SequenceNumber']}") print(f"Data: {data}") print(f"Approximate arrival: {record['ApproximateArrivalTimestamp']}")
# Error handling and retry patterns
class KinesisErrorHandler:
    def __init__(self):
        self.kinesis = boto3.client('kinesis')
        self.sqs = boto3.client('sqs')
        self.dead_letter_queue_url = 'https://sqs.us-east-1.amazonaws.com/123456789012/kinesis-dlq'
def handle_processing_error(self, record, error): """ Handle processing errors with retry logic and dead letter queue """ retry_count = record.get('retry_count', 0) max_retries = 3
if retry_count < max_retries: # Add retry metadata and put back to stream record['retry_count'] = retry_count + 1 record['last_error'] = str(error) record['retry_timestamp'] = datetime.utcnow().isoformat()
# Put record back to stream for retry self.put_record_for_retry(record)
else: # Send to dead letter queue self.send_to_dead_letter_queue(record, error)
def put_record_for_retry(self, record): """ Put record back to stream for retry """ try: self.kinesis.put_record( StreamName='retry-stream', Data=json.dumps(record), PartitionKey=record.get('partition_key', 'retry') ) except Exception as e: print(f"Error putting record for retry: {e}")
def send_to_dead_letter_queue(self, record, error): """ Send failed record to dead letter queue """ try: message_body = { 'original_record': record, 'processing_error': str(error), 'failed_timestamp': datetime.utcnow().isoformat(), 'retry_attempts': record.get('retry_count', 0) }
self.sqs.send_message( QueueUrl=self.dead_letter_queue_url, MessageBody=json.dumps(message_body) )
print(f"Record sent to dead letter queue: {record.get('event_id', 'unknown')}")
except Exception as e: print(f"Error sending to dead letter queue: {e}")
# Usage examples
lambda_processor = KinesisLambdaProcessor()
consumer_manager = KinesisConsumerManager()
error_handler = KinesisErrorHandler()

# Example Lambda function code
print("Lambda Processor Class created - ready for deployment")

# Consumer registration example
consumer_arn = consumer_manager.create_consumer_application(
    'my-data-stream',
    'analytics-consumer'
)

print("Real-time processing architecture components ready")
Best Practices {#best-practices}
Kinesis Optimization and Operational Excellence
class KinesisBestPractices:
    def __init__(self):
        self.kinesis = boto3.client('kinesis')
        self.cloudwatch = boto3.client('cloudwatch')
def implement_partitioning_strategy(self): """ Implement effective partitioning strategies """ partitioning_strategies = { 'user_based': { 'description': 'Partition by user ID for user-specific processing', 'example': 'user_id', 'benefits': ['Ordered processing per user', 'Easy scaling per user'], 'considerations': ['Hot partitions if few active users', 'Uneven distribution'] }, 'time_based': { 'description': 'Partition by timestamp for time-series data', 'example': 'timestamp_hour', 'benefits': ['Even distribution over time', 'Time-ordered processing'], 'considerations': ['All data goes to same partition at given time'] }, 'hash_based': { 'description': 'Partition using hash function for even distribution', 'example': 'hash(user_id + session_id)', 'benefits': ['Even distribution', 'Predictable partitioning'], 'considerations': ['May break ordering guarantees'] }, 'categorical': { 'description': 'Partition by event type or category', 'example': 'event_type', 'benefits': ['Type-specific processing', 'Easy filtering'], 'considerations': ['Uneven distribution if categories imbalanced'] } }
return partitioning_strategies
def implement_monitoring_and_alerting(self, stream_name): """ Set up comprehensive monitoring and alerting """ # Create CloudWatch alarms for key metrics alarms = []
# High incoming records alarm high_traffic_alarm = self.cloudwatch.put_metric_alarm( AlarmName=f'{stream_name}-HighIncomingRecords', ComparisonOperator='GreaterThanThreshold', EvaluationPeriods=2, MetricName='IncomingRecords', Namespace='AWS/Kinesis', Period=300, Statistic='Sum', Threshold=100000.0, ActionsEnabled=True, AlarmActions=[ 'arn:aws:sns:us-east-1:123456789012:kinesis-alerts' ], AlarmDescription='High incoming records detected', Dimensions=[ { 'Name': 'StreamName', 'Value': stream_name } ], Unit='Count' ) alarms.append('HighIncomingRecords')
# Iterator age alarm (consumer lag) iterator_age_alarm = self.cloudwatch.put_metric_alarm( AlarmName=f'{stream_name}-HighIteratorAge', ComparisonOperator='GreaterThanThreshold', EvaluationPeriods=2, MetricName='GetRecords.IteratorAge', Namespace='AWS/Kinesis', Period=300, Statistic='Maximum', Threshold=60000.0, # 60 seconds ActionsEnabled=True, AlarmActions=[ 'arn:aws:sns:us-east-1:123456789012:kinesis-alerts' ], AlarmDescription='Consumer lag detected', Dimensions=[ { 'Name': 'StreamName', 'Value': stream_name } ], Unit='Milliseconds' ) alarms.append('HighIteratorAge')
# Write provisioned throughput exceeded write_throttle_alarm = self.cloudwatch.put_metric_alarm( AlarmName=f'{stream_name}-WriteProvisionedThroughputExceeded', ComparisonOperator='GreaterThanThreshold', EvaluationPeriods=1, MetricName='WriteProvisionedThroughputExceeded', Namespace='AWS/Kinesis', Period=300, Statistic='Sum', Threshold=0.0, ActionsEnabled=True, AlarmActions=[ 'arn:aws:sns:us-east-1:123456789012:kinesis-alerts' ], AlarmDescription='Write throttling detected', Dimensions=[ { 'Name': 'StreamName', 'Value': stream_name } ], Unit='Count' ) alarms.append('WriteProvisionedThroughputExceeded')
return alarms
    def implement_error_handling_patterns(self):
        """
        Implement comprehensive error handling patterns
        """
        error_patterns = {
            'exponential_backoff': '''import time
import random
def exponential_backoff_retry(func, max_retries=3, base_delay=1): for attempt in range(max_retries): try: return func() except Exception as e: if attempt == max_retries - 1: raise e
delay = base_delay * (2 ** attempt) + random.uniform(0, 1) print(f"Retry attempt {attempt + 1} after {delay:.2f} seconds") time.sleep(delay)
raise Exception("Max retries exceeded")''', 'circuit_breaker': '''import time
class CircuitBreaker: def __init__(self, failure_threshold=5, timeout=60): self.failure_threshold = failure_threshold self.timeout = timeout self.failure_count = 0 self.last_failure_time = None self.state = 'CLOSED' # CLOSED, OPEN, HALF_OPEN
def call(self, func): if self.state == 'OPEN': if time.time() - self.last_failure_time >= self.timeout: self.state = 'HALF_OPEN' else: raise Exception("Circuit breaker is OPEN")
try: result = func() self.on_success() return result except Exception as e: self.on_failure() raise e
def on_success(self): self.failure_count = 0 self.state = 'CLOSED'
def on_failure(self): self.failure_count += 1 self.last_failure_time = time.time()
        if self.failure_count >= self.failure_threshold:
            self.state = 'OPEN'
''',
            'dead_letter_queue': '''import boto3
import json
from datetime import datetime
class DeadLetterQueueHandler: def __init__(self, queue_url): self.sqs = boto3.client('sqs') self.queue_url = queue_url
def send_failed_record(self, record, error, retry_count=0): message = { 'original_record': record, 'error_message': str(error), 'failure_timestamp': datetime.utcnow().isoformat(), 'retry_count': retry_count, 'source': 'kinesis_processor' }
try: self.sqs.send_message( QueueUrl=self.queue_url, MessageBody=json.dumps(message), MessageAttributes={ 'ErrorType': { 'StringValue': type(error).__name__, 'DataType': 'String' }, 'RetryCount': { 'StringValue': str(retry_count), 'DataType': 'Number' } } ) except Exception as e: print(f"Failed to send to DLQ: {e}")''' }
return error_patterns
def implement_scaling_strategies(self, stream_name): """ Implement scaling strategies for Kinesis streams """ scaling_strategies = { 'auto_scaling': { 'description': 'Automatic scaling based on metrics', 'implementation': self.setup_auto_scaling(stream_name), 'triggers': [ 'IncomingRecords > threshold', 'WriteProvisionedThroughputExceeded > 0', 'ReadProvisionedThroughputExceeded > 0' ] }, 'predictive_scaling': { 'description': 'Scale based on predicted traffic patterns', 'considerations': [ 'Historical traffic analysis', 'Time-based scaling schedules', 'Event-driven scaling' ] }, 'on_demand_mode': { 'description': 'Use on-demand capacity mode for variable workloads', 'benefits': [ 'No capacity planning required', 'Automatic scaling', 'Pay per use' ], 'limitations': [ 'Higher cost for consistent workloads', 'Default limits apply' ] } }
return scaling_strategies
    def setup_auto_scaling(self, stream_name):
        """
        Set up auto-scaling for Kinesis stream
        """
        auto_scaling_config = {
            'lambda_function': '''import boto3
import json
def lambda_handler(event, context): kinesis = boto3.client('kinesis') cloudwatch = boto3.client('cloudwatch')
stream_name = event['stream_name']
# Get current metrics metrics = get_stream_metrics(cloudwatch, stream_name)
# Determine if scaling is needed current_shards = get_current_shard_count(kinesis, stream_name) target_shards = calculate_target_shards(metrics, current_shards)
if target_shards != current_shards: scale_stream(kinesis, stream_name, target_shards)
return { 'statusCode': 200, 'body': json.dumps({ 'action': 'scaled', 'from_shards': current_shards, 'to_shards': target_shards }) }
return { 'statusCode': 200, 'body': json.dumps({'action': 'no_scaling_needed'}) }
def get_stream_metrics(cloudwatch, stream_name): # Implementation to get metrics pass
def get_current_shard_count(kinesis, stream_name): response = kinesis.describe_stream(StreamName=stream_name) return len(response['StreamDescription']['Shards'])
def calculate_target_shards(metrics, current_shards): # Scaling algorithm implementation incoming_records_per_sec = metrics.get('incoming_records_per_sec', 0)
# Each shard can handle ~1000 records/sec or 1MB/sec target_shards = max(1, int(incoming_records_per_sec / 1000) + 1)
# Limit scaling changes max_increase = current_shards * 2 max_decrease = max(1, current_shards // 2)
return max(max_decrease, min(target_shards, max_increase))
def scale_stream(kinesis, stream_name, target_shards): kinesis.update_shard_count( StreamName=stream_name, TargetShardCount=target_shards, ScalingType='UNIFORM_SCALING' )''', 'cloudwatch_rule': { 'schedule': 'rate(5 minutes)', 'target': 'scaling_lambda_function' } }
return auto_scaling_config
def implement_security_best_practices(self): """ Implement security best practices """ security_practices = { 'encryption': { 'at_rest': { 'description': 'Encrypt data at rest using AWS KMS', 'configuration': { 'EncryptionType': 'KMS', 'KeyId': 'arn:aws:kms:us-east-1:123456789012:key/12345678-1234-1234-1234-123456789012' } }, 'in_transit': { 'description': 'All API calls use TLS 1.2', 'implementation': 'Automatic with AWS SDK' } }, 'access_control': { 'iam_policies': '''{ "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Action": [ "kinesis:PutRecord", "kinesis:PutRecords" ], "Resource": "arn:aws:kinesis:*:*:stream/my-stream" }, { "Effect": "Allow", "Action": [ "kinesis:DescribeStream", "kinesis:GetShardIterator", "kinesis:GetRecords" ], "Resource": "arn:aws:kinesis:*:*:stream/my-stream", "Condition": { "StringEquals": { "kinesis:consumer-name": "my-application" } } } ]}''', 'vpc_endpoints': { 'description': 'Use VPC endpoints for private connectivity', 'benefits': [ 'Traffic stays within AWS network', 'Reduced data transfer costs', 'Enhanced security' ] } }, 'monitoring': { 'cloudtrail': 'Enable CloudTrail for API audit logging', 'vpc_flow_logs': 'Enable VPC Flow Logs for network monitoring', 'access_logging': 'Log all access attempts and patterns' } }
return security_practices
# Best practices implementation
best_practices = KinesisBestPractices()

# Get partitioning strategies
partitioning = best_practices.implement_partitioning_strategy()
print("Kinesis Partitioning Strategies:")
print(json.dumps(partitioning, indent=2))

# Set up monitoring
alarms = best_practices.implement_monitoring_and_alerting('my-production-stream')
print(f"\nMonitoring alarms created: {alarms}")

# Get error handling patterns
error_patterns = best_practices.implement_error_handling_patterns()
print("\nError Handling Patterns:")
for pattern, code in error_patterns.items():
    print(f"\n{pattern}:")
    print(code)

# Get scaling strategies
scaling = best_practices.implement_scaling_strategies('my-production-stream')
print("\nScaling Strategies:")
print(json.dumps(scaling, indent=2, default=str))

# Get security best practices
security = best_practices.implement_security_best_practices()
print("\nSecurity Best Practices:")
print(json.dumps(security, indent=2))
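The hash-based strategy listed above can be implemented with a few lines of standard-library hashing. This sketch derives a partition key from a user/session pair (field names are illustrative): events within one session keep their ordering because they share a key, while different sessions of a hot user can land on different shards; as the strategy table notes, that trades away strict per-user ordering.

import hashlib

def composite_partition_key(user_id, session_id):
    """Derive a stable partition key from user_id + session_id."""
    return hashlib.sha256(f"{user_id}:{session_id}".encode("utf-8")).hexdigest()

# The same session always maps to the same key (and therefore the same shard),
# but separate sessions of one user spread across shards.
print(composite_partition_key("user123", "session_7"))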
Cost Optimization {#cost-optimization}
Kinesis Cost Management
class KinesisCostOptimizer:
    def __init__(self):
        self.kinesis = boto3.client('kinesis')
        self.firehose = boto3.client('firehose')  # used by optimize_firehose_delivery_streams
        self.ce = boto3.client('ce')  # Cost Explorer
def analyze_kinesis_costs(self, start_date, end_date): """ Analyze Kinesis costs across all services """ try: response = self.ce.get_cost_and_usage( TimePeriod={ 'Start': start_date.strftime('%Y-%m-%d'), 'End': end_date.strftime('%Y-%m-%d') }, Granularity='MONTHLY', Metrics=['BlendedCost', 'UsageQuantity'], GroupBy=[ { 'Type': 'DIMENSION', 'Key': 'SERVICE' } ], Filter={ 'Dimensions': { 'Key': 'SERVICE', 'Values': [ 'Amazon Kinesis', 'Amazon Kinesis Firehose', 'Amazon Kinesis Analytics' ] } } )
cost_breakdown = {} for result in response['ResultsByTime']: for group in result['Groups']: service = group['Keys'][0] cost = float(group['Metrics']['BlendedCost']['Amount']) usage = float(group['Metrics']['UsageQuantity']['Amount'])
if service not in cost_breakdown: cost_breakdown[service] = {'cost': 0, 'usage': 0}
cost_breakdown[service]['cost'] += cost cost_breakdown[service]['usage'] += usage
return cost_breakdown
except Exception as e: print(f"Error analyzing Kinesis costs: {e}") return {}
def optimize_data_streams(self): """ Analyze and optimize Kinesis Data Streams costs """ try: streams = self.kinesis.list_streams()
optimization_recommendations = []
for stream_name in streams['StreamNames']: stream_desc = self.kinesis.describe_stream(StreamName=stream_name) stream_info = stream_desc['StreamDescription']
recommendations = [] current_cost_per_month = 0 potential_savings = 0
# Analyze shard count and utilization shard_count = len(stream_info['Shards']) shard_cost_per_month = shard_count * 15 # $15 per shard per month current_cost_per_month += shard_cost_per_month
# Check for over-provisioning if shard_count > 1: recommendations.append({ 'type': 'shard_optimization', 'description': 'Consider using on-demand mode for variable workloads', 'potential_monthly_savings': shard_cost_per_month * 0.3, 'action': 'Switch to on-demand capacity mode' })
# Check retention period retention_hours = stream_info['RetentionPeriodHours'] if retention_hours > 168: # More than 7 days extended_retention_cost = (retention_hours - 24) * shard_count * 0.014 current_cost_per_month += extended_retention_cost
recommendations.append({ 'type': 'retention_optimization', 'description': f'Retention period is {retention_hours} hours', 'potential_monthly_savings': extended_retention_cost * 0.5, 'action': 'Review data retention requirements' })
# Check for encryption costs if stream_info.get('EncryptionType') == 'KMS': kms_cost_estimate = shard_count * 2 # Rough estimate current_cost_per_month += kms_cost_estimate
if shard_count > 5: recommendations.append({ 'type': 'encryption_optimization', 'description': 'High KMS costs with many shards', 'potential_monthly_savings': kms_cost_estimate * 0.2, 'action': 'Consider using AWS owned keys for non-sensitive data' })
if recommendations: total_potential_savings = sum(r['potential_monthly_savings'] for r in recommendations)
optimization_recommendations.append({ 'stream_name': stream_name, 'current_monthly_cost': current_cost_per_month, 'shard_count': shard_count, 'retention_hours': retention_hours, 'recommendations': recommendations, 'total_potential_savings': total_potential_savings })
return optimization_recommendations
except Exception as e: print(f"Error optimizing data streams: {e}") return []
def optimize_firehose_delivery_streams(self): """ Optimize Firehose delivery streams for cost """ try: response = self.firehose.list_delivery_streams()
optimization_recommendations = []
for stream_name in response['DeliveryStreamNames']: stream_desc = self.firehose.describe_delivery_stream( DeliveryStreamName=stream_name )
destinations = stream_desc['DeliveryStreamDescription']['Destinations'] recommendations = []
for dest in destinations: # Check S3 configuration if 'S3DestinationDescription' in dest: s3_config = dest['S3DestinationDescription']
# Check compression if s3_config.get('CompressionFormat', 'UNCOMPRESSED') == 'UNCOMPRESSED': recommendations.append({ 'type': 'compression', 'description': 'Enable GZIP compression to reduce storage costs', 'potential_savings': '60-70% storage cost reduction', 'action': 'Enable GZIP compression' })
# Check buffering configuration buffering = s3_config.get('BufferingHints', {}) buffer_size = buffering.get('SizeInMBs', 5) buffer_interval = buffering.get('IntervalInSeconds', 300)
if buffer_size < 128 or buffer_interval < 900: recommendations.append({ 'type': 'buffering_optimization', 'description': 'Optimize buffering to reduce API calls', 'potential_savings': '20-30% delivery cost reduction', 'action': f'Increase buffer size to 128MB and interval to 900s' })
# Check transformation costs if 'ProcessingConfiguration' in dest and dest['ProcessingConfiguration']['Enabled']: recommendations.append({ 'type': 'transformation_review', 'description': 'Review data transformation necessity', 'potential_savings': 'Lambda execution cost savings', 'action': 'Review if all transformations are necessary' })
if recommendations: optimization_recommendations.append({ 'delivery_stream_name': stream_name, 'recommendations': recommendations })
return optimization_recommendations
except Exception as e: print(f"Error optimizing Firehose streams: {e}") return []
def calculate_cost_projections(self, usage_patterns): """ Calculate cost projections for different usage patterns """ cost_calculator = { 'data_streams': { 'provisioned_mode': { 'shard_hour': 0.015, # $0.015 per shard hour 'put_payload_unit': 0.014, # $0.014 per million PUT payload units 'extended_retention': 0.023 # $0.023 per shard hour for extended retention }, 'on_demand_mode': { 'data_in_per_gb': 0.033, # $0.033 per GB data ingested 'data_out_per_gb': 0.055 # $0.055 per GB data retrieved } }, 'firehose': { 'data_ingested_per_gb': 0.029, # $0.029 per GB ingested 'format_conversion_per_gb': 0.018, # $0.018 per GB for format conversion 'vpc_delivery_per_gb': 0.01 # $0.01 per GB for VPC delivery }, 'analytics': { 'kpu_hour': 0.11, # $0.11 per KPU hour 'running_application_per_gb': 0.05 # $0.05 per GB processed } }
projections = {}
for service, patterns in usage_patterns.items(): if service == 'data_streams': # Calculate both provisioned and on-demand costs provisioned_cost = self._calculate_data_streams_provisioned_cost( patterns, cost_calculator['data_streams']['provisioned_mode'] ) on_demand_cost = self._calculate_data_streams_on_demand_cost( patterns, cost_calculator['data_streams']['on_demand_mode'] )
projections[service] = { 'provisioned_monthly_cost': provisioned_cost, 'on_demand_monthly_cost': on_demand_cost, 'recommended_mode': 'on_demand' if on_demand_cost < provisioned_cost else 'provisioned', 'savings_opportunity': abs(provisioned_cost - on_demand_cost) }
elif service == 'firehose': monthly_cost = patterns['gb_per_month'] * cost_calculator['firehose']['data_ingested_per_gb'] if patterns.get('format_conversion', False): monthly_cost += patterns['gb_per_month'] * cost_calculator['firehose']['format_conversion_per_gb']
projections[service] = { 'monthly_cost': monthly_cost, 'cost_per_gb': cost_calculator['firehose']['data_ingested_per_gb'] }
elif service == 'analytics': monthly_cost = (patterns['kpu_hours_per_month'] * cost_calculator['analytics']['kpu_hour'] + patterns['gb_processed_per_month'] * cost_calculator['analytics']['running_application_per_gb'])
projections[service] = { 'monthly_cost': monthly_cost, 'kpu_cost': patterns['kpu_hours_per_month'] * cost_calculator['analytics']['kpu_hour'], 'processing_cost': patterns['gb_processed_per_month'] * cost_calculator['analytics']['running_application_per_gb'] }
return projections
def _calculate_data_streams_provisioned_cost(self, patterns, pricing): """ Calculate provisioned mode costs for data streams """ monthly_hours = 24 * 30 # 720 hours per month
shard_cost = patterns['shard_count'] * monthly_hours * pricing['shard_hour'] put_cost = (patterns['records_per_month'] / 1000000) * pricing['put_payload_unit']
extended_retention_cost = 0 if patterns.get('retention_hours', 24) > 24: extended_hours = patterns['retention_hours'] - 24 extended_retention_cost = (patterns['shard_count'] * extended_hours * pricing['extended_retention'] * 30) # 30 days
return shard_cost + put_cost + extended_retention_cost
def _calculate_data_streams_on_demand_cost(self, patterns, pricing): """ Calculate on-demand mode costs for data streams """ data_in_cost = patterns['gb_ingested_per_month'] * pricing['data_in_per_gb'] data_out_cost = patterns['gb_retrieved_per_month'] * pricing['data_out_per_gb']
return data_in_cost + data_out_cost
def generate_cost_optimization_report(self): """ Generate comprehensive cost optimization report """ from datetime import datetime, timedelta
end_date = datetime.utcnow() start_date = end_date - timedelta(days=90) # Last 3 months
report = { 'report_date': datetime.utcnow().isoformat(), 'analysis_period': f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}", 'current_costs': self.analyze_kinesis_costs(start_date, end_date), 'data_streams_optimization': self.optimize_data_streams(), 'firehose_optimization': self.optimize_firehose_delivery_streams(), 'recommendations_summary': { 'immediate_actions': [ 'Enable compression on Firehose delivery streams', 'Review data retention periods for streams', 'Consider on-demand mode for variable workloads', 'Optimize buffering configurations' ], 'cost_reduction_strategies': [ 'Right-size shard counts based on actual usage', 'Implement data lifecycle policies', 'Use appropriate compression formats', 'Optimize data transformation logic' ] } }
# Calculate total potential savings total_savings = 0 for stream_opt in report['data_streams_optimization']: total_savings += stream_opt.get('total_potential_savings', 0)
report['total_monthly_savings_potential'] = total_savings report['annual_savings_projection'] = total_savings * 12
return report
# Cost optimization examples
cost_optimizer = KinesisCostOptimizer()

# Example usage patterns for cost projection
usage_patterns = {
    'data_streams': {
        'shard_count': 5,
        'records_per_month': 10000000,  # 10M records
        'gb_ingested_per_month': 100,
        'gb_retrieved_per_month': 50,
        'retention_hours': 168  # 7 days
    },
    'firehose': {
        'gb_per_month': 500,
        'format_conversion': True
    },
    'analytics': {
        'kpu_hours_per_month': 720,  # 1 KPU running 24/7
        'gb_processed_per_month': 1000
    }
}

# Calculate cost projections
projections = cost_optimizer.calculate_cost_projections(usage_patterns)
print("Kinesis Cost Projections:")
print(json.dumps(projections, indent=2))

# Generate comprehensive cost optimization report
report = cost_optimizer.generate_cost_optimization_report()
print("\nKinesis Cost Optimization Report:")
print(json.dumps(report, indent=2, default=str))

print(f"\nTotal Monthly Savings Potential: ${report['total_monthly_savings_potential']:.2f}")
print(f"Annual Savings Projection: ${report['annual_savings_projection']:.2f}")
Conclusion
Amazon Kinesis provides a comprehensive platform for real-time data streaming and analytics on AWS. Key takeaways:
Essential Services:
- Kinesis Data Streams: Real-time data ingestion and processing with configurable retention
- Kinesis Data Firehose: Serverless data delivery to AWS data stores with transformation capabilities
- Kinesis Data Analytics: Real-time analytics with SQL and Apache Flink
- Kinesis Video Streams: Video streaming for analytics and machine learning
Advanced Capabilities:
- Multiple ingestion patterns: Producer libraries, direct API calls, and agent-based collection
- Flexible consumer models: Lambda integration, enhanced fan-out, and custom consumers
- Real-time processing: Stream processing with windowing, aggregation, and complex event processing
- Seamless integration: Native integration with AWS services and third-party tools
Best Practices:
- Implement effective partitioning strategies for optimal performance and scaling
- Set up comprehensive monitoring and alerting for operational excellence
- Use appropriate error handling patterns with retries and dead letter queues
- Implement security best practices with encryption and access controls
- Optimize costs through right-sizing, compression, and appropriate capacity modes
Cost Optimization Strategies:
- Choose between provisioned and on-demand capacity modes based on usage patterns
- Enable compression to reduce storage and transfer costs
- Optimize buffering configurations to reduce API calls
- Review data retention periods and transformation requirements
- Monitor usage patterns and adjust capacity accordingly
Operational Excellence:
- Implement comprehensive monitoring with CloudWatch metrics and alarms
- Use infrastructure as code for deployment and configuration management
- Establish disaster recovery and backup strategies
- Maintain proper documentation and runbooks
- Regular cost reviews and optimization cycles
Amazon Kinesis enables organizations to build real-time data processing architectures that handle massive scale and process data as it arrives, making it well suited to modern data-driven applications that require immediate insights and rapid responses.