Day 96 - Modern Infrastructure as Code: From Terraform to Pulumi

Infrastructure as Code (IaC) has revolutionized how we provision and manage infrastructure. As we move into 2025, the landscape has evolved beyond declarative configuration to include full programming languages, policy engines, and AI-assisted infrastructure management. Today, we’ll explore modern IaC tools, patterns, and best practices for managing infrastructure at scale.

The Evolution of Infrastructure as Code

The IaC journey has been transformative:

2011-2014: Configuration management (Puppet, Chef, Ansible)
2014-2018: Declarative IaC (Terraform, CloudFormation)
2018-2021: Cloud-native tools (CDK, Pulumi)
2021-2025: AI-assisted IaC and policy-driven infrastructure

Modern IaC Tools Comparison

Terraform: The Industry Standard

1
terraform {
2
  required_version = ">= 1.5.0"
3

4
  required_providers {
5
    aws = {
6
      source  = "hashicorp/aws"
7
      version = "~> 5.0"
8
    }
9
    kubernetes = {
10
      source  = "hashicorp/kubernetes"
11
      version = "~> 2.23"
12
    }
13
  }
14
}
15

16
# Data source for availability zones
17
data "aws_availability_zones" "available" {
18
  state = "available"
19
}
20

21
# EKS Cluster
22
# EKS control plane. Worker capacity is attached separately through
# aws_eks_node_group resources.
resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  version  = var.kubernetes_version
  role_arn = aws_iam_role.cluster.arn

  # Ship every control-plane log stream to CloudWatch.
  enabled_cluster_log_types = [
    "api",
    "audit",
    "authenticator",
    "controllerManager",
    "scheduler",
  ]

  # API endpoint placement: always reachable privately; public exposure and
  # its CIDR allow-list are caller-controlled.
  vpc_config {
    subnet_ids              = aws_subnet.private[*].id
    security_group_ids      = [aws_security_group.cluster.id]
    endpoint_private_access = true
    endpoint_public_access  = var.enable_public_access
    public_access_cidrs     = var.public_access_cidrs
  }

  # Envelope-encrypt Kubernetes Secrets with the referenced KMS key.
  encryption_config {
    resources = ["secrets"]

    provider {
      key_arn = aws_kms_key.eks.arn
    }
  }

  tags = merge(var.tags, { Name = var.cluster_name })

  # The IAM attachments and the log group must exist before the cluster is
  # created (and must outlive it on destroy).
  depends_on = [
    aws_iam_role_policy_attachment.cluster_AmazonEKSClusterPolicy,
    aws_iam_role_policy_attachment.cluster_AmazonEKSVPCResourceController,
    aws_cloudwatch_log_group.eks,
  ]
}
57

58
# Node Groups with mixed instance types
59
# One managed node group per entry in var.node_groups (map keyed by group name).
# Each value is read for: instance_types, capacity_type, desired_size, max_size,
# min_size, labels and taints (a list of objects with key / value / effect).
resource "aws_eks_node_group" "main" {
  for_each = var.node_groups

  cluster_name    = aws_eks_cluster.main.name
  node_group_name = each.key
  node_role_arn   = aws_iam_role.node.arn
  subnet_ids      = aws_subnet.private[*].id

  instance_types = each.value.instance_types
  capacity_type  = each.value.capacity_type # ON_DEMAND or SPOT

  scaling_config {
    desired_size = each.value.desired_size
    max_size     = each.value.max_size
    min_size     = each.value.min_size
  }

  update_config {
    max_unavailable_percentage = 33
  }

  launch_template {
    id      = aws_launch_template.node[each.key].id
    version = aws_launch_template.node[each.key].latest_version
  }

  labels = each.value.labels

  # Taints are nested `taint` blocks on this resource, not a list-valued
  # argument, so they have to be generated with a dynamic block.
  dynamic "taint" {
    for_each = each.value.taints

    content {
      key    = taint.value.key
      value  = taint.value.value
      effect = taint.value.effect
    }
  }

  lifecycle {
    create_before_destroy = true
    # desired_size is excluded from drift detection so that size changes made
    # outside Terraform (e.g. by an autoscaler) are not reverted on apply.
    ignore_changes = [scaling_config[0].desired_size]
  }

  depends_on = [
    aws_iam_role_policy_attachment.node_AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.node_AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.node_AmazonEC2ContainerRegistryReadOnly,
  ]
}
106

107
# aws-auth ConfigMap: maps IAM roles and users to Kubernetes identities.
# The node role is always mapped so workers can register; additional entries
# come from var.map_roles and var.map_users.
#
# NOTE(review): EKS is understood to create this ConfigMap itself when a
# managed node group is created, which can make this resource fail with
# "already exists". Confirm, and consider managing only its data instead.
resource "kubernetes_config_map" "aws_auth" {
  metadata {
    name      = "aws-auth"
    namespace = "kube-system"
  }

  data = {
    mapRoles = yamlencode(concat(
      [
        {
          rolearn  = aws_iam_role.node.arn
          username = "system:node:{{EC2PrivateDNSName}}"
          groups   = ["system:bootstrappers", "system:nodes"]
        }
      ],
      var.map_roles
    ))
    mapUsers = yamlencode(var.map_users)
  }

  depends_on = [aws_eks_cluster.main]
}

Pulumi: Infrastructure as Real Code

1
import * as pulumi from "@pulumi/pulumi";
import { ComponentResource, ComponentResourceOptions } from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
import * as awsx from "@pulumi/awsx";
import * as eks from "@pulumi/eks";
import * as k8s from "@pulumi/kubernetes";
6

7
/**
 * Settings for one worker node group. The shape is derived from how the
 * listing uses it: the example stack supplies every field, and only the
 * second group sets spotPrice / taints / labels.
 */
interface NodeGroupConfig {
  name: string;
  instanceTypes: string[];
  minSize: number;
  maxSize: number;
  /** Root volume size in GiB; the cluster falls back to 100 when omitted. */
  diskSize?: number;
  useSpot: boolean;
  /** Only read when useSpot is true. */
  spotPrice?: string;
  /** Taints in kubelet syntax, e.g. "workload=compute:NoSchedule". */
  taints?: string[];
  labels?: Record<string, string>;
}

/** Observability toggles forwarded to the cluster's monitoring setup. */
interface MonitoringConfig {
  enablePrometheus: boolean;
  enableGrafana: boolean;
  enableLoki: boolean;
  enableTempo: boolean;
  retentionDays: number;
}

/** Constructor arguments for ModernKubernetesCluster. */
interface ModernClusterArgs {
  vpcCidr: string;
  availabilityZones: string[];
  nodeGroups: NodeGroupConfig[];
  enableGitOps: boolean;
  enableServiceMesh: boolean;
  monitoringConfig: MonitoringConfig;
}
15

16
/**
 * Component resource bundling a VPC, an EKS cluster with its node groups and
 * Fargate profile, the core Helm add-ons, and optional GitOps / service-mesh /
 * monitoring layers.
 *
 * Outputs registered on the component: `clusterName` and `kubeconfig`.
 *
 * NOTE(review): `createIRSARole`, `installServiceMesh` and `setupMonitoring`
 * are called below but are not defined anywhere in this listing; they must be
 * supplied before the class compiles.
 * NOTE(review): the `vpc`, `nodeGroups`, `launchTemplate`, `autoScaling` and
 * `fargateProfiles` option names are kept from the original listing — confirm
 * them against the @pulumi/eks `ClusterOptions` reference for your version.
 */
class ModernKubernetesCluster extends ComponentResource {
  public cluster: eks.Cluster;
  public kubeconfig: pulumi.Output<any>;

  /**
   * @param name  Logical name; used as the prefix for child resources.
   * @param args  VPC, node-group and feature-toggle settings.
   * @param opts  Standard Pulumi component options.
   */
  constructor(name: string, args: ModernClusterArgs, opts?: ComponentResourceOptions) {
    super("custom:infrastructure:ModernKubernetesCluster", name, {}, opts);

    // The cluster name is fixed up front so it can be referenced by the VPC
    // tag and the node bootstrap script, both of which are built before the
    // cluster resource exists.
    const clusterName = `${name}-cluster`;

    // Create VPC with advanced networking
    const vpc = new awsx.ec2.Vpc(`${name}-vpc`, {
      cidrBlock: args.vpcCidr,
      numberOfAvailabilityZones: args.availabilityZones.length,
      natGateways: {
        // One NAT gateway per AZ, so losing an AZ does not cut egress elsewhere.
        strategy: awsx.ec2.NatGatewayStrategy.OnePerAz,
      },
      tags: {
        // A computed key needs bracket syntax; `"a" + b: value` is not valid.
        [`kubernetes.io/cluster/${clusterName}`]: "shared",
      },
    }, { parent: this });

    // Create EKS cluster with OIDC
    this.cluster = new eks.Cluster(clusterName, {
      name: clusterName,
      vpc: vpc,
      version: "1.28",
      // Needed for the IAM-roles-for-service-accounts annotations used by the add-ons.
      createOidcProvider: true,
      nodeAssociatePublicIpAddress: false,
      endpointPrivateAccess: true,
      endpointPublicAccess: true,
      enabledClusterLogTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"],

      // Advanced node group configurations
      nodeGroups: args.nodeGroups.map(ng => ({
        ...ng,
        instanceType: ng.instanceTypes,
        nodeAssociatePublicIpAddress: false,

        // Spot instance support
        spotPrice: ng.useSpot ? ng.spotPrice : undefined,

        // Custom launch template
        launchTemplate: {
          userData: this.generateUserData(ng, clusterName),
          blockDeviceMappings: [{
            deviceName: "/dev/xvda",
            ebs: {
              volumeSize: ng.diskSize || 100,
              volumeType: "gp3",
              iops: 3000,
              throughput: 125,
              encrypted: true,
            },
          }],
        },

        // Auto-scaling configuration
        autoScaling: {
          enabled: true,
          minSize: ng.minSize,
          maxSize: ng.maxSize,
          targetCpuUtilization: 70,
          targetMemoryUtilization: 80,
        },
      })),

      // Fargate profiles for serverless workloads
      fargateProfiles: [{
        name: "system-critical",
        selectors: [{
          namespace: "kube-system",
          labels: {
            "compute-type": "fargate",
          },
        }],
      }],
    }, { parent: this });

    // Install core add-ons
    this.installCoreAddons();

    // GitOps setup
    if (args.enableGitOps) {
      this.setupGitOps();
    }

    // Service mesh installation
    if (args.enableServiceMesh) {
      this.installServiceMesh();
    }

    // Monitoring and observability
    this.setupMonitoring(args.monitoringConfig);

    this.kubeconfig = this.cluster.kubeconfig;

    this.registerOutputs({
      clusterName: this.cluster.eksCluster.name,
      kubeconfig: this.kubeconfig,
    });
  }

  /**
   * Builds the bash bootstrap script for one node group.
   *
   * The cluster name is passed in as a plain string: this method runs while
   * the cluster's constructor arguments are still being assembled, so
   * `this.cluster` is not assigned yet, and a Pulumi Output cannot be
   * interpolated into an ordinary template string anyway.
   *
   * @param nodeGroup    Node group whose name and taints become kubelet args.
   * @param clusterName  Cluster name handed to /etc/eks/bootstrap.sh.
   * @returns The complete user-data script.
   */
  private generateUserData(nodeGroup: NodeGroupConfig, clusterName: string): string {
    // Only pass --register-with-taints when there is something to register.
    const kubeletArgs = [`--node-labels=nodegroup=${nodeGroup.name}`];
    if (nodeGroup.taints?.length) {
      kubeletArgs.push(`--register-with-taints=${nodeGroup.taints.join(",")}`);
    }

    return `#!/bin/bash
set -ex

# Install SSM agent for secure access
yum install -y amazon-ssm-agent
systemctl enable amazon-ssm-agent
systemctl start amazon-ssm-agent

# Configure container runtime
# The directory may be absent on images without Docker; under 'set -e' a
# failed redirect would abort the script before the node joins.
mkdir -p /etc/docker
cat <<EOF > /etc/docker/daemon.json
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "5"
  },
  "storage-driver": "overlay2"
}
EOF

# Set up kubelet extra args
mkdir -p /etc/systemd/system/kubelet.service.d
cat <<EOF > /etc/systemd/system/kubelet.service.d/10-kubelet-args.conf
[Service]
Environment="KUBELET_EXTRA_ARGS=${kubeletArgs.join(" ")}"
EOF

# Optimize kernel parameters
cat <<EOF >> /etc/sysctl.conf
net.ipv4.ip_forward = 1
net.bridge.bridge-nf-call-iptables = 1
vm.max_map_count = 262144
fs.inotify.max_user_instances = 8192
fs.inotify.max_user_watches = 524288
EOF

sysctl -p

# Join the cluster
/etc/eks/bootstrap.sh ${clusterName}
`;
  }

  /**
   * Installs the AWS Load Balancer Controller and the EBS CSI driver as Helm
   * charts, each with a service account annotated with its own IAM role.
   *
   * Deliberately synchronous: the constructor does not await it, and an
   * un-awaited async method would surface failures as unhandled rejections.
   */
  private installCoreAddons() {
    // AWS Load Balancer Controller
    new k8s.helm.v3.Chart("aws-load-balancer-controller", {
      chart: "aws-load-balancer-controller",
      version: "1.6.2",
      namespace: "kube-system",
      fetchOpts: {
        repo: "https://aws.github.io/eks-charts",
      },
      values: {
        clusterName: this.cluster.eksCluster.name,
        serviceAccount: {
          create: true,
          annotations: {
            "eks.amazonaws.com/role-arn": this.createIRSARole("aws-load-balancer-controller", [
              "arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess",
            ]).arn,
          },
        },
      },
    }, { provider: this.cluster.provider, parent: this });

    // EBS CSI Driver
    new k8s.helm.v3.Chart("aws-ebs-csi-driver", {
      chart: "aws-ebs-csi-driver",
      namespace: "kube-system",
      fetchOpts: {
        repo: "https://kubernetes-sigs.github.io/aws-ebs-csi-driver",
      },
      values: {
        controller: {
          serviceAccount: {
            create: true,
            annotations: {
              "eks.amazonaws.com/role-arn": this.createIRSARole("ebs-csi-controller", [
                "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy",
              ]).arn,
            },
          },
        },
      },
    }, { provider: this.cluster.provider, parent: this });
  }

  /**
   * Installs Flux v2 and points it at the repository named by the
   * GIT_REPO_URL environment variable.
   *
   * @throws Error when GIT_REPO_URL is unset or empty; otherwise the literal
   *         string "undefined" would be deployed as the repository URL.
   */
  private setupGitOps() {
    const gitRepoUrl = process.env.GIT_REPO_URL;
    if (!gitRepoUrl) {
      throw new Error("GIT_REPO_URL must be set when enableGitOps is true");
    }

    // Install Flux v2
    new k8s.helm.v3.Chart("flux2", {
      chart: "flux2",
      namespace: "flux-system",
      fetchOpts: {
        repo: "https://fluxcd-community.github.io/helm-charts",
      },
      values: {
        gitRepository: {
          url: gitRepoUrl,
          branch: "main",
          interval: "1m",
        },
        kustomization: {
          path: "./clusters/production",
          prune: true,
          interval: "10m",
        },
      },
    }, { provider: this.cluster.provider, parent: this });
  }
}
225

226
// Export modern infrastructure with type safety
227
/**
 * Provisions the "production" cluster with one on-demand general-purpose node
 * group and one tainted, spot-backed compute group, with GitOps, service mesh
 * and the full monitoring stack enabled.
 *
 * @returns The cluster's name, kubeconfig and API endpoint outputs.
 */
export const createModernInfrastructure = async () => {
  // Always-on baseline capacity.
  const generalPurposeGroup = {
    name: "general-purpose",
    instanceTypes: ["t3.large", "t3a.large"],
    minSize: 3,
    maxSize: 10,
    diskSize: 100,
    useSpot: false,
  };

  // Scale-from-zero spot capacity, reserved for workloads tolerating the taint.
  const computeGroup = {
    name: "compute-optimized",
    instanceTypes: ["c5.2xlarge", "c5a.2xlarge"],
    minSize: 0,
    maxSize: 20,
    diskSize: 200,
    useSpot: true,
    spotPrice: "0.20",
    taints: ["workload=compute:NoSchedule"],
    labels: {
      "workload": "compute",
      "instance-type": "compute-optimized",
    },
  };

  const production = new ModernKubernetesCluster("production", {
    vpcCidr: "10.0.0.0/16",
    availabilityZones: ["us-west-2a", "us-west-2b", "us-west-2c"],
    nodeGroups: [generalPurposeGroup, computeGroup],
    enableGitOps: true,
    enableServiceMesh: true,
    monitoringConfig: {
      enablePrometheus: true,
      enableGrafana: true,
      enableLoki: true,
      enableTempo: true,
      retentionDays: 30,
    },
  });

  const { eksCluster } = production.cluster;

  return {
    clusterName: eksCluster.name,
    kubeconfig: production.kubeconfig,
    clusterEndpoint: eksCluster.endpoint,
  };
};

AWS CDK: Cloud-Native Constructs

1
import * as cdk from "aws-cdk-lib";
2
import * as ec2 from "aws-cdk-lib/aws-ec2";
3
import * as eks from "aws-cdk-lib/aws-eks";
4
import * as iam from "aws-cdk-lib/aws-iam";
5
import * as lambda from "aws-cdk-lib/aws-lambda";
6
import * as cr from "aws-cdk-lib/custom-resources";
7
import { Construct } from "constructs";
8

9
/**
 * Stack containing a three-tier VPC with flow logs, an EKS cluster with a
 * general-purpose and a GPU managed node group, a Lambda-backed custom
 * resource provider for secrets encryption, and Karpenter.
 *
 * NOTE(review): `setupAdvancedMonitoring` and `setupBackupAndDR` are called
 * from the constructor but are not defined in this listing; they must be
 * supplied before the class compiles.
 */
export class ModernInfrastructureStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    // VPC with custom CIDR and flow logs
    const vpc = new ec2.Vpc(this, "VPC", {
      maxAzs: 3,
      natGateways: 3,
      // `cidr` is deprecated in favour of `ipAddresses`.
      ipAddresses: ec2.IpAddresses.cidr("10.0.0.0/16"),
      subnetConfiguration: [
        {
          name: "Public",
          subnetType: ec2.SubnetType.PUBLIC,
          cidrMask: 24,
        },
        {
          name: "Private",
          // PRIVATE_WITH_NAT is deprecated; PRIVATE_WITH_EGRESS is its replacement.
          subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS,
          cidrMask: 20,
        },
        {
          name: "Isolated",
          subnetType: ec2.SubnetType.PRIVATE_ISOLATED,
          cidrMask: 24,
        },
      ],
      flowLogs: {
        VPCFlowLogs: {
          destination: ec2.FlowLogDestination.toCloudWatchLogs(),
          trafficType: ec2.FlowLogTrafficType.ALL,
        },
      },
    });

    // Advanced EKS cluster with custom configuration
    const cluster = new eks.Cluster(this, "Cluster", {
      vpc,
      version: eks.KubernetesVersion.V1_28,
      defaultCapacity: 0, // We'll add custom node groups
      clusterLogging: [
        eks.ClusterLoggingTypes.API,
        eks.ClusterLoggingTypes.AUDIT,
        eks.ClusterLoggingTypes.AUTHENTICATOR,
        eks.ClusterLoggingTypes.CONTROLLER_MANAGER,
        eks.ClusterLoggingTypes.SCHEDULER,
      ],
      albController: {
        version: eks.AlbControllerVersion.V2_6_2,
      },

      // Custom security group rules
      securityGroup: new ec2.SecurityGroup(this, "ClusterSecurityGroup", {
        vpc,
        description: "EKS cluster security group",
        allowAllOutbound: false,
      }),
    });

    // Node customisation goes through a real launch template:
    // `launchTemplateSpec` only accepts a template id/version, and
    // `UserData.addCommands()` returns void, so it cannot be used inline.
    const generalNodeCommands = ec2.UserData.forLinux({ shebang: "#!/bin/bash" });
    generalNodeCommands.addCommands(
      'echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf',
      "sysctl -p"
    );

    // Managed node groups need MIME multipart user data so that EKS can merge
    // in its own bootstrap part.
    const generalNodeUserData = new ec2.MultipartUserData();
    generalNodeUserData.addUserDataPart(
      generalNodeCommands,
      ec2.MultipartBody.SHELL_SCRIPT,
      true
    );

    const generalNodeLaunchTemplate = new ec2.LaunchTemplate(
      this,
      "GeneralNodeLaunchTemplate",
      {
        userData: generalNodeUserData,
        // Disk size lives on the template: a node group may not set
        // `diskSize` when it also uses a launch template.
        blockDevices: [
          {
            deviceName: "/dev/xvda",
            volume: ec2.BlockDeviceVolume.ebs(100),
          },
        ],
      }
    );

    // Add custom managed node groups
    cluster.addNodegroupCapacity("GeneralNodeGroup", {
      instanceTypes: [
        ec2.InstanceType.of(ec2.InstanceClass.M5, ec2.InstanceSize.LARGE),
        ec2.InstanceType.of(ec2.InstanceClass.M5A, ec2.InstanceSize.LARGE),
      ],
      minSize: 3,
      maxSize: 10,
      subnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS },
      launchTemplateSpec: {
        id: generalNodeLaunchTemplate.launchTemplateId!,
        version: generalNodeLaunchTemplate.latestVersionNumber,
      },
    });

    // GPU node group for ML workloads
    cluster.addNodegroupCapacity("GPUNodeGroup", {
      instanceTypes: [
        ec2.InstanceType.of(ec2.InstanceClass.P3, ec2.InstanceSize.XLARGE2),
      ],
      minSize: 0,
      maxSize: 5,
      labels: {
        workload: "gpu",
        "nvidia.com/gpu": "true",
      },
      taints: [
        {
          effect: eks.TaintEffect.NO_SCHEDULE,
          key: "nvidia.com/gpu",
          value: "true",
        },
      ],
    });

    // Custom resource for advanced cluster configuration
    const clusterConfigLambda = new lambda.Function(
      this,
      "ClusterConfigFunction",
      {
        runtime: lambda.Runtime.PYTHON_3_11,
        handler: "index.handler",
        code: lambda.Code.fromInline(`
import boto3


def handler(event, context):
    props = event['ResourceProperties']
    cluster_name = props['ClusterName']
    physical_id = f'{cluster_name}-encryption-config'

    # Only act on Create: there is nothing to undo on Delete, and an
    # already-associated encryption config cannot be associated again,
    # so repeating the call on Update would fail the deployment.
    if event['RequestType'] != 'Create':
        return {'PhysicalResourceId': event.get('PhysicalResourceId', physical_id)}

    boto3.client('eks').associate_encryption_config(
        clusterName=cluster_name,
        encryptionConfig=[{
            'resources': ['secrets'],
            'provider': {
                'keyArn': props['KmsKeyArn']
            }
        }]
    )

    return {'PhysicalResourceId': physical_id}
      `),
        timeout: cdk.Duration.minutes(15),
      }
    );

    // Least privilege: the handler makes one EKS call against this cluster.
    clusterConfigLambda.addToRolePolicy(
      new iam.PolicyStatement({
        actions: ["eks:AssociateEncryptionConfig", "eks:DescribeCluster"],
        resources: [cluster.clusterArn],
      })
    );
    // NOTE(review): these are presumably the KMS permissions EKS needs from
    // the caller — confirm. No key is defined in this stack, so the resource
    // cannot be narrowed here; replace "*" with the key ARN once it exists.
    clusterConfigLambda.addToRolePolicy(
      new iam.PolicyStatement({
        actions: ["kms:DescribeKey", "kms:CreateGrant"],
        resources: ["*"],
      })
    );

    // NOTE(review): no cdk.CustomResource in this listing consumes this
    // provider, so as written the handler is never invoked. A CustomResource
    // passing ClusterName and KmsKeyArn properties is still required.
    new cr.Provider(this, "ClusterConfigProvider", {
      onEventHandler: clusterConfigLambda,
    });

    // Implement cost optimization through intelligent scheduling
    this.implementCostOptimization(cluster);

    // Set up advanced monitoring
    this.setupAdvancedMonitoring(cluster);

    // Configure backup and disaster recovery
    this.setupBackupAndDR(cluster);
  }

  /**
   * Installs Karpenter with a service account bound to a dedicated IAM role.
   *
   * @param cluster  Cluster that receives the Helm chart and whose OIDC
   *                 provider the role trusts.
   */
  private implementCostOptimization(cluster: eks.Cluster) {
    const karpenterNamespace = "karpenter";
    const karpenterServiceAccount = "karpenter";

    // A role referenced from a service-account annotation must trust the
    // cluster's OIDC provider; the EKS service principal cannot assume it on
    // behalf of a pod. The issuer is a deploy-time token, so the condition
    // keys are wrapped in CfnJson.
    const oidcProvider = cluster.openIdConnectProvider;
    const issuer = oidcProvider.openIdConnectProviderIssuer;
    const trustConditions = new cdk.CfnJson(this, "KarpenterTrustConditions", {
      value: {
        [`${issuer}:aud`]: "sts.amazonaws.com",
        [`${issuer}:sub`]: `system:serviceaccount:${karpenterNamespace}:${karpenterServiceAccount}`,
      },
    });

    const karpenterRole = new iam.Role(this, "KarpenterRole", {
      assumedBy: new iam.OpenIdConnectPrincipal(oidcProvider, {
        StringEquals: trustConditions,
      }),
      // NOTE(review): kept from the original listing; this policy is named
      // for the cluster role, so confirm it grants what Karpenter needs.
      managedPolicies: [
        iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonEKSClusterPolicy"),
      ],
    });

    // Karpenter for intelligent node provisioning
    new eks.HelmChart(this, "Karpenter", {
      cluster,
      chart: "karpenter",
      repository: "oci://public.ecr.aws/karpenter/karpenter",
      namespace: karpenterNamespace,
      version: "v0.33.0",
      values: {
        serviceAccount: {
          // Pinned so it matches the `sub` claim in the role's trust policy.
          name: karpenterServiceAccount,
          annotations: {
            "eks.amazonaws.com/role-arn": karpenterRole.roleArn,
          },
        },
        settings: {
          aws: {
            clusterName: cluster.clusterName,
            defaultInstanceProfile: "KarpenterNodeInstanceProfile",
            interruptionQueueName: cluster.clusterName,
          },
        },
      },
    });
  }
}

Crossplane: Kubernetes-Native Infrastructure

1
# Composition backing the XPostgreSQLInstance composite type: a VPC, DB subnet
# group, security group, parameter group, RDS instance and backup plan.
apiVersion: apiextensions.crossplane.io/v1
kind: Composition
metadata:
  name: xpostgresqlinstances.database.company.com
spec:
  compositeTypeRef:
    apiVersion: database.company.com/v1alpha1
    kind: XPostgreSQLInstance

  # Fields every composed resource takes from the composite.
  patchSets:
    - name: common-fields
      patches:
        - type: FromCompositeFieldPath
          fromFieldPath: spec.parameters.region
          toFieldPath: spec.forProvider.region
        - type: FromCompositeFieldPath
          fromFieldPath: spec.parameters.deletionPolicy
          toFieldPath: spec.deletionPolicy

  resources:
    # VPC for database
    - name: vpc
      base:
        apiVersion: ec2.aws.crossplane.io/v1beta1
        kind: VPC
        spec:
          forProvider:
            cidrBlock: 10.0.0.0/16
            enableDnsHostnames: true
            enableDnsSupport: true
      patches:
        - type: PatchSet
          patchSetName: common-fields

    # Subnet Group
    # NOTE(review): selects subnets labelled `type: database`, but this
    # composition creates no Subnet resources — confirm they exist elsewhere.
    - name: subnet-group
      base:
        apiVersion: rds.aws.crossplane.io/v1alpha1
        kind: DBSubnetGroup
        spec:
          forProvider:
            description: "Subnet group for PostgreSQL instance"
            subnetIdSelector:
              matchLabels:
                type: database
      patches:
        - type: PatchSet
          patchSetName: common-fields

    # Security Group
    - name: security-group
      base:
        apiVersion: ec2.aws.crossplane.io/v1beta1
        kind: SecurityGroup
        spec:
          forProvider:
            description: "Security group for PostgreSQL instance"
            vpcIdSelector:
              matchControllerRef: true
            ingress:
              - fromPort: 5432
                toPort: 5432
                ipProtocol: tcp
                ipRanges:
                  - cidrIp: 10.0.0.0/16
                    description: "VPC access"
      # Region and deletion policy must be patched here too; without them this
      # resource never receives a region.
      patches:
        - type: PatchSet
          patchSetName: common-fields

    # Parameter Group
    - name: parameter-group
      base:
        apiVersion: rds.aws.crossplane.io/v1alpha1
        kind: DBParameterGroup
        spec:
          forProvider:
            dbParameterGroupFamily: postgres14
            description: "Custom parameter group for PostgreSQL"
            parameters:
              # shared_preload_libraries is a static parameter and only takes
              # effect after a reboot.
              - parameterName: shared_preload_libraries
                parameterValue: pg_stat_statements
                applyMethod: pending-reboot
              - parameterName: log_statement
                parameterValue: all
                applyMethod: immediate
              - parameterName: log_min_duration_statement
                parameterValue: "1000"
                applyMethod: immediate
      patches:
        - type: PatchSet
          patchSetName: common-fields

    # RDS Instance
    # NOTE(review): no master password source is configured in this base
    # (e.g. autogeneratePassword / masterUserPasswordSecretRef) — confirm how
    # the password is supplied.
    - name: rds-instance
      base:
        apiVersion: rds.aws.crossplane.io/v1alpha1
        kind: DBInstance
        spec:
          forProvider:
            engine: postgres
            engineVersion: "14.9"
            dbInstanceClass: db.t3.medium
            allocatedStorage: 100
            storageType: gp3
            storageEncrypted: true

            # Constant value, so it is set here instead of through a patch.
            masterUsername: postgres

            backupRetentionPeriod: 30
            preferredBackupWindow: "03:00-04:00"
            preferredMaintenanceWindow: "sun:04:00-sun:05:00"

            enableCloudwatchLogsExports:
              - postgresql

            dbSubnetGroupNameSelector:
              matchControllerRef: true
            vpcSecurityGroupIDSelector:
              matchControllerRef: true
            dbParameterGroupNameSelector:
              matchControllerRef: true

      patches:
        - type: PatchSet
          patchSetName: common-fields
        # Translate the abstract size into a concrete instance class.
        - type: FromCompositeFieldPath
          fromFieldPath: spec.parameters.instanceSize
          toFieldPath: spec.forProvider.dbInstanceClass
          transforms:
            - type: map
              map:
                small: db.t3.small
                medium: db.t3.medium
                large: db.r6i.large
                xlarge: db.r6i.xlarge
        - type: FromCompositeFieldPath
          fromFieldPath: spec.parameters.storageGB
          toFieldPath: spec.forProvider.allocatedStorage
        - type: FromCompositeFieldPath
          fromFieldPath: spec.parameters.engineVersion
          toFieldPath: spec.forProvider.engineVersion
        # Connection secret is named "<composite uid>-postgresql".
        - type: FromCompositeFieldPath
          fromFieldPath: metadata.uid
          toFieldPath: spec.writeConnectionSecretToRef.name
          transforms:
            - type: string
              string:
                fmt: "%s-postgresql"
        - type: FromCompositeFieldPath
          fromFieldPath: spec.writeConnectionSecretToRef.namespace
          toFieldPath: spec.writeConnectionSecretToRef.namespace

    # Automated backups to S3
    - name: backup-configuration
      base:
        apiVersion: backup.aws.crossplane.io/v1alpha1
        kind: BackupPlan
        spec:
          forProvider:
            rules:
              - ruleName: DailyBackups
                targetBackupVault:
                  name: postgresql-backups
                schedule: "cron(0 5 ? * * *)"
                lifecycle:
                  deleteAfterDays: 30
                  moveToColdStorageAfterDays: 7
      patches:
        - type: PatchSet
          patchSetName: common-fields

Advanced IaC Patterns

Policy as Code with OPA

1
package infrastructure.cost_control
2

3
import future.keywords.contains
4
import future.keywords.if
5
import future.keywords.in
6

7
# Maximum allowed cost per month
8
max_monthly_cost := 10000
9

10
# Deny expensive instance types
11
# Reject EC2 instances whose type is on the expensive list.
deny contains msg if {
    input.resource_type == "aws_instance"
    blocked_types := {"x1e.xlarge", "p3.2xlarge", "i3.metal"}
    input.instance_type in blocked_types
    msg := sprintf("Instance type %s is too expensive for this environment", [input.instance_type])
}
17

18
# Require tagging for cost allocation
19
# Emit one violation per cost-allocation tag missing from a taggable resource.
deny contains msg if {
    taggable_types := {"aws_instance", "aws_rds_instance", "aws_eks_cluster"}
    input.resource_type in taggable_types
    some tag in ["Environment", "CostCenter", "Owner", "Project"]
    not input.tags[tag]
    msg := sprintf("Missing required tag: %s", [tag])
}
26

27
# Enforce resource limits
28
# Cap EKS node groups at 50 instances.
deny contains msg if {
    input.resource_type == "aws_eks_node_group"
    input.max_size > 50
    msg := "Node group max size cannot exceed 50 instances"
}
33

34
# Cost estimation
35
# Estimated monthly cost (USD) per entry of input.resources, keyed by
# resource id. Resources whose instance type is not in the table produce no
# entry.
estimated_monthly_cost[resource_id] = cost {
    resource := input.resources[resource_id]

    # Prices are already per month (hourly on-demand rate x 720 hours,
    # e.g. t3.micro: 0.0104 x 720 = 7.49), so they are only scaled by the
    # instance count. Multiplying by 24 * 30 again would overstate every
    # estimate 720-fold.
    monthly_price := {
        "t3.micro": 7.49,
        "t3.small": 14.98,
        "t3.medium": 29.95,
        "t3.large": 59.90,
        "m5.large": 69.12,
        "m5.xlarge": 138.24,
        "c5.large": 61.92,
        "c5.xlarge": 123.84
    }
    cost := monthly_price[resource.instance_type] * resource.count
}
49

50
# Enforce cost limits
51
# Reject the plan when the summed monthly estimate exceeds max_monthly_cost.
deny[msg] {
    total_cost := sum([cost | cost := estimated_monthly_cost[_]])
    total_cost > max_monthly_cost

    # %v is used instead of %.2f: sprintf hands whole numbers to the formatter
    # as integers, and %.2f applied to an integer (such as the limit of 10000)
    # renders as a "%!f(int=...)" artifact. The total is rounded to cents first.
    rounded_total := round(total_cost * 100) / 100
    msg := sprintf("Estimated monthly cost $%v exceeds limit of $%v", [rounded_total, max_monthly_cost])
}

Infrastructure Testing with Terratest

1
package test
2

3
import (
    "crypto/tls"
    "fmt"
    "os"
    "path/filepath"
    "testing"
    "time"

    "github.com/gruntwork-io/terratest/modules/aws"
    "github.com/gruntwork-io/terratest/modules/http-helper"
    "github.com/gruntwork-io/terratest/modules/k8s"
    "github.com/gruntwork-io/terratest/modules/random"
    "github.com/gruntwork-io/terratest/modules/terraform"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)
16

17
func TestKubernetesCluster(t *testing.T) {
18
    t.Parallel()
19

20
    // Configure Terraform options
21
    terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
22
        TerraformDir: "../terraform/environments/test",
23
        Vars: map[string]interface{}{
24
            "cluster_name": fmt.Sprintf("test-cluster-%s", random.UniqueId()),
25
            "region":       "us-west-2",
26
            "node_count":   3,
27
        },
28
    })
29

30
    // Clean up resources
31
    defer terraform.Destroy(t, terraformOptions)
32

33
    // Deploy infrastructure
34
    terraform.InitAndApply(t, terraformOptions)
35

36
    // Get outputs
37
    clusterName := terraform.Output(t, terraformOptions, "cluster_name")
38
    region := terraform.Output(t, terraformOptions, "region")
39
    kubeconfig := terraform.Output(t, terraformOptions, "kubeconfig")
40

41
    // Verify cluster is running
42
    cluster := aws.GetEksCluster(t, region, clusterName)
43
    assert.Equal(t, "ACTIVE", cluster.Status)
44

45
    // Test Kubernetes connectivity
46
    options := k8s.NewKubectlOptionsFromConfig(kubeconfig, "")
47

48
    // Verify nodes are ready
49
    nodes := k8s.GetNodes(t, options)
50
    require.Equal(t, 3, len(nodes))
51

52
    for _, node := range nodes {
53
        assert.True(t, k8s.IsNodeReady(node))
54
    }
55

56
    // Deploy test application
57
    k8s.KubectlApply(t, options, "../k8s/test-app.yaml")
58
    defer k8s.KubectlDelete(t, options, "../k8s/test-app.yaml")
59

60
    // Wait for deployment
61
    k8s.WaitUntilDeploymentAvailable(t, options, "test-app", 10, 30*time.Second)
62

63
    // Get service endpoint
64
    service := k8s.GetService(t, options, "test-app")
65
    endpoint := k8s.GetServiceEndpoint(t, options, service, 80)
66

67
    // Test application is accessible
68
    tlsConfig := &tls.Config{InsecureSkipVerify: true}
69
    http_helper.HttpGetWithRetryWithCustomValidation(
70
        t,
71
        fmt.Sprintf("http://%s", endpoint),
72
        tlsConfig,
73
        30,
74
        10*time.Second,
75
        func(statusCode int, body string) bool {
76
            return statusCode == 200
77
        },
78
    )
79
}
80

81
func TestInfrastructureCompliance(t *testing.T) {
82
    t.Parallel()
83

84
    terraformOptions := terraform.WithDefaultRetryableErrors(t, &terraform.Options{
85
        TerraformDir: "../terraform/modules/vpc",
86
    })
87

88
    // Run compliance checks
89
    plan := terraform.InitAndPlan(t, terraformOptions)
90

91
    // Verify security group rules
92
    resourceChanges := terraform.GetResourceChangesFromPlan(t, plan)
93

94
    for _, change := range resourceChanges {
95
        if change.Type == "aws_security_group_rule" {
96
            // Ensure no 0.0.0.0/0 ingress rules
97
            assert.NotContains(t, change.AttributeChanges["cidr_blocks"], "0.0.0.0/0")
98
        }
99
    }
100

101
    // Verify encryption is enabled
102
    for _, change := range resourceChanges {
103
        if change.Type == "aws_ebs_volume" || change.Type == "aws_rds_cluster" {
104
            assert.True(t, change.AttributeChanges["encrypted"].(bool))
105
        }
106
    }
107
}

AI-Assisted Infrastructure Generation

1
import openai
2
from typing import Dict, Any
3
import json
4
import hcl2
5
import yaml
6

7
class AIInfrastructureGenerator:
    """Turn natural-language requirements into infrastructure code via an LLM.

    The pipeline is: analyze requirements -> generate base code -> optimize ->
    apply security practices -> validate.

    NOTE(review): several helpers called here (``load_templates``,
    ``apply_security_practices``, ``validate_infrastructure``, the Azure/GCP
    generators and the ``should_*`` / ``optimize_*`` rule helpers) are not
    defined in this class as shown and must be provided elsewhere.
    """

    # Keys the analysis step asks the model for; later stages index the
    # analysis dict by these exact names.
    ANALYSIS_KEYS = (
        'cloud_provider',
        'required_services',
        'scale_requirements',
        'security_requirements',
        'budget_constraints',
        'performance_requirements',
    )

    def __init__(self):
        self.openai_client = openai.Client()
        self.templates = self.load_templates()

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return ``text`` without a surrounding Markdown code fence, if any.

        Text that does not start with a fence is returned unchanged.
        """
        stripped = text.strip()
        if not stripped.startswith('```'):
            return text

        # Drop the opening fence line (which may carry a language tag) ...
        lines = stripped.splitlines()[1:]
        # ... and the closing fence line when present.
        if lines and lines[-1].strip().startswith('```'):
            lines = lines[:-1]
        return '\n'.join(lines).strip()

    @staticmethod
    def _parse_infrastructure(infrastructure: str) -> Any:
        """Parse generated infrastructure code, trying HCL first and then YAML.

        The input is code, not a filename, so the format cannot be derived
        from a ``.tf`` suffix; it is detected by attempting to parse.
        """
        try:
            return hcl2.loads(infrastructure)
        except Exception:
            # Anything that is not valid HCL is retried as YAML.
            # NOTE(review): the concrete parser exception types are not
            # visible from this file, hence the broad catch.
            return yaml.safe_load(infrastructure)

    def generate_infrastructure(self, requirements: str) -> Dict[str, Any]:
        """Generate infrastructure code from natural language requirements.

        Returns whatever ``apply_security_practices`` produces, after it has
        been passed through ``validate_infrastructure``.
        """

        # Analyze requirements
        analysis = self.analyze_requirements(requirements)

        # Generate base infrastructure
        infrastructure = self.generate_base_infrastructure(analysis)

        # Optimize for cost and performance
        optimized = self.optimize_infrastructure(infrastructure)

        # Add security best practices
        secured = self.apply_security_practices(optimized)

        # Validate generated code
        self.validate_infrastructure(secured)

        return secured

    def analyze_requirements(self, requirements: str) -> Dict[str, Any]:
        """Use AI to analyze infrastructure requirements.

        Returns:
            A dict containing every key in ``ANALYSIS_KEYS``.

        Raises:
            ValueError: if the reply is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError``), is not a JSON object, or lacks a key.
        """

        prompt = f"""
        Analyze the following infrastructure requirements and extract:
        1. Cloud provider (AWS, Azure, GCP)
        2. Required services (compute, storage, networking, databases)
        3. Scale requirements (number of users, requests per second)
        4. Security requirements (compliance, encryption, access control)
        5. Budget constraints
        6. Performance requirements

        Requirements: {requirements}

        Return only a JSON object with exactly these keys:
        {', '.join(self.ANALYSIS_KEYS)}.
        "cloud_provider" must be one of "AWS", "Azure" or "GCP";
        "required_services" must be a list of strings.
        """

        response = self.openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an infrastructure architect expert."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )

        # The reply may be wrapped in a Markdown fence, which json.loads rejects.
        analysis = json.loads(self._strip_code_fence(response.choices[0].message.content))

        if not isinstance(analysis, dict):
            raise ValueError("Requirement analysis must be a JSON object")

        missing = [key for key in self.ANALYSIS_KEYS if key not in analysis]
        if missing:
            raise ValueError(f"Requirement analysis is missing keys: {', '.join(missing)}")

        return analysis

    def generate_base_infrastructure(self, analysis: Dict[str, Any]) -> str:
        """Generate infrastructure code based on analysis.

        Raises:
            KeyError: if ``analysis`` has no ``cloud_provider`` entry.
            ValueError: if the provider is not AWS, Azure or GCP.
        """

        provider = analysis['cloud_provider']

        if provider == 'AWS':
            return self.generate_terraform_aws(analysis)
        elif provider == 'Azure':
            return self.generate_terraform_azure(analysis)
        elif provider == 'GCP':
            return self.generate_terraform_gcp(analysis)

        # Fail here rather than returning None and crashing in a later stage.
        raise ValueError(f"Unsupported cloud provider: {provider!r}")

    def generate_terraform_aws(self, analysis: Dict[str, Any]) -> str:
        """Generate Terraform code for AWS from the analyzed requirements."""

        prompt = f"""
        Generate production-ready Terraform code for AWS with:
        - Services: {', '.join(map(str, analysis['required_services']))}
        - Scale: {analysis['scale_requirements']}
        - Security: {analysis['security_requirements']}
        - Budget: {analysis['budget_constraints']}

        Include:
        1. VPC with proper CIDR planning
        2. High availability across multiple AZs
        3. Security groups with least privilege
        4. Encryption at rest and in transit
        5. Backup and disaster recovery
        6. Monitoring and alerting
        7. Cost optimization (spot instances, reserved capacity)

        Follow Terraform best practices and use latest provider versions.
        """

        response = self.openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a Terraform expert. Generate only valid HCL code."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.2
        )

        # Remove a Markdown fence so the result can be parsed as HCL downstream.
        return self._strip_code_fence(response.choices[0].message.content)

    def optimize_infrastructure(self, infrastructure: str) -> str:
        """Optimize generated infrastructure for cost and performance.

        Parses the code, derives recommendations from the rule helpers, and
        asks the model to rewrite the code with those recommendations applied.
        """

        # Parse infrastructure code
        parsed = self._parse_infrastructure(infrastructure)

        # Apply optimization rules
        optimizations = {
            'use_spot_instances': self.should_use_spot_instances(parsed),
            'enable_autoscaling': self.should_enable_autoscaling(parsed),
            'optimize_storage': self.optimize_storage_configuration(parsed),
            'network_optimization': self.optimize_network_configuration(parsed),
        }

        # Generate optimized code
        optimized_prompt = f"""
        Optimize the following infrastructure code with these recommendations:
        {json.dumps(optimizations, indent=2)}

        Original code:
        {infrastructure}

        Apply optimizations while maintaining functionality.
        """

        response = self.openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an infrastructure optimization expert."},
                {"role": "user", "content": optimized_prompt}
            ],
            temperature=0.2
        )

        return self._strip_code_fence(response.choices[0].message.content)

Infrastructure Lifecycle Management#

Automated Drift Detection#

1
class InfrastructureDriftDetector:
    """Compares Terraform state with live resources and reports differences.

    NOTE(review): ``get_actual_state``, ``compare_states``,
    ``calculate_severity``, ``calculate_risk_score``, ``estimate_downtime``,
    ``generate_rollback_plan`` and ``suggest_maintenance_window`` are called
    here but not defined in this class as shown.
    """

    def __init__(self):
        self.terraform_client = TerraformClient()

        # One client per supported cloud, keyed by provider name.
        clients = {}
        clients['aws'] = boto3.Session()
        clients['azure'] = AzureClient()
        clients['gcp'] = GCPClient()
        self.cloud_clients = clients

    async def detect_drift(self, workspace: str) -> DriftReport:
        """Scan every resource in the workspace state and report drift.

        Each resource's recorded attributes are compared with its actual
        state; resources with differences are collected together with a
        severity and a remediation suggestion.
        """
        state = self.terraform_client.get_state(workspace)

        findings = []
        for entry in state.resources:
            observed = await self.get_actual_state(entry)
            recorded = entry.attributes

            delta = self.compare_states(recorded, observed)
            if not delta:
                # Resource matches its recorded state.
                continue

            finding = {
                'resource': entry.address,
                'type': entry.type,
                'differences': delta,
                'severity': self.calculate_severity(delta),
                'remediation': self.suggest_remediation(delta),
            }
            findings.append(finding)

        return DriftReport(
            workspace=workspace,
            scan_time=datetime.now(),
            total_resources=len(state.resources),
            drifted_resources=len(findings),
            drift_items=findings,
            risk_score=self.calculate_risk_score(findings),
        )

    def suggest_remediation(self, differences: List[Difference]) -> Dict[str, Any]:
        """Pick a remediation plan based on whether any difference is critical.

        Returns an 'immediate' plan when at least one difference has severity
        'critical', otherwise a 'scheduled' plan.
        """
        has_critical = len([item for item in differences if item.severity == 'critical']) > 0

        if not has_critical:
            plan = {
                'action': 'scheduled',
                'method': 'incremental_update',
                'risk': 'low',
                'maintenance_window': self.suggest_maintenance_window(),
            }
            return plan

        plan = {
            'action': 'immediate',
            'method': 'terraform_apply',
            'risk': 'high',
            'estimated_downtime': self.estimate_downtime(differences),
            'rollback_plan': self.generate_rollback_plan(differences),
        }
        return plan

Blue-Green Infrastructure Deployments#

1
/**
 * Orchestrates a blue-green rollout: build the green environment, validate
 * it, shift traffic in stages, then cut over and remove the blue environment.
 *
 * NOTE(review): the helper methods called here (createGreenEnvironment,
 * validateGreenEnvironment, updateLoadBalancerWeights, ...) are not defined
 * in this class as shown.
 */
export class BlueGreenInfrastructure {
  private readonly pulumiClient: automation.LocalWorkspace;

  /**
   * Run a complete blue-green deployment.
   *
   * @param config Deployment settings; `errorThreshold` and `stageDuration`
   *   are read during the rollout.
   * @returns The green stack name as deployment id, the elapsed time in
   *   milliseconds and the metrics gathered after the traffic shift.
   * @throws Error when validation fails, when a traffic stage is unhealthy,
   *   or when the error rate exceeds `config.errorThreshold`.
   */
  async deployBlueGreen(config: BlueGreenConfig): Promise<DeploymentResult> {
    // Captured first so the reported duration covers the whole rollout.
    const startTime = Date.now();

    // Create green environment
    const greenStack = await this.createGreenEnvironment(config);

    // Run validation tests
    const validationResults = await this.validateGreenEnvironment(greenStack);

    if (!validationResults.passed) {
      await this.rollbackGreenEnvironment(greenStack);
      throw new Error(`Validation failed: ${validationResults.errors}`);
    }

    // Gradual traffic shift
    await this.performTrafficShift(config, greenStack);

    // Monitor metrics
    const metrics = await this.monitorDeployment(greenStack);

    if (metrics.errorRate > config.errorThreshold) {
      await this.rollbackTrafficShift(config);
      throw new Error("Error rate exceeded threshold");
    }

    // Complete cutover
    await this.completeCutover(config, greenStack);

    // Cleanup blue environment
    await this.cleanupBlueEnvironment(config);

    return {
      deploymentId: greenStack.name,
      duration: Date.now() - startTime,
      metrics: metrics,
    };
  }

  /**
   * Move traffic from blue to green in fixed stages (10/25/50/75/100 %).
   *
   * After each stage the method waits `config.stageDuration`, collects
   * metrics, and on unhealthy metrics sends all traffic back to blue before
   * throwing.
   */
  private async performTrafficShift(
    config: BlueGreenConfig,
    greenStack: automation.Stack
  ): Promise<void> {
    const stages = [10, 25, 50, 75, 100];

    for (const percentage of stages) {
      // Update load balancer weights
      await this.updateLoadBalancerWeights({
        blue: 100 - percentage,
        green: percentage,
      });

      // Wait and monitor
      await this.sleep(config.stageDuration);

      const metrics = await this.collectMetrics();
      if (!this.metricsHealthy(metrics)) {
        // Restore all traffic to blue before reporting the failure.
        await this.updateLoadBalancerWeights({
          blue: 100,
          green: 0,
        });
        throw new Error(`Metrics unhealthy at ${percentage}% traffic`);
      }
    }
  }
}

Modern IaC Best Practices#

1. GitOps-Driven Infrastructure#

1
# Plans infrastructure changes on every push that touches infrastructure code
# or the infrastructure workflows: security scan, cost estimate, Terraform
# plan, then an automated review of the rendered plan.
name: Infrastructure GitOps

on:
  push:
    paths:
      - "infrastructure/**"
      - ".github/workflows/infrastructure-*.yml"

jobs:
  plan:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Security Scanning
        run: |
          # Scan for secrets
          trufflehog filesystem --directory=infrastructure/

          # Policy validation
          opa test policies/infrastructure/

          # IaC security scanning
          checkov -d infrastructure/

      - name: Cost Estimation
        run: |
          # `infracost diff --compare-to` takes an Infracost JSON baseline,
          # not a branch name, so build the baseline from main first.
          git fetch --depth=1 origin main
          git worktree add --detach /tmp/base FETCH_HEAD
          infracost breakdown --path /tmp/base/infrastructure/ \
            --format json --out-file /tmp/infracost-base.json

          infracost breakdown --path infrastructure/
          infracost diff --path infrastructure/ --compare-to /tmp/infracost-base.json

      - name: Generate Plan
        run: |
          cd infrastructure/
          terraform init
          terraform plan -out=plan.tfplan
          terraform show -json plan.tfplan > plan.json

      - name: AI Review
        run: |
          # Each step starts in the workspace root; the plan was written
          # inside infrastructure/ by the previous step.
          python scripts/ai_review_infrastructure.py infrastructure/plan.json

2. Multi-Cloud Abstraction#

1
/**
 * Lifecycle contract shared by all cloud resources, independent of provider.
 * Concrete resources implement all three operations.
 */
export abstract class CloudResource {
  /** Provision the resource. */
  abstract deploy(): Promise<void>;

  /** Check the resource; resolves to a boolean result. */
  abstract validate(): Promise<boolean>;

  /** Remove the resource. */
  abstract destroy(): Promise<void>;
}
6

7
/**
 * Database resource that delegates deployment to the implementation for the
 * selected cloud provider.
 *
 * NOTE(review): validate() and destroy() from CloudResource, as well as
 * deployAzureSQL(), deployCloudSQL(), getInstanceClass(), getSecurityGroup()
 * and getSubnetGroup(), are not defined in this class as shown.
 */
export class MultiCloudDatabase extends CloudResource {
  private provider: "aws" | "azure" | "gcp";
  private config: DatabaseConfig;

  constructor(provider: "aws" | "azure" | "gcp", config: DatabaseConfig) {
    super();
    this.provider = provider;
    this.config = config;
  }

  /** Deploy the database using the routine that matches `provider`. */
  async deploy(): Promise<void> {
    const target = this.provider;

    if (target === "aws") {
      return this.deployRDS();
    }
    if (target === "azure") {
      return this.deployAzureSQL();
    }
    if (target === "gcp") {
      return this.deployCloudSQL();
    }
  }

  /**
   * Create a PostgreSQL 14.9 RDS instance with encrypted storage and a
   * 30-day backup retention period; size, storage and multi-AZ come from
   * the config and helper methods.
   */
  private async deployRDS(): Promise<void> {
    const settings = {
      engine: "postgres",
      engineVersion: "14.9",
      instanceClass: this.getInstanceClass(),
      allocatedStorage: this.config.storageGb,
      storageEncrypted: true,
      backupRetentionPeriod: 30,
      multiAz: this.config.highAvailability,
      vpcSecurityGroupIds: [this.getSecurityGroup()],
      dbSubnetGroupName: this.getSubnetGroup(),
    };

    const database = new aws.rds.Instance("database", settings);
  }
}

3. Self-Healing Infrastructure#

1
class SelfHealingInfrastructure:
    """Monitors resources and tries a sequence of healing strategies on failures.

    NOTE(review): ``check_health``, ``escalate_to_humans``,
    ``notify_healing_success`` and the four strategy methods are not defined
    in this class as shown.
    """

    def __init__(self):
        self.health_checks = []
        self.remediation_actions = {}

    async def monitor_and_heal(self):
        """Continuous monitoring and self-healing loop.

        Every 60 seconds, checks health and attempts to heal each unhealthy
        resource; a resource that cannot be healed is escalated to humans.
        """

        while True:
            unhealthy_resources = await self.check_health()

            for resource in unhealthy_resources:
                try:
                    await self.attempt_healing(resource)
                except Exception as e:
                    await self.escalate_to_humans(resource, e)

            await asyncio.sleep(60)  # Check every minute

    async def attempt_healing(self, resource: UnhealthyResource):
        """Attempt to heal an unhealthy resource.

        Strategies are tried in order (restart, scale out, fail over,
        recreate) and the first one reporting success ends the attempt.

        Raises:
            HealingFailedException: when no strategy succeeds. The last
                strategy error, if any, is attached as the cause.
        """
        import logging

        logger = logging.getLogger(__name__)

        healing_strategies = [
            self.restart_resource,
            self.scale_horizontally,
            self.failover_to_backup,
            self.recreate_resource
        ]

        last_error = None

        for strategy in healing_strategies:
            strategy_name = getattr(strategy, '__name__', repr(strategy))

            try:
                result = await strategy(resource)
            except Exception as e:
                # Record why the strategy failed instead of dropping the error.
                logger.warning(
                    "Healing strategy %s failed for %r: %s",
                    strategy_name, resource, e,
                )
                last_error = e
                continue

            if result.success:
                try:
                    await self.notify_healing_success(resource, strategy)
                except Exception:
                    # The resource is healed; a failed notification must not
                    # trigger the remaining, more drastic strategies.
                    logger.exception(
                        "Could not send healing notification for %r", resource
                    )
                return

        # All strategies failed
        raise HealingFailedException(resource) from last_error

The Future of Infrastructure as Code#

AI-Driven Infrastructure#

Natural language to infrastructure
Intelligent optimization
Predictive scaling
Automated security hardening

Quantum-Ready Infrastructure#

Quantum-safe encryption
Hybrid classical-quantum workloads
Quantum networking preparation

Sustainable Infrastructure#

Carbon-aware deployments
Energy-efficient resource allocation
Green cloud provider selection

Conclusion#

Modern Infrastructure as Code has evolved far beyond simple configuration files. With tools like Pulumi bringing real programming languages, Crossplane enabling Kubernetes-native infrastructure, and AI assisting in generation and optimization, we can build more sophisticated, reliable, and efficient infrastructure than ever before. The key is choosing the right tool for your team’s skills and requirements while following best practices for testing, security, and operations.

Additional Resources#

This concludes our 96-day journey through the world of DevOps! Thank you for joining me on this comprehensive exploration of modern DevOps practices, tools, and techniques. Keep learning, keep building, and keep pushing the boundaries of what’s possible in infrastructure and operations!