Day 93 - MLOps: Operationalizing Machine Learning at Scale
Machine Learning Operations (MLOps) has emerged as a critical discipline that bridges the gap between ML development and production deployment. Today, we’ll explore how to build robust, scalable ML systems that can handle the unique challenges of operationalizing machine learning models.
The MLOps Challenge
Traditional software follows deterministic patterns, but ML introduces unique complexities:
- Data Dependency: Models are only as good as their training data
- Model Drift: Performance degrades over time as data distributions change (a minimal drift check is sketched just after this list)
- Experimentation: Teams typically run hundreds of experiments before arriving at a production-ready model
- Reproducibility: Ensuring consistent results across environments
- Monitoring: Tracking not just system metrics but model performance
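To make the drift problem concrete, here is a minimal sketch (not tied to any particular library) that quantifies distribution shift for a single numeric feature using the population stability index (PSI). The feature values and the common ~0.2 alert threshold mentioned in the comments are illustrative assumptions.

```python
import numpy as np

def population_stability_index(expected, actual, bins=10):
    """Quantify how much a feature's distribution has shifted.

    `expected` is the training/reference sample, `actual` is recent production
    data. Larger PSI means larger drift; ~0.2 is a common rule-of-thumb
    alerting threshold.
    """
    # Bin edges come from the reference data; clip new data into that range
    edges = np.quantile(expected, np.linspace(0, 1, bins + 1))
    e_counts, _ = np.histogram(expected, bins=edges)
    a_counts, _ = np.histogram(np.clip(actual, edges[0], edges[-1]), bins=edges)
    e_frac = np.clip(e_counts / len(expected), 1e-6, None)
    a_frac = np.clip(a_counts / len(actual), 1e-6, None)
    return float(np.sum((a_frac - e_frac) * np.log(a_frac / e_frac)))

# Example: reference data vs. a shifted production sample
reference = np.random.normal(50, 10, 10_000)
production = np.random.normal(55, 12, 10_000)
print(f"PSI: {population_stability_index(reference, production):.3f}")
```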
The MLOps Lifecycle
1. Data Engineering & Feature Engineering
The foundation of any ML system is robust data pipelines.
```python
# Feature Store Implementation with Feast
from feast import FeatureStore
import pandas as pd
from datetime import datetime, timedelta

# Initialize feature store
store = FeatureStore(repo_path="feature_repo/")

# Define feature retrieval
entity_df = pd.DataFrame({
    "customer_id": [1001, 1002, 1003],
    "event_timestamp": [
        datetime.now() - timedelta(hours=3),
        datetime.now() - timedelta(hours=2),
        datetime.now() - timedelta(hours=1)
    ]
})

# Retrieve features
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=[
        "customer_stats:total_transactions",
        "customer_stats:avg_transaction_amount",
        "customer_stats:days_since_last_purchase"
    ]
).to_df()

# Feature validation
from great_expectations import DataContext

context = DataContext()
batch = context.get_batch(
    datasource_name="customer_features",
    data_asset_name="training_features"
)

# Define expectations
batch.expect_column_values_to_not_be_null("customer_id")
batch.expect_column_values_to_be_between(
    "avg_transaction_amount",
    min_value=0,
    max_value=10000
)

# Validate
results = batch.validate()
```
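The validation result can then gate the pipeline so bad data never reaches training. A minimal sketch, assuming the object returned by `validate()` exposes a `success` flag as Great Expectations validation results do:

```python
# Fail fast if the feature data does not meet expectations
if not results.success:
    raise ValueError("Feature validation failed, aborting training run")
```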
2. Experiment Tracking
Track experiments systematically to ensure reproducibility.
```python
# MLflow Experiment Tracking
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# X_train, X_test, y_train, y_test come from an upstream train/test split (not shown)

# Start MLflow run
with mlflow.start_run(run_name="rf_customer_churn_v3"):
    # Log parameters
    n_estimators = 100
    max_depth = 10
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("feature_set_version", "v2.1")

    # Train model
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    rf.fit(X_train, y_train)

    # Make predictions
    y_pred = rf.predict(X_test)
    y_pred_proba = rf.predict_proba(X_test)[:, 1]

    # Log metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("auc", auc)

    # Log model
    mlflow.sklearn.log_model(
        rf,
        "model",
        registered_model_name="customer_churn_classifier"
    )

    # Log feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    mlflow.log_table(feature_importance, "feature_importance.json")

    # Log artifacts
    mlflow.log_artifact("preprocessing_pipeline.pkl")
    mlflow.log_artifact("model_config.yaml")
```
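Once the run above has registered the model, downstream services can pull it from the Model Registry by name rather than by file path. A brief sketch using MLflow's `pyfunc` loader; the "Production" stage assumes the model version has been transitioned there:

```python
import mlflow.pyfunc

# Load the latest "Production" version of the registered model
model = mlflow.pyfunc.load_model("models:/customer_churn_classifier/Production")
predictions = model.predict(X_test)
```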
3. Model Training Pipeline
Automated training pipelines ensure consistency and scalability.
```python
# Kubeflow Pipeline for Model Training
import kfp
from kfp import dsl
from kfp.components import func_to_container_op

# Define pipeline components
@func_to_container_op
def load_data(data_path: str) -> str:
    import pandas as pd
    import boto3
    from io import StringIO

    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket='ml-data', Key=data_path)
    df = pd.read_csv(StringIO(obj['Body'].read().decode('utf-8')))

    # Save to shared volume
    df.to_csv('/mnt/data/raw_data.csv', index=False)
    return '/mnt/data/raw_data.csv'


@func_to_container_op
def preprocess_data(data_path: str) -> str:
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    import joblib

    df = pd.read_csv(data_path)

    # Feature engineering
    df['total_spend_per_transaction'] = df['total_spend'] / df['transaction_count']
    df['days_since_registration'] = (
        pd.Timestamp.now() - pd.to_datetime(df['registration_date'])
    ).dt.days

    # Scaling
    scaler = StandardScaler()
    numeric_features = ['total_spend', 'transaction_count', 'days_since_registration']
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

    # Save preprocessor
    joblib.dump(scaler, '/mnt/model/preprocessor.pkl')

    # Save processed data
    df.to_csv('/mnt/data/processed_data.csv', index=False)
    return '/mnt/data/processed_data.csv'


@func_to_container_op
def train_model(data_path: str, model_type: str) -> str:
    import pandas as pd
    import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    import mlflow
    import mlflow.sklearn

    df = pd.read_csv(data_path)
    X = df.drop(['target', 'customer_id'], axis=1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if model_type == 'random_forest':
        model = RandomForestClassifier(n_estimators=100, random_state=42)
    else:
        model = LogisticRegression(random_state=42)

    model.fit(X_train, y_train)

    # Save model
    model_path = f'/mnt/model/{model_type}_model.pkl'
    joblib.dump(model, model_path)

    # Log to MLflow
    mlflow.sklearn.log_model(model, model_type)

    return model_path


@func_to_container_op
def evaluate_model(model_path: str, data_path: str) -> float:
    import pandas as pd
    import joblib
    from sklearn.metrics import roc_auc_score
    import mlflow

    model = joblib.load(model_path)
    df = pd.read_csv(data_path)

    X = df.drop(['target', 'customer_id'], axis=1)
    y = df['target']

    predictions = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, predictions)

    mlflow.log_metric("test_auc", auc)

    return auc


# Define pipeline
@dsl.pipeline(
    name='Customer Churn Training Pipeline',
    description='End-to-end ML training pipeline'
)
def ml_pipeline(data_path: str = 'data/customers.csv'):
    # Pipeline DAG
    data = load_data(data_path)
    processed_data = preprocess_data(data.output)

    # Train multiple models in parallel
    rf_model = train_model(processed_data.output, 'random_forest')
    lr_model = train_model(processed_data.output, 'logistic_regression')

    # Evaluate models
    rf_score = evaluate_model(rf_model.output, processed_data.output)
    lr_score = evaluate_model(lr_model.output, processed_data.output)

    # Select best model (deploy_model is assumed to be another pipeline component, not shown)
    with dsl.Condition(rf_score.output > lr_score.output):
        deploy_model(rf_model.output)


# Compile and run pipeline
kfp.compiler.Compiler().compile(ml_pipeline, 'ml_pipeline.yaml')
```
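After compilation, the pipeline package can be submitted to a Kubeflow Pipelines endpoint. A minimal sketch assuming the KFP v1-style client API; the in-cluster host address is a hypothetical placeholder:

```python
import kfp

# Submit the compiled pipeline to a KFP endpoint (host is illustrative)
client = kfp.Client(host="http://ml-pipeline.kubeflow:8888")
run = client.create_run_from_pipeline_package(
    "ml_pipeline.yaml",
    arguments={"data_path": "data/customers.csv"},
    run_name="churn-training-manual-run",
)
print(f"Started run: {run.run_id}")
```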
4. Model Deployment
Deploy models with proper versioning and rollback capabilities.
```python
# Model Serving with BentoML
import bentoml
from bentoml.io import JSON, NumpyNdarray
import numpy as np
import pandas as pd

# Define service
@bentoml.env(pip_packages=["scikit-learn", "pandas", "numpy"])
@bentoml.artifacts([
    bentoml.artifact.PickleArtifact("model"),
    bentoml.artifact.PickleArtifact("preprocessor")
])
class ChurnPredictionService(bentoml.BentoService):

    @bentoml.api(input=JSON(), output=JSON())
    def predict(self, input_data):
        # Preprocess input
        features = self.artifacts.preprocessor.transform(
            pd.DataFrame([input_data])
        )

        # Make prediction
        prediction = self.artifacts.model.predict_proba(features)[0]

        return {
            "customer_id": input_data["customer_id"],
            "churn_probability": float(prediction[1]),
            "prediction": "churn" if prediction[1] > 0.5 else "retain",
            "model_version": self.version
        }

    @bentoml.api(input=JSON(), output=JSON())
    def batch_predict(self, input_data):
        df = pd.DataFrame(input_data["customers"])
        features = self.artifacts.preprocessor.transform(df)
        predictions = self.artifacts.model.predict_proba(features)

        results = []
        for i, customer in enumerate(input_data["customers"]):
            results.append({
                "customer_id": customer["customer_id"],
                "churn_probability": float(predictions[i][1]),
                "prediction": "churn" if predictions[i][1] > 0.5 else "retain"
            })

        return {"predictions": results}


# Save service (model and preprocessor come from the training step)
svc = ChurnPredictionService()
svc.pack("model", model)
svc.pack("preprocessor", preprocessor)
svc.save()

# Deploy with Kubernetes
# bentoml containerize ChurnPredictionService:latest
# bentoml deploy ChurnPredictionService:latest --platform=kubernetes
```
Kubernetes Deployment Configuration
```yaml
apiVersion: serving.knative.dev/v1
kind: Service
metadata:
  name: churn-prediction-service
  namespace: ml-models
spec:
  template:
    metadata:
      annotations:
        autoscaling.knative.dev/minScale: "2"
        autoscaling.knative.dev/maxScale: "100"
        autoscaling.knative.dev/target: "100"
    spec:
      containers:
      - image: gcr.io/project/churn-prediction:v1.2.0
        ports:
        - containerPort: 5000
        env:
        - name: MODEL_NAME
          value: "churn_classifier"
        - name: MODEL_VERSION
          value: "v1.2.0"
        resources:
          requests:
            memory: "2Gi"
            cpu: "1"
          limits:
            memory: "4Gi"
            cpu: "2"
            nvidia.com/gpu: "1"  # For GPU inference
        readinessProbe:
          httpGet:
            path: /healthz
            port: 5000
          initialDelaySeconds: 10
          periodSeconds: 5
        livenessProbe:
          httpGet:
            path: /healthz
            port: 5000
          initialDelaySeconds: 30
          periodSeconds: 10
```
5. Model Monitoring
Monitor model performance in production to detect drift and degradation.
```python
# Model Monitoring with Evidently
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.test_suite import TestSuite
from evidently.test_preset import DataQualityTestPreset
import pandas as pd
from datetime import datetime
import boto3
import os


class ModelMonitor:
    def __init__(self, reference_data, model_name):
        self.reference_data = reference_data
        self.model_name = model_name
        self.s3_client = boto3.client('s3')

    def check_data_drift(self, current_data):
        """Check for data drift between reference and current data"""

        # Create column mapping
        column_mapping = ColumnMapping(
            target='target',
            prediction='prediction',
            numerical_features=['total_spend', 'transaction_count', 'days_since_registration'],
            categorical_features=['customer_segment', 'region']
        )

        # Create drift report
        drift_report = Report(metrics=[
            DataDriftPreset(),
            TargetDriftPreset()
        ])

        drift_report.run(
            reference_data=self.reference_data,
            current_data=current_data,
            column_mapping=column_mapping
        )

        # Extract results
        drift_results = drift_report.as_dict()

        # Check if drift detected
        drift_detected = False
        for metric in drift_results['metrics']:
            if metric.get('result', {}).get('drift_detected', False):
                drift_detected = True
                break

        # Log results
        self.log_monitoring_results({
            'timestamp': datetime.now().isoformat(),
            'model_name': self.model_name,
            'drift_detected': drift_detected,
            'drift_score': drift_results.get('metrics', [{}])[0].get('result', {}).get('drift_score', 0),
            'n_drifted_features': sum(
                1 for m in drift_results['metrics']
                if m.get('result', {}).get('drift_detected', False)
            )
        })

        # Alert if drift detected
        if drift_detected:
            self.send_alert(f"Data drift detected for model {self.model_name}")

        return drift_results

    def check_prediction_quality(self, predictions_df):
        """Monitor prediction quality metrics"""

        # Run quality tests
        test_suite = TestSuite(tests=[
            DataQualityTestPreset()
        ])

        test_suite.run(
            reference_data=self.reference_data,
            current_data=predictions_df
        )

        # Calculate performance metrics
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

        metrics = {
            'accuracy': accuracy_score(predictions_df['target'], predictions_df['prediction']),
            'precision': precision_score(predictions_df['target'], predictions_df['prediction']),
            'recall': recall_score(predictions_df['target'], predictions_df['prediction']),
            'f1_score': f1_score(predictions_df['target'], predictions_df['prediction'])
        }

        # Check for performance degradation
        performance_threshold = 0.85
        if metrics['accuracy'] < performance_threshold:
            self.send_alert(
                f"Model performance degraded: Accuracy {metrics['accuracy']:.2f} < {performance_threshold}"
            )
            self.trigger_retraining()

        return metrics

    def log_monitoring_results(self, results):
        """Log monitoring results to S3"""
        import json

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        key = f"monitoring/{self.model_name}/{timestamp}.json"

        self.s3_client.put_object(
            Bucket='ml-monitoring',
            Key=key,
            Body=json.dumps(results)
        )

    def send_alert(self, message):
        """Send alert via SNS"""
        sns_client = boto3.client('sns')
        sns_client.publish(
            TopicArn='arn:aws:sns:us-east-1:123456789:ml-alerts',
            Message=message,
            Subject=f'ML Model Alert: {self.model_name}'
        )

    def trigger_retraining(self):
        """Trigger model retraining pipeline"""
        import requests

        response = requests.post(
            'https://airflow.company.com/api/v1/dags/model_retraining/dagRuns',
            json={
                'conf': {
                    'model_name': self.model_name,
                    'trigger_reason': 'performance_degradation'
                }
            },
            headers={'Authorization': 'Bearer ' + os.getenv('AIRFLOW_TOKEN')}
        )

        print(f"Retraining triggered: {response.status_code}")
```
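Wired into a scheduled job, usage of the monitor might look like the following sketch; the S3 paths and column names are assumptions and would come from your own serving logs:

```python
import pandas as pd

# Illustrative usage: reference snapshot vs. yesterday's serving data
reference_df = pd.read_parquet("s3://ml-data/reference/churn_features.parquet")
current_df = pd.read_parquet("s3://ml-data/serving_logs/2024-01-15/features.parquet")

monitor = ModelMonitor(reference_data=reference_df, model_name="customer_churn_classifier")
drift_results = monitor.check_data_drift(current_df)

# Once ground-truth labels have arrived, also check prediction quality
if {"target", "prediction"}.issubset(current_df.columns):
    print(monitor.check_prediction_quality(current_df))
```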
6. A/B Testing and Gradual Rollout
Safely deploy new models with controlled rollouts.
```python
# Feature Flag Based Model Routing
from flask import Flask, request, jsonify
import random
import json
import uuid
from datetime import datetime

# ModelV1 / ModelV2 wrap the two deployed model versions; log_prediction and
# log_config_change are logging helpers. Their definitions are not shown here.

app = Flask(__name__)


class ModelRouter:
    def __init__(self):
        self.models = {
            'v1': ModelV1(),
            'v2': ModelV2()
        }
        self.rollout_config = {
            'v2_percentage': 10,  # Start with 10% traffic to v2
            'sticky_sessions': True,
            'excluded_segments': ['high_value_customers']
        }

    def get_model_version(self, customer_id, customer_segment):
        # Check if customer should be excluded from rollout
        if customer_segment in self.rollout_config['excluded_segments']:
            return 'v1'

        # Implement sticky sessions
        if self.rollout_config['sticky_sessions']:
            # Use consistent hashing for sticky routing
            hash_value = hash(customer_id) % 100
            if hash_value < self.rollout_config['v2_percentage']:
                return 'v2'
            return 'v1'

        # Random routing
        if random.random() * 100 < self.rollout_config['v2_percentage']:
            return 'v2'
        return 'v1'


router = ModelRouter()


@app.route('/predict', methods=['POST'])
def predict():
    data = request.json
    customer_id = data['customer_id']
    customer_segment = data.get('customer_segment', 'standard')

    # Determine model version
    model_version = router.get_model_version(customer_id, customer_segment)
    model = router.models[model_version]

    # Make prediction
    start_time = datetime.now()
    prediction = model.predict(data)
    latency = (datetime.now() - start_time).total_seconds()

    # Log for analysis
    log_prediction({
        'customer_id': customer_id,
        'model_version': model_version,
        'prediction': prediction,
        'latency': latency,
        'timestamp': datetime.now().isoformat()
    })

    return jsonify({
        'prediction': prediction,
        'model_version': model_version,
        'request_id': str(uuid.uuid4())
    })


@app.route('/rollout/update', methods=['POST'])
def update_rollout():
    """Update rollout configuration"""
    data = request.json

    # Validate rollout percentage
    new_percentage = data.get('v2_percentage')
    if new_percentage is not None and 0 <= new_percentage <= 100:
        router.rollout_config['v2_percentage'] = new_percentage

        # Log configuration change
        log_config_change({
            'action': 'rollout_update',
            'new_percentage': new_percentage,
            'timestamp': datetime.now().isoformat()
        })

        return jsonify({'status': 'success', 'new_config': router.rollout_config})

    return jsonify({'status': 'error', 'message': 'Invalid percentage'}), 400
```
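Routing traffic is only half of an A/B test; the logged predictions also need to be compared once outcomes arrive. A minimal sketch of a two-proportion z-test on retention rates per model version; the `retained` outcome column and the DataFrame schema are assumptions joined from downstream business data:

```python
import numpy as np
from scipy import stats

def compare_versions(logs_df):
    """Two-proportion z-test on retention rate between model versions.

    Expects a DataFrame with 'model_version' and a binary 'retained' column
    (illustrative schema built from the prediction logs plus outcomes).
    """
    v1 = logs_df[logs_df["model_version"] == "v1"]["retained"]
    v2 = logs_df[logs_df["model_version"] == "v2"]["retained"]
    n1, n2 = len(v1), len(v2)
    pooled = (v1.sum() + v2.sum()) / (n1 + n2)
    se = np.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
    z = (v2.mean() - v1.mean()) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))
    return {"v1_rate": v1.mean(), "v2_rate": v2.mean(), "z": z, "p_value": p_value}
```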
7. Automated ML Pipeline
End-to-end automation with Apache Airflow.
```python
# Airflow DAG for ML Pipeline
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.providers.amazon.aws.sensors.s3 import S3KeySensor
from datetime import datetime, timedelta

default_args = {
    'owner': 'ml-team',
    'depends_on_past': False,
    'start_date': datetime(2024, 1, 1),
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'ml_training_pipeline',
    default_args=default_args,
    description='Automated ML training pipeline',
    schedule_interval='@daily',
    catchup=False
)

# Wait for new data
wait_for_data = S3KeySensor(
    task_id='wait_for_data',
    bucket_name='ml-data',
    bucket_key='raw/{{ ds }}/customer_data.csv',
    aws_conn_id='aws_default',
    timeout=18*60*60,
    poke_interval=300,
    dag=dag
)

# Data validation
validate_data = KubernetesPodOperator(
    task_id='validate_data',
    name='validate-data',
    namespace='ml-jobs',
    image='ml-pipeline:data-validator',
    arguments=[
        '--input-path', 's3://ml-data/raw/{{ ds }}/customer_data.csv',
        '--schema-path', 's3://ml-config/schemas/customer_schema.json'
    ],
    dag=dag
)

# Feature engineering
feature_engineering = KubernetesPodOperator(
    task_id='feature_engineering',
    name='feature-engineering',
    namespace='ml-jobs',
    image='ml-pipeline:feature-engineering',
    arguments=[
        '--input-path', 's3://ml-data/raw/{{ ds }}/customer_data.csv',
        '--output-path', 's3://ml-data/features/{{ ds }}/features.parquet'
    ],
    resources={
        'request_memory': '4Gi',
        'request_cpu': '2',
        'limit_memory': '8Gi',
        'limit_cpu': '4'
    },
    dag=dag
)

# Model training
model_training = KubernetesPodOperator(
    task_id='model_training',
    name='model-training',
    namespace='ml-jobs',
    image='ml-pipeline:model-training',
    arguments=[
        '--features-path', 's3://ml-data/features/{{ ds }}/features.parquet',
        '--model-type', 'xgboost',
        '--hyperparameter-tuning', 'true'
    ],
    resources={
        'request_memory': '8Gi',
        'request_cpu': '4',
        'limit_memory': '16Gi',
        'limit_cpu': '8',
        'limit_gpu': '1'
    },
    dag=dag
)

# Model evaluation
model_evaluation = KubernetesPodOperator(
    task_id='model_evaluation',
    name='model-evaluation',
    namespace='ml-jobs',
    image='ml-pipeline:model-evaluation',
    arguments=[
        '--model-path', 's3://ml-models/{{ run_id }}/model.pkl',
        '--test-data', 's3://ml-data/features/{{ ds }}/test_features.parquet',
        '--baseline-metrics', 's3://ml-metrics/baseline_metrics.json'
    ],
    dag=dag
)

# Deploy model if evaluation passes
deploy_model = KubernetesPodOperator(
    task_id='deploy_model',
    name='deploy-model',
    namespace='ml-jobs',
    image='ml-pipeline:model-deployer',
    arguments=[
        '--model-path', 's3://ml-models/{{ run_id }}/model.pkl',
        '--deployment-config', 's3://ml-config/deployment/production.yaml',
        '--canary-percentage', '10'
    ],
    dag=dag
)

# Define dependencies
wait_for_data >> validate_data >> feature_engineering >> model_training >> model_evaluation >> deploy_model
```
Best Practices for Production MLOps
1. Version Everything
```yaml
model:
  name: customer_churn_classifier
  version: 2.3.1
  framework: scikit-learn==1.2.0

training:
  data_version: v3.2.0
  code_version: git:8a3f2d1
  hyperparameters:
    n_estimators: 150
    max_depth: 12

dependencies:
  - pandas==1.5.3
  - numpy==1.24.2
  - scikit-learn==1.2.0

metrics:
  validation_auc: 0.892
  test_auc: 0.887
  training_date: 2024-01-15T10:30:00Z
```
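A manifest like this is most useful when it is actually enforced. A short sketch of a deployment-time check, assuming the manifest is stored as `model_card.yaml` next to the artifact; the file name and the strict version check are illustrative:

```python
import yaml
from importlib.metadata import version

# Refuse to serve a model whose pinned dependencies do not match the runtime
with open("model_card.yaml") as f:
    card = yaml.safe_load(f)

for pin in card["dependencies"]:
    package, expected = pin.split("==")
    installed = version(package)
    if installed != expected:
        raise RuntimeError(
            f"{package} mismatch: model trained with {expected}, runtime has {installed}"
        )
print(f"Serving {card['model']['name']} v{card['model']['version']}")
```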
2. Implement Comprehensive Testing
```python
import pytest
import numpy as np
import pandas as pd
from model import ChurnClassifier


class TestModelQuality:

    def test_prediction_range(self, trained_model, test_data):
        """Ensure predictions are in valid range"""
        predictions = trained_model.predict_proba(test_data)
        assert (predictions >= 0).all() and (predictions <= 1).all()

    def test_prediction_distribution(self, trained_model, test_data):
        """Check prediction distribution is reasonable"""
        predictions = trained_model.predict(test_data)
        churn_rate = predictions.mean()
        assert 0.05 <= churn_rate <= 0.30, f"Unusual churn rate: {churn_rate}"

    def test_feature_importance_stability(self, trained_model, previous_model):
        """Ensure feature importance doesn't change drastically"""
        current_importance = trained_model.feature_importances_
        previous_importance = previous_model.feature_importances_

        correlation = np.corrcoef(current_importance, previous_importance)[0, 1]
        assert correlation > 0.8, f"Feature importance correlation too low: {correlation}"

    def test_inference_latency(self, trained_model, test_data):
        """Ensure inference meets latency requirements"""
        import time

        sample = test_data.sample(100)
        start = time.time()
        _ = trained_model.predict(sample)
        latency = (time.time() - start) / 100

        assert latency < 0.01, f"Inference too slow: {latency}s per prediction"
```
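These tests rely on pytest fixtures (`trained_model`, `previous_model`, `test_data`) that are not shown above. A minimal `conftest.py` sketch; the artifact paths are assumptions:

```python
# conftest.py - illustrative fixtures for the model quality tests
import joblib
import pandas as pd
import pytest


@pytest.fixture(scope="session")
def trained_model():
    return joblib.load("artifacts/candidate_model.pkl")


@pytest.fixture(scope="session")
def previous_model():
    return joblib.load("artifacts/production_model.pkl")


@pytest.fixture(scope="session")
def test_data():
    df = pd.read_parquet("artifacts/holdout_features.parquet")
    return df.drop(columns=["target", "customer_id"])
```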
3. Implement Model Governance
```python
# Model Registry with Governance
from datetime import datetime


class ModelRegistry:
    def __init__(self):
        self.approved_models = {}
        self.pending_approval = {}

    def submit_model(self, model_metadata):
        """Submit model for approval"""
        model_id = f"{model_metadata['name']}:{model_metadata['version']}"

        # Automated checks (the _check_* helpers encode your own thresholds and policies)
        checks = {
            'performance': self._check_performance(model_metadata),
            'bias': self._check_bias(model_metadata),
            'security': self._check_security(model_metadata),
            'compliance': self._check_compliance(model_metadata)
        }

        if all(checks.values()):
            # Auto-approve if all checks pass
            self.approved_models[model_id] = {
                **model_metadata,
                'approval_date': datetime.now(),
                'checks': checks
            }
            return {'status': 'approved', 'model_id': model_id}
        else:
            # Require manual review
            self.pending_approval[model_id] = {
                **model_metadata,
                'checks': checks,
                'submitted_date': datetime.now()
            }
            return {
                'status': 'pending',
                'model_id': model_id,
                'failed_checks': [k for k, v in checks.items() if not v]
            }
```
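Usage might look like the following, with metadata fields mirroring the versioning manifest above; the check methods themselves are assumed to be implemented against your organization's thresholds:

```python
registry = ModelRegistry()

result = registry.submit_model({
    "name": "customer_churn_classifier",
    "version": "2.3.1",
    "validation_auc": 0.892,
    "training_data": "v3.2.0",
})

if result["status"] == "approved":
    print(f"{result['model_id']} approved for deployment")
else:
    print(f"Manual review required; failed checks: {result['failed_checks']}")
```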
Conclusion
MLOps is essential for successfully deploying and maintaining ML models in production. Key takeaways:
- Automate Everything: From data validation to model deployment
- Version Religiously: Track data, code, models, and configurations
- Monitor Continuously: Watch for drift, degradation, and anomalies
- Test Comprehensively: Unit tests, integration tests, and ML-specific tests
- Deploy Gradually: Use canary deployments and A/B testing
- Govern Properly: Implement checks for bias, compliance, and security
The tools and practices we’ve covered form the foundation of a robust MLOps platform. As ML becomes increasingly critical to business operations, investing in proper MLOps infrastructure is no longer optional—it’s essential for maintaining competitive advantage while ensuring reliability, compliance, and scalability.