
📖 Model Packaging¶

🎯 Goal of This Notebook
🧱 Project Structure & Folder Hygiene
🧠 Model Serialization
🗂️ Dependency Management
🧪 Reproducible Training Script
🧰 CLI for Prediction
🐳 Introduction to Docker
🧪 Testing the Packaged Model


🎯 Goal of This Notebook¶

In practice, most models never make it beyond the notebook. The goal here is to bridge the gap between local experimentation and a reusable, portable model artifact that can be shared, deployed, or tested.

This notebook focuses on:

  • Structuring your project so it’s clean and production-friendly.
  • Saving models in a consistent, interoperable format.
  • Managing dependencies for reproducibility.
  • Writing simple scripts for training and prediction.
  • Introducing containerization (Docker) for consistency across environments.

We’ll keep it tool-agnostic where possible, but also reference:

  • AWS SageMaker, Google AI Platform, Azure ML — for how they expect model artifacts and inputs to be packaged
  • joblib, cloudpickle, and argparse — for local scripts
  • Docker — for packaging code into containers

Back to the top


🧱 Project Structure & Folder Hygiene¶

📁 Suggested Layout¶

Organizing Your Project for Reuse¶

A clean folder structure helps collaborators and downstream systems understand where things live and how to run them. Here's a minimal but effective layout:

project_name/
├── data/               # input datasets (gitignored)
├── notebooks/          # exploratory analysis, scratch work
├── src/                # reusable scripts and modules
│   ├── train.py        # training logic
│   └── predict.py      # inference logic
├── models/             # serialized model files (gitignored)
├── config/             # config files (YAML/JSON)
├── requirements.txt    # or environment.yml
├── Dockerfile          # container definition (optional)
└── README.md

This layout aligns with what cloud platforms like SageMaker, Azure ML, and Vertex AI expect for upload or integration.

🧼 What to Keep Out of Version Control¶

Use .gitignore to Keep the Repo Clean¶

You don’t want to bloat your repository or leak sensitive files. Add the following to your .gitignore:

# Ignore local data and model artifacts
data/
models/

# Ignore virtual environments
.env/
.venv/

# Ignore system files
.DS_Store
.ipynb_checkpoints/

# Ignore large logs or output dumps
logs/

For experiment tracking, artifacts should be logged to external storage (e.g., S3, GCS, or MLflow artifact store), not committed directly to the repo.
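
As a minimal sketch of that pattern, assuming MLflow is installed and configured (by default it logs to a local ./mlruns directory), logging the serialized model as a run artifact might look like this; the run name and metric value are illustrative:

import mlflow

# Logs the model file to the run's artifact store (local ./mlruns, S3, GCS, ...)
with mlflow.start_run(run_name="xgb-training"):
    mlflow.log_metric("accuracy", 0.96)
    mlflow.log_artifact("models/xgb_model.joblib")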

Back to the top


🧠 Model Serialization¶

💾 Saving Models (joblib, pickle, cloudpickle)¶

Why Model Serialization Matters¶

Once a model is trained, it needs to be saved in a portable format so it can be reused later for prediction. This is known as serialization or model persistence.

Common Python options:

  • joblib – optimized for NumPy arrays; preferred for sklearn models.
  • pickle – general-purpose but less robust for large numerical objects.
  • cloudpickle – supports more complex Python objects (e.g., custom classes, lambdas).

Tool preference:

  • joblib for sklearn-style models
  • cloudpickle for custom pipelines or advanced workflows
In [1]:
!pip install "numpy<2"
Requirement already satisfied: numpy<2 in /Users/ashrithreddy/anaconda3/lib/python3.11/site-packages (1.26.4)
In [2]:
import os
import joblib
import xgboost as xgb
import numpy as np

# Ensure directory exists
os.makedirs("models", exist_ok=True)

# Generate dummy data
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([0, 0, 1, 1, 1])

# Train XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
model.fit(X, y)

# Save model
joblib.dump(model, "models/xgb_model.joblib")
Out[2]:
['models/xgb_model.joblib']
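
The joblib example above covers sklearn-style estimators. When the artifact includes objects that plain pickle handles poorly (lambdas, closures, locally defined classes), cloudpickle is an option. A minimal sketch, bundling the model trained above with a hypothetical lambda preprocessing step:

import cloudpickle

# A lambda-based transform: something plain pickle cannot serialize
preprocess = lambda df: df * 2

# Bundle the preprocessing function and the trained model into one artifact
bundle = {"preprocess": preprocess, "model": model}
with open("models/xgb_bundle.pkl", "wb") as f:
    cloudpickle.dump(bundle, f)

# Later: restore both pieces from the same file
with open("models/xgb_bundle.pkl", "rb") as f:
    loaded = cloudpickle.load(f)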

🧪 Versioning + Metadata Storage¶

Add Context to Every Saved Model¶

Saving just the .joblib file isn’t enough. Always save associated metadata like:

  • Model version
  • Git commit hash
  • Training date
  • Feature schema
  • Evaluation metrics

Store this in a sidecar JSON file or use a model registry (like MLflow) to manage it.

This becomes critical when multiple models are trained and deployed — especially in tools like Vertex AI Model Registry, Azure ML Model Registry, or SageMaker Model Packages.

In [3]:
import os
import json

# Create the models directory if it doesn't exist
os.makedirs("models", exist_ok=True)

# Metadata dictionary (values below are illustrative placeholders)
metadata = {
    "model_version": "v1.0",
    "git_commit": "a1b2c3d",
    "trained_on": "2025-06-29",
    "features": ["sepal_length", "sepal_width", "petal_length", "petal_width"],
    "accuracy": 0.96
}

# Save sidecar metadata next to the serialized model
with open("models/xgb_model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

Back to the top


🗂️ Dependency Management¶

🧾 requirements.txt vs environment.yml¶

Two Styles of Declaring Dependencies¶
  • requirements.txt: Used with pip. Simple, flat list of packages.
  • environment.yml: Used with conda. Supports channels, dependencies, and environment naming.

💡 Cloud platforms like Vertex AI, SageMaker, and Azure ML all accept both formats — but often prefer requirements.txt for Docker-based custom containers.

If you're using pip:

pip freeze > requirements.txt

If you're using conda:

conda env export > environment.yml
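
For illustration, a minimal pinned environment.yml for this project might look like the following (the versions are placeholders; pin whatever your environment actually resolves to):

name: project_name
channels:
  - conda-forge
dependencies:
  - python=3.11
  - numpy=1.26
  - pandas
  - xgboost
  - joblib
  - pip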

🔐 Deterministic Environments (pip freeze, lock files)¶

Why Just Listing Packages Isn’t Enough¶

Without version pinning, your training and deployment environments may drift over time, leading to subtle bugs.

Best practices:

  • Use pip freeze to pin exact versions for production environments.
  • Use virtual environments (e.g., venv, conda, or poetry) for isolation.
  • Consider lock files (Pipfile.lock, poetry.lock) to enforce deterministic builds.

Tooling options:

  • pip-tools or Poetry if you want extra control (a pip-tools sketch follows this list).
  • In production: use Docker to freeze the entire environment, not just Python dependencies.
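
A sketch of the pip-tools workflow mentioned above, assuming pip-tools is installed: keep a hand-written requirements.in listing only top-level packages, then compile and sync a fully pinned requirements.txt from it.

pip-compile requirements.in -o requirements.txt   # resolves and pins all transitive dependencies
pip-sync requirements.txt                         # makes the active environment match the pinned file exactly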

Back to the top


🧪 Reproducible Training Script¶

📜 Convert Notebook to Script¶

From Experiment to Reusable Code¶

Notebooks are great for exploration, but production workflows need standalone scripts. Converting your training logic into a train.py script allows:

  • Easier automation
  • Integration with CI/CD tools
  • Better version control

Minimal structure for train.py:

  • Load and validate data
  • Preprocess features
  • Train model
  • Evaluate and save model
  • Save metadata (e.g., metrics, timestamp, git hash)

Use a tool like jupyter nbconvert to export the notebook to a script, or manually move the key logic from the notebook into modules.
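
A skeleton along these lines, reusing the XGBoost example from earlier (the file layout, argument names, and metadata fields are illustrative, not prescriptive):

# train.py: minimal reproducible training entry point
import argparse
import datetime
import json

import joblib
import pandas as pd
import xgboost as xgb

def main():
    parser = argparse.ArgumentParser(description="Train and save a model.")
    parser.add_argument("--data", required=True, help="Path to training CSV")
    parser.add_argument("--target", default="label", help="Name of the target column")
    parser.add_argument("--out", default="models/xgb_model.joblib", help="Where to save the model")
    args = parser.parse_args()

    # Load and split data
    df = pd.read_csv(args.data)
    X, y = df.drop(columns=[args.target]), df[args.target]

    # Train and evaluate
    model = xgb.XGBClassifier(eval_metric="logloss")
    model.fit(X, y)
    train_accuracy = float(model.score(X, y))

    # Save the model plus a sidecar metadata file
    joblib.dump(model, args.out)
    metadata = {
        "trained_on": datetime.date.today().isoformat(),
        "n_rows": len(df),
        "train_accuracy": train_accuracy,
    }
    with open(args.out.replace(".joblib", "_metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)

if __name__ == "__main__":
    main()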

🔁 Add Seed Control + Logging¶

Making Training Repeatable and Traceable¶
In [4]:
import numpy as np
import random
import os

def set_seed(seed=42):
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
In [5]:
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

logging.info("Training started")
2025-06-30 12:58:26,540 | INFO | Training started

For production use, consider more robust logging options such as:

  • Python’s built-in logging
  • structlog for structured, key-value logs (a short sketch follows this list)
  • Cloud-native options like AWS CloudWatch, Azure Monitor, or GCP Logging
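
A quick sketch of the structlog style, assuming structlog is installed; each keyword argument is emitted as a structured field rather than folded into a free-form message string:

import structlog

log = structlog.get_logger()
# Key-value pairs become structured fields in the log event
log.info("training_started", model_version="v1.0", n_rows=5)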

Back to the top


🧰 CLI for Prediction¶

🧑‍💻 argparse Interface¶

Why Build a CLI?¶

Command-line interfaces (CLIs) make your model usable without needing a notebook or UI. This is useful for:

  • Quick local testing
  • Batch predictions via cron jobs or scripts
  • Integrating with CI/CD workflows

We'll use Python’s built-in argparse to capture input paths and model paths.

In [14]:
import argparse
import sys

# Simulate CLI args
sys.argv = ['predict.py', '--input', 'models/sample.csv', '--model', 'models/xgb_model.joblib']

parser = argparse.ArgumentParser(description="Run inference using a trained model.")
parser.add_argument('--input', type=str, required=True, help="Path to input CSV or JSON")
parser.add_argument('--model', type=str, default='models/xgb_model.joblib', help="Path to model file")

args = parser.parse_args()

📦 Load Model → Accept Input → Return Output¶

A Minimal End-to-End Prediction Script¶

The script should:

  • Load the trained model from disk
  • Accept a structured input file (CSV or JSON)
  • Return predictions to the console or save them to a file

This structure is usable in most cloud platforms (e.g., GCP AI Platform, SageMaker Processing Jobs).

In [15]:
import pandas as pd
import os

# Create a small sample input file for the prediction example
sample = pd.DataFrame({"feature": [1, 2, 3, 4, 5]})
sample.to_csv("models/sample.csv", index=False)

# Confirm it's created
assert os.path.exists("models/sample.csv"), "models/sample.csv was not created."
In [16]:
import joblib
import pandas as pd

# Load model
model = joblib.load(args.model)

# Load input data
data = pd.read_csv(args.input)

# Predict
preds = model.predict(data)

# Output predictions
print("Predictions:", preds.tolist())
Predictions: [1, 1, 1, 1, 1]
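
To cover the "save them to a file" option mentioned above, one small addition continuing from the cell above (the output path is just an example):

# Optionally persist predictions alongside the other artifacts
pd.DataFrame({"prediction": preds}).to_csv("models/predictions.csv", index=False)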

Back to the top


🐳 Introduction to Docker¶

📦 What Is Docker (In Plain English)¶

Why ML Engineers Love It¶

Docker lets you bundle code, libraries, and dependencies into a single image that runs the same way everywhere — laptop, server, or cloud.

Think of it like a portable box that contains:

  • Your Python environment
  • Your model files
  • Any OS-level packages or dependencies
  • Instructions on how to run your code

Cloud tools like AWS SageMaker, Vertex AI, and Azure ML all support Docker images for custom training or inference jobs.

🧱 Writing a Simple Dockerfile¶

Minimal Dockerfile to Serve a Model¶

This Dockerfile assumes you have:

  • A requirements.txt with your Python dependencies
  • A predict.py file for running inference
  • A models/ directory containing the serialized model and the sample.csv created above
In [17]:
# Save the following content in a file named "Dockerfile" at the project root

# # Use an official lightweight Python image
# FROM python:3.10-slim

# # Set working directory
# WORKDIR /app

# # Copy project files into the image
# COPY requirements.txt .
# COPY predict.py .
# COPY models/ models/   # includes the serialized model and sample.csv

# # Install Python dependencies
# RUN pip install --no-cache-dir -r requirements.txt

# # Run prediction script by default
# ENTRYPOINT ["python", "predict.py"]
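
With the Dockerfile saved at the project root, building and running the image looks roughly like this (the tag model-predict is just an example name; arguments after the image name are passed to the ENTRYPOINT):

docker build -t model-predict .
docker run --rm model-predict --input models/sample.csv --model models/xgb_model.joblib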

Back to the top


🧪 Testing the Packaged Model¶

🧪 Unit Test the CLI¶

Why Test the Interface?¶

Even if your model is correct, your CLI wrapper might break due to:

  • Bad input parsing
  • File path issues
  • Incorrect model loading
  • Missing dependencies

Write a basic test script that:

  • Mocks command-line inputs
  • Confirms the script runs end-to-end without crashing
  • Optionally checks output format or types

These tests can be run with pytest, unittest, or even just shell scripts.

In [18]:
# test_predict.py (example structure)
import subprocess

def test_predict_runs():
    result = subprocess.run(
        ["python", "predict.py", "--input", "models/sample.csv", "--model", "models/xgb_model.joblib"],
        capture_output=True,
        text=True
    )
    assert result.returncode == 0
    assert "Predictions" in result.stdout

🧪 Sanity Check After Serialization¶

Quick Load → Predict Roundtrip¶

Before shipping the model anywhere, confirm that:

  1. The saved model file actually loads
  2. It accepts valid input without crashing
  3. The predictions look consistent with training

This should be part of your dev workflow before Dockerizing or deploying.

In [19]:
import joblib
import pandas as pd

model = joblib.load("models/xgb_model.joblib")
df = pd.read_csv("models/sample.csv")
print("Prediction output:", model.predict(df).tolist())
Prediction output: [1, 1, 1, 1, 1]

Back to the top