
📖 Pipeline Automation¶

🎯 Goal of This Notebook
🔁 Sklearn Pipelines
🧪 MLflow Basics
📊 Tracking Experiments
🏛️ Model Registry Concepts
🧬 Reproducibility Tips


🎯 Goal of This Notebook¶

Why This Notebook Exists¶

As projects grow, manual steps become bottlenecks. This notebook focuses on making your workflows repeatable, trackable, and production-friendly using pipelines and experiment tracking.

You’ll learn to:

  • Use sklearn.pipeline and ColumnTransformer to cleanly chain steps.
  • Track experiments with tools like MLflow, including metrics and artifacts.
  • Compare different model versions in a structured way.
  • Register models with basic metadata to support staging and promotion.

We’ll keep things lightweight but realistic — aligned with how tools like Azure ML, SageMaker, and Vertex AI expect models to be trained and tracked.

Back to the top


🔁 Sklearn Pipelines¶

🧱 Pipeline and ColumnTransformer¶

Cleanly Chain Your Workflow¶

sklearn.pipeline.Pipeline lets you chain multiple steps — like imputation, scaling, and modeling — into one object.

Use ColumnTransformer to apply different preprocessing to numeric vs categorical columns. This helps:

  • Avoid repetitive code
  • Ensure consistent transformations during train and predict
  • Make your model portable and testable

⚙️ Fit → Transform → Predict Flow¶

What Happens Internally¶

When you call pipeline.fit(X, y):

  1. ColumnTransformer.fit() learns scalers / encoders
  2. transform() is applied to produce the processed matrix
  3. The estimator (e.g. LogisticRegression) is trained

Later, calling pipeline.predict(X_new) will:

  • Apply the exact same transforms from training
  • Use the trained model to generate predictions

This avoids manual mismatches and keeps inference consistent.

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Dummy data
df = pd.DataFrame({
    "age": [25, 32, 47],
    "gender": ["M", "F", "M"],
    "purchased": [0, 1, 1]
})

X = df[["age", "gender"]]
y = df["purchased"]

# Define transformations
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), ["age"]),
    ("cat", OneHotEncoder(), ["gender"])
])

# Define pipeline
clf = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression())
])

clf.fit(X, y)
Out[1]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['age']),
                                                 ('cat', OneHotEncoder(),
                                                  ['gender'])])),
                ('classifier', LogisticRegression())])

🧪 Avoiding Leakage (fit vs transform scope)¶

The #1 Pipeline Mistake¶

Never fit preprocessing steps outside the pipeline. If you do:

  • Scalers will learn from the full dataset (including test data)
  • Encoders might see categories that don’t exist in real input
  • Model metrics will look better than they really are

Always fit everything inside the pipeline using only training data. The pipeline will then safely handle unseen data during inference.
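
As a minimal sketch of the difference (reusing the iris data that appears later in this notebook purely for illustration), note that only where fit() happens changes:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_demo, y_demo, random_state=42)

# Leaky pattern (don't do this): fitting the scaler on ALL rows lets
# test-set statistics bleed into training.
# scaler = StandardScaler().fit(X_demo)
# X_train_scaled = scaler.transform(X_train)

# Safe pattern: the scaler lives inside the pipeline and is fit only on
# the training fold; predict/score reuse the learned statistics.
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=200)),
])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))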

Back to the top


🧪 MLflow Basics¶

🧭 What MLflow Tracks¶

High-Level Tracking Capabilities¶

MLflow helps you track and organize model experiments. Each run can log:

  • Parameters – hyperparameters, configs
  • Metrics – accuracy, loss, AUC, etc.
  • Artifacts – models, plots, files
  • Source info – code version, environment, tags

It gives you a web UI to explore results and compare runs — locally or on managed platforms like Databricks, Azure ML, or Amazon SageMaker.

⚙️ Tracking Parameters, Metrics, Artifacts¶

The Core MLflow Logging API¶

Use mlflow.start_run() to begin tracking, then log everything during training. You can log:

  • mlflow.log_param("learning_rate", 0.01)
  • mlflow.log_metric("accuracy", 0.92)
  • mlflow.sklearn.log_model(model, "model")
  • Any file using mlflow.log_artifact("plot.png")

This creates a reproducible record of each experiment.

In [5]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=200)

with mlflow.start_run():
    model.fit(X, y)
    acc = model.score(X, y)
    
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model")
2025/06/30 13:25:17 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.
2025/06/30 13:25:19 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.

🗂️ Local vs Remote Storage Options¶

Where Are Your Runs Saved?¶

By default, MLflow saves runs in a local directory like mlruns/. This is fine for solo work but won’t scale.

Options:

  • Local filesystem – Default, quick and dirty.
  • S3 / GCS / Azure Blob – Use these for artifact storage.
  • Remote tracking server – Use with databases (PostgreSQL, MySQL) to centralize logs.
  • Managed MLflow – Available via Databricks or Azure ML.

Set the MLFLOW_TRACKING_URI environment variable (or call mlflow.set_tracking_uri()) to control where runs are recorded; artifact locations are configured per experiment or via the tracking server's --default-artifact-root.
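
A minimal sketch of pointing the client at a remote setup (the server URL and bucket below are placeholders, not real endpoints):

import mlflow

# Send runs to a remote tracking server instead of ./mlruns (placeholder URL).
mlflow.set_tracking_uri("http://mlflow.internal.example.com:5000")

# Artifact locations are set per experiment (or by the server's
# --default-artifact-root); the bucket name here is a placeholder,
# and creating the experiment is only needed once.
mlflow.create_experiment(
    "churn_experiments",
    artifact_location="s3://my-bucket/mlflow-artifacts",
)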

Back to the top


📊 Tracking Experiments¶

🏷️ Naming Runs & Experiments¶

Keep Your Logs Human-Readable¶

Use mlflow.set_experiment() to group related runs under a common name (e.g., "xgboost_baseline").

You can also label individual runs with mlflow.start_run(run_name="run_01") so they’re easy to identify in the UI.

This becomes essential when you’re running dozens of variations across time or teammates.
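
A small sketch of both calls together (the experiment name, run name, and logged values are just examples):

import mlflow

mlflow.set_experiment("xgboost_baseline")      # groups related runs

with mlflow.start_run(run_name="run_01"):      # readable label in the UI
    mlflow.log_param("max_depth", 6)           # placeholder values
    mlflow.log_metric("auc", 0.91)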

🧼 Organizing Output¶

Store the Right Things, Not Everything¶

You don’t need to log your entire disk — just:

  • Final metrics (accuracy, AUC, etc.)
  • Key parameters (learning rate, max_depth, seed)
  • Final model file
  • A summary plot (like a confusion matrix or ROC curve)

If you log manually, keep a simple structure such as artifacts/plots/ for images and artifacts/metrics.json for numbers.
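
For example, mlflow.log_dict() can write a metrics summary straight to an artifact path, and mlflow.log_artifact() accepts an artifact_path argument so plots land in a tidy subfolder (the file name and values below are placeholders):

import mlflow

with mlflow.start_run():
    # Metrics summary logged directly as a JSON artifact.
    mlflow.log_dict({"accuracy": 0.92, "auc": 0.95}, "metrics/metrics.json")
    # A plot saved earlier in the run (assumed to already exist on disk).
    mlflow.log_artifact("roc_curve.png", artifact_path="plots")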

🔍 Comparing Model Results¶

Spot Patterns Across Experiments¶

Use the MLflow UI or search API to:

  • Sort runs by metrics (e.g., accuracy descending)
  • Compare hyperparameter impacts (e.g., max_depth vs AUC)
  • Filter by tags or run name

This lets you move beyond guesswork and select the best-performing, best-documented run for promotion or deployment.
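
The same comparison can be scripted with the search API; a sketch, assuming the experiment name and metric/param keys used above:

import mlflow

runs = mlflow.search_runs(
    experiment_names=["xgboost_baseline"],     # assumed experiment name
    filter_string="metrics.accuracy > 0.9",    # keep only strong runs
    order_by=["metrics.accuracy DESC"],        # best first
)
print(runs[["run_id", "params.max_depth", "metrics.accuracy"]].head())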

Back to the top


🏛️ Model Registry Concepts¶

🔁 Lifecycle States (Staging, Prod, Archived)¶

What a Model Registry Actually Does¶

A model registry helps manage model versions like software:

  • Staging: Under evaluation or testing
  • Production: Actively used for serving
  • Archived: Deprecated or superseded

Instead of using filenames like model_v7_final_FINAL.joblib, use a registry to track:

  • Who trained it
  • What data it used
  • When it was promoted

Available in tools like:

  • MLflow Model Registry
  • SageMaker Model Registry
  • Vertex AI Model Management

🧪 Promoting + Loading Registered Models¶

Go From Best Run → Production Model¶

Once you select the best-performing run:

  1. Register the model:
    mlflow.register_model("runs:/<run_id>/model", "MyModel")

  2. Transition it to a lifecycle stage:
    client.transition_model_version_stage(...)

  3. Load by name + stage (instead of path):
    mlflow.pyfunc.load_model("models:/MyModel/Production")

This avoids hardcoding file paths or manually copying models across environments.
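
Putting the three steps together with MlflowClient (the run id, model name, and version are placeholders, and newer MLflow releases favor aliases over stages, so treat this as a sketch):

import mlflow
from mlflow.tracking import MlflowClient

# 1. Register the best run's model (run id left as a placeholder).
result = mlflow.register_model("runs:/<run_id>/model", "MyModel")

# 2. Promote that version; archive whatever was in Production before.
client = MlflowClient()
client.transition_model_version_stage(
    name="MyModel",
    version=result.version,
    stage="Production",
    archive_existing_versions=True,
)

# 3. Load by name + stage anywhere, with no hardcoded file paths.
model = mlflow.pyfunc.load_model("models:/MyModel/Production")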

Back to the top


🧬 Reproducibility Tips¶

🧪 Seed Setting + Random State¶

Make Results Stable Across Runs¶

Randomness affects model training (e.g., train/test splits, shuffling, weight init). To make your experiments reproducible:

  • Set random_state in sklearn, XGBoost, etc.
  • Use np.random.seed() and random.seed() for general randomness
  • For PyTorch or TensorFlow, use their dedicated seed APIs

This ensures your metrics are repeatable and fair when comparing runs.
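
A small helper along these lines is common (the name set_seeds is just a convention, not a library API):

import random
import numpy as np

def set_seeds(seed: int = 42) -> None:
    """Seed the global RNGs used by Python and NumPy."""
    random.seed(seed)
    np.random.seed(seed)

set_seeds(42)

# sklearn objects take their own random_state for full determinism, e.g.:
# train_test_split(X, y, test_size=0.2, random_state=42)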

🗂️ Artifact Logging for Reuse¶

Log More Than Just the Model¶

Log any file that might help you or others rerun or understand your results:

  • config.yaml used for training
  • Data schema (e.g., input columns, dtypes)
  • Feature importance plots
  • Version info (requirements.txt, conda.yaml)
  • Evaluation summary or confusion matrix

MLflow lets you attach these as artifacts via mlflow.log_artifact(). This enables traceability even 6 months later when someone asks “what did we ship?”
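
A sketch of attaching this context alongside the model from the earlier MLflow cell (config.yaml and requirements.txt are assumed to exist in the working directory):

import mlflow
import mlflow.sklearn

with mlflow.start_run(run_name="logreg_with_context"):
    mlflow.sklearn.log_model(model, "model")
    # Environment and config files (assumed to exist locally).
    mlflow.log_artifact("requirements.txt")
    mlflow.log_artifact("config.yaml")
    # Data schema logged directly as a JSON artifact.
    mlflow.log_dict({"columns": ["age", "gender"], "target": "purchased"}, "schema.json")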

Back to the top