
📖 Classification¶

🧭 Problem Statement

  • 📌 What is Classification?

📂 Data Setup

  • 📥 Load Dataset
  • 📊 Data Characteristics Dictionary
  • 🔎 EDA
  • 🛠️ Feature Engineering
  • 🧹 Preprocessing

🧪 Baseline Classifier Model

  • 📊 Model Evaluation
  • 📉 Confusion Matrix
  • 📈 ROC Curve / AUC
  • 🧮 Update Best Model Info

🔍 Algorithms

  • 📊 Logistic Regression
  • 🧮 Naive Bayes
  • 🌳 Decision Tree
  • 🌲 Random Forest
  • 🎯 KNN (K-Nearest Neighbors)
  • 📈 SVM (Support Vector Machines)
  • 🚀 XGBoost
  • 🧠 Neural Network

๐Ÿ“Š Model Selection

  • 🧠 Recommend Models
  • 📈 Model Comparison
  • 📊 Feature Importance
  • 🧬 SHAP Values

🛠️ Fine-Tune

  • 🧪 Feature Selection – RFE
  • 🧪 Feature Selection – RFE + SHAP
  • 🔎 Grid Search
  • 🎲 Randomized Search

๐Ÿ”€ Ensemble Methods

  • 🗳️ Voting Classifier
  • 🧬 Stacking Classifier
  • 🪵 Bagging
  • 🚀 Boosting

๐Ÿ“ฆ Export & Deployment

  • 🧊 Pickling
  • 📊 Monitoring Hooks

🧭 Problem Statement¶

📌 What is Classification?

📖 Click to Expand

Classification is a type of supervised machine learning where the goal is to predict a categorical label for an observation. Given a set of features (input data), the model tries to assign the observation to one of several predefined classes.

Common examples of classification problems include:

  • Spam detection: Classifying emails as spam or not.
  • Customer churn prediction: Classifying customers as likely to leave (churn) or stay based on their activity.
  • Image recognition: Classifying images into categories, like identifying animals, vehicles, etc.

In classification, the output is discrete (e.g., 'spam' vs 'not spam', 'churn' vs 'no churn'). This contrasts with regression, where the output is continuous (e.g., predicting a house price).

Key Points
  • Supervised learning approach.
  • Used for predicting categories.
  • Output is discrete (binary or multiclass).
  • Examples: email classification, disease diagnosis, fraud detection.

Back to the top


📂 Data Setup¶

📥 Load Dataset¶

📖 Click to Expand

In this section, we will begin by preparing the dataset. For simplicity, we'll use a simulated classification dataset generated using the make_classification function from sklearn. This allows us to create a synthetic dataset that is suitable for practicing classification tasks.

We will simulate a dataset with the following properties:

  • 1000 samples (observations)
  • 10 features (predictors)
  • 2 informative features (ones that help in prediction)
  • 2 classes (binary classification problem)

Let's generate and take a look at the data.

In [27]:
# Data handling and manipulation
import pandas as pd
import numpy as np

# Machine Learning and Model Evaluation
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Statistical and Other Utilities
from scipy.stats import zscore
from termcolor import colored

# Visualization
import matplotlib.pyplot as plt
In [28]:
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

# Simulate base classification dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    weights=[0.7, 0.3],  # simulate class imbalance
    flip_y=0.01,         # 1% label noise
    class_sep=0.8,       # less separation = harder task
    random_state=42
)

# Create DataFrame
df = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(1, 11)])
target_col = "Target" 
df[target_col] = y

# Inject missing values randomly (e.g., 1% of cells)
# mask = np.random.rand(*df.shape) < 0.01
# df[mask] = np.nan

# Display preview
df.head()
Out[28]:
Feature_1 Feature_2 Feature_3 Feature_4 Feature_5 Feature_6 Feature_7 Feature_8 Feature_9 Feature_10 Target
0 0.959085 -0.066449 0.918572 -0.358079 0.997266 1.181890 -1.415679 -1.210161 -0.828077 1.227274 0
1 -0.910796 -0.566395 -0.940419 0.831617 -1.176962 1.820544 1.552375 -0.984534 0.563896 0.209470 1
2 -0.103769 -0.432774 -0.389454 0.793818 -0.268646 -1.836360 1.039086 -0.246383 -0.858145 -0.297376 1
3 1.580930 2.023606 1.542262 0.006800 -1.607661 0.184741 -2.419427 -0.357445 -1.273127 -0.190039 0
4 -0.006898 -0.711303 0.139918 0.117124 1.536061 0.597538 -0.437329 -0.939156 0.484698 0.236224 0

📊 Data Characteristics Dictionary¶

📖 Click to Expand

This section initializes the data characteristics dictionary, which will store various metadata about the dataset, including details about the target variable, features, data size, and linear separability.

The dictionary contains the following key sections:

  1. 🎯 Target Variable:
    • Type: Specifies whether the target variable is binary or multiclass.
    • Imbalance: Indicates whether the target variable has class imbalance.
    • Class Imbalance Severity: Specifies the severity of the imbalance (e.g., high, low).
  2. 🔧 Features:
    • Type: Describes the type of features in the dataset (e.g., categorical, continuous, or mixed).
    • Correlation: Indicates the correlation between features (e.g., low, medium, high).
    • Outliers: Flag to indicate whether outliers are detected in the features.
    • Missing Data: Tracks the percentage of missing data or flags missing values.
  3. 📈 Data Size:
    • Size: Contains the number of samples (rows) and number of features (columns).
  4. 🔍 Linear Separability:
    • Linear Separability: States whether the classes are linearly separable (True or False).

This dictionary will be updated dynamically as we analyze the dataset in subsequent steps. It serves as a summary of key dataset properties to help guide further analysis and modeling decisions.

In [29]:
# Initialize the data characteristics dictionary
data_characteristics = {
    "target_variable": {
        "type": None,  # "binary", "multiclass"
        "imbalance": None,  # True if imbalanced, False otherwise
        "class_imbalance_severity": None  # e.g., "high", "low"
    },
    "features": {
        "type": None,  # "categorical", "continuous", "mixed"
        "correlation": None,  # "low", "medium", "high"
        "outliers": None,  # True if outliers detected, False otherwise
        "missing_data": None  # Percentage of missing data or boolean
    },
    "data_size": None,  # Size of dataset (samples, features)
    "linear_separability": None  # True if classes are linearly separable
}

🔎 EDA¶

In [30]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# If needed, convert X and y to DataFrame and Series
if isinstance(X, np.ndarray):
    X_df = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(X.shape[1])])
else:
    X_df = X

if isinstance(y, np.ndarray):
    y_series = pd.Series(y, name="Target")
else:
    y_series = y

# Target-related
target_type = "binary" if y_series.nunique() == 2 else "multiclass"
imbalance_ratio = y_series.value_counts(normalize=True).min()
imbalance_flag = imbalance_ratio < 0.4
imbalance_severity = "high" if imbalance_ratio < 0.2 else "low" if imbalance_ratio < 0.4 else "balanced"

# Feature-related
num_cols = X_df.select_dtypes(include=["number"]).shape[1]
cat_cols = X_df.select_dtypes(exclude=["number"]).shape[1]
feature_type = "continuous" if cat_cols == 0 else "categorical" if num_cols == 0 else "mixed"

missing_pct = X_df.isna().mean().mean()
outlier_flag = any(X_df.apply(lambda col: (col > col.mean() + 3 * col.std()) | (col < col.mean() - 3 * col.std())).sum() > 0)

# Correlation level — only if continuous
if feature_type == "continuous":
    corr_matrix = X_df.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    avg_corr = upper_tri.stack().mean()
    corr_level = "high" if avg_corr > 0.7 else "medium" if avg_corr > 0.3 else "low"
else:
    corr_level = "N/A"

# Final update
data_characteristics.update({
    "target_variable": {
        "type": target_type,
        "imbalance": imbalance_flag,
        "class_imbalance_severity": imbalance_severity
    },
    "features": {
        "type": feature_type,
        "correlation": corr_level,
        "outliers": outlier_flag,
        "missing_data": f"{missing_pct:.2%}"
    },
    "data_size": X_df.shape,
    "linear_separability": None
})
In [31]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Infer positive class
positive_class = y_series.unique()[1] if len(y_series.unique()) == 2 else 1

# PCA to 2D
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_df)

# Train/test split
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(
    X_pca, y_series, test_size=0.2, random_state=42, stratify=y_series
)

# Fit linear model
clf = LogisticRegression()
clf.fit(X_pca_train, y_pca_train)
y_pred_pca = clf.predict(X_pca_test)

# F1-based separability score
f1_pca = f1_score(y_pca_test, y_pred_pca, pos_label=positive_class, zero_division=0)
data_characteristics["linear_separability"] = f1_pca > 0.75  # Adjustable threshold

# Optional print
print(f"โœ… Linear separability (2D PCA, Logistic F1): {f1_pca:.2f}")
print(f"โ†ช Updated: linear_separability = {data_characteristics['linear_separability']}")
if f1_pca > 0.85:
    interpretation = "Strong linear separability in 2D โ€” linear models likely to perform well."
elif f1_pca > 0.7:
    interpretation = "Moderate linear separability โ€” linear models may work with tuning."
else:
    interpretation = "Poor linear separability โ€” expect better results with non-linear models."

print(f"๐Ÿ“Œ Interpretation: {interpretation}")
โœ… Linear separability (2D PCA, Logistic F1): 0.74
โ†ช Updated: linear_separability = False
๐Ÿ“Œ Interpretation: Moderate linear separability โ€” linear models may work with tuning.
In [32]:
from pprint import pprint
pprint(data_characteristics)
{'data_size': (1000, 10),
 'features': {'correlation': 'low',
              'missing_data': '0.00%',
              'outliers': True,
              'type': 'continuous'},
 'linear_separability': False,
 'target_variable': {'class_imbalance_severity': 'low',
                     'imbalance': True,
                     'type': 'binary'}}

🛠️ Feature Engineering¶

  • Omitted here

🧹 Preprocessing¶

In [33]:
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop(columns=target_col)
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("โœ… Data split complete:")
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
โœ… Data split complete:
Train size: 800, Test size: 200

Back to the top


🧪 Baseline Classifier Model¶

📖 Click to Expand

In this section, we define the baseline model for the classification task. The baseline model is typically a dummy model that more sophisticated models can be compared against. Here, we use the DummyClassifier, which predicts the majority class, to set a baseline performance.

The baseline model will help us assess whether more advanced models (e.g., Random Forest, SVM) are making meaningful improvements over a simple strategy.

🧠 Why Track best_model_info?

In real-world pipelines, it's critical to:

  • Compare models not just by accuracy, but by a full suite of metrics.
  • Store the actual model object, hyperparameters, and diagnostics in one place.
  • Ensure only the best-performing model (based on a chosen metric like F1 or AUC) is promoted forward.
In [34]:
# Initialize central tracker dictionary to record best-model details across iterations
best_model_info = {
    "name": None,
    "model": None,
    "metrics": {
        "train": {
            "accuracy": -np.inf,
            "precision": -np.inf,
            "recall": -np.inf,
            "f1": -np.inf,
            "roc_auc": -np.inf
            # Note: confusion_matrix and classification_report omitted for train
            # because they're redundant and cluttered for internal training fit
        },
        "test": {
            "accuracy": -np.inf,
            "precision": -np.inf,
            "recall": -np.inf,
            "f1": -np.inf,
            "roc_auc": -np.inf,
            "confusion_matrix": None,
            "classification_report": None
        }
    },
    "hyperparameters": None
}

# Dictionary to store all model performance results for comparison
model_results = {}
In [35]:
# Metric to decide which model is "best"
# Common choices (ranked by practical usage):
# 1. "f1"        โ†’ balanced precision/recall (default choice, esp. with class imbalance)
# 2. "roc_auc"   โ†’ good for imbalanced classes, uses probability scores
# 3. "accuracy"  โ†’ only when classes are balanced and all errors are equal
# 4. "precision" โ†’ when false positives are costly (e.g., spam detection)
# 5. "recall"    โ†’ when false negatives are costly (e.g., fraud, cancer)

# Success metric used to select the best model
success_metric = "f1"  # or "roc_auc", depending on use case
# success_split = "test"  # "train" or "test"
In [36]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Fit a dummy classifier as a baseline
dummy_clf = DummyClassifier(strategy="most_frequent")  # or try "stratified", "uniform"
dummy_clf.fit(X_train, y_train)
Out[36]:
DummyClassifier(strategy='most_frequent')
In [37]:
# Predict on both train and test
y_train_pred = dummy_clf.predict(X_train)
y_test_pred = dummy_clf.predict(X_test)

📊 Model Evaluation¶

📖 Click to Expand
  • Accuracy: Overall correctness. Misleading when classes are imbalanced.
  • Precision: Of predicted positives, how many are truly positive? Important when false positives are costly.
  • Recall: Of actual positives, how many did we catch? Crucial when missing positives is expensive.
  • F1 Score: Harmonic mean of precision and recall. Useful when you care about balance.
  • ROC AUC: Probability a random positive ranks above a random negative. Good for probability-based classifiers.
📖 Click to Expand

Precision, Recall, and F1 Score are classification metrics that help us understand model performance beyond just accuracy:

  • Precision: Of all predicted positives, how many were actually correct? (Low precision = many false alarms)
  • Recall: Of all actual positives, how many did we catch? (Low recall = missed positives)
  • F1 Score: Harmonic mean of precision and recall — useful when classes are imbalanced.

Business Perspective:

  • If false positives are costly (e.g., spam filters, fraud flags), precision matters more.
  • If missing positives is risky (e.g., cancer detection), recall is critical.
  • F1 balances both and gives a single, interpretable metric.

These metrics are vital when accuracy is misleading — especially in skewed datasets.

In [38]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Technical output
print("๐Ÿ“‰ Classification Report\n")
print(classification_report(y_test, y_test_pred))
๐Ÿ“‰ Classification Report

              precision    recall  f1-score   support

           0       0.70      1.00      0.82       140
           1       0.00      0.00      0.00        60

    accuracy                           0.70       200
   macro avg       0.35      0.50      0.41       200
weighted avg       0.49      0.70      0.58       200

/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Determine positive class once
positive_class = y_train.unique()[1] if len(y_train.unique()) == 2 else 1

def evaluate_model(y_true, y_pred, label="Model"):
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=positive_class, average='binary', zero_division=0)
    rec  = recall_score(y_true, y_pred, pos_label=positive_class, average='binary', zero_division=0)
    f1   = f1_score(y_true, y_pred, pos_label=positive_class, average='binary', zero_division=0)

    # Aligned core metrics
    print(f"\n๐Ÿ“Š {label} โ€” Performance Summary:")
    print(f"- Accuracy  : {acc :>7.2%} โ†’ Overall correctness.")
    print(f"- Precision : {prec:>7.2%} โ†’ Of predicted '{positive_class}', how many were right.")
    print(f"- Recall    : {rec :>7.2%} โ†’ Of actual '{positive_class}', how many we caught.")
    print(f"- F1 Score  : {f1  :>7.2%} โ†’ Balance of precision & recall.")

    # Business interpretation
    print("\n๐Ÿ“Œ Interpretation:")
    if prec < 0.6:
        print("- High false positives โ†’ risky if false alarms are costly.")
    else:
        print("- Precision looks acceptable; false positives under control.")

    if rec < 0.6:
        print("- High false negatives โ†’ risky if missing positives is costly.")
    else:
        print("- Recall is strong; model is catching true cases well.")

    print(f"- F1 Score shows overall tradeoff quality: {f1:.2f}")

# Example usage
evaluate_model(y_test, y_test_pred, label="Baseline Classifier")
๐Ÿ“Š Baseline Classifier โ€” Performance Summary:
- Accuracy  :  70.00% โ†’ Overall correctness.
- Precision :   0.00% โ†’ Of predicted '1', how many were right.
- Recall    :   0.00% โ†’ Of actual '1', how many we caught.
- F1 Score  :   0.00% โ†’ Balance of precision & recall.

๐Ÿ“Œ Interpretation:
- High false positives โ†’ risky if false alarms are costly.
- High false negatives โ†’ risky if missing positives is costly.
- F1 Score shows overall tradeoff quality: 0.00

📉 Confusion Matrix¶

📖 Click to Expand

The confusion matrix is an N×N table that helps us visualize the performance of a classification model.

📖 Confusion Matrix Terminology:

  • True Positive (TP): Predicted = Positive, Actual = Positive
  • False Positive (FP): Predicted = Positive, Actual = Negative
  • True Negative (TN): Predicted = Negative, Actual = Negative
  • False Negative (FN): Predicted = Negative, Actual = Positive
                  Predicted
                ┌─────────┬─────┐
                │    0    │  1  │
         ┌──────┼─────────┼─────┤
Actual   │  0   │   TN    │  FP │  ← Specificity = TN / (TN + FP) = True Negative Rate (TNR)
         │  1   │   FN    │  TP │  ← Recall = TP / (TP + FN) = Sensitivity, TPR, Hit Rate
         └──────┴─────────┴─────┘
                              ↑
                              └─ Precision = TP / (TP + FP) = Positive Predictive Value
In [40]:
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion(y_true, y_pred, model_name="Model"):
    """
    Plot a confusion matrix with count and percentage annotations.
    Warns if y_pred contains unseen labels not present in y_true.
    """
    # Robust label set
    labels = np.unique(np.concatenate([y_true, y_pred]))
    
    # Check for potential leakage or mismatch
    unseen_preds = set(y_pred) - set(y_true)
    if unseen_preds:
        print(f"\033[91mโš ๏ธ Warning: y_pred contains unseen class labels: {unseen_preds} โ€” "
              f"this may indicate leakage or label mismatch.\033[0m")

    # Compute confusion matrix and percentages
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm)
    cm_perc = cm / cm_sum * 100

    # Annotate with count and %
    annot = np.empty_like(cm).astype(str)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            c = cm[i, j]
            p = cm_perc[i, j]
            annot[i, j] = f"{c}\n({p:.1f}%)"

    # Plot
    plt.figure(figsize=(3, 2))
    sns.heatmap(cm, annot=annot, fmt="", cmap="Blues", cbar=True,
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.title(f"Confusion Matrix ({model_name})")
    plt.tight_layout()
    plt.show()

plot_confusion(y_test, y_test_pred, model_name="Baseline Classifier")

📈 ROC Curve / AUC¶

📖 Click to Expand

ROC Curve (Receiver Operating Characteristic) plots the True Positive Rate (TPR) vs False Positive Rate (FPR) across different threshold values.

  • A model that randomly guesses would fall along the diagonal (AUC = 0.5)
  • A perfect model hugs the top-left corner (AUC = 1.0)

AUC (Area Under the Curve) quantifies overall separability between the two classes:

  • Technical Insight: Higher AUC means better discrimination between positive and negative cases.
  • Business Relevance: Especially useful when false positives and false negatives have different costs — like fraud detection, churn prediction, etc.

This plot lets stakeholders quickly gauge how good the model is — regardless of classification threshold.

In [41]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

def plot_roc_auc(model, X_test, y_test, model_name="Model"):
    """
    Plot ROC curve, print AUC score, and give business-facing interpretation.
    """
    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_scores = model.decision_function(X_test)
    else:
        raise ValueError("Model does not support probability estimates or decision function.")
    
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    auc_score = roc_auc_score(y_test, y_scores)
    
    # Plot
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")
    plt.plot([0, 1], [0, 1], "k--", label="Random Guess")
    plt.xlabel("False Positive Rate (1 - Specificity)")
    plt.ylabel("True Positive Rate (Recall / Sensitivity)")
    plt.title(f"ROC Curve ({model_name})")
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Output
    print(f"๐Ÿ”น ROC AUC Score for {model_name}: {auc_score:.4f}")
    if auc_score <= 0.55:
        print("๐Ÿ“Œ Interpretation: Model performs at or near random. It cannot meaningfully separate classes.")
    elif auc_score < 0.7:
        print("๐Ÿ“Œ Interpretation: Some separability, but not reliable yet. Needs improvement.")
    else:
        print("๐Ÿ“Œ Interpretation: Model is doing a good job distinguishing between classes.")

plot_roc_auc(dummy_clf, X_test, y_test, model_name="Baseline Classifier")
๐Ÿ”น ROC AUC Score for Baseline Classifier: 0.5000
๐Ÿ“Œ Interpretation: Model performs at or near random. It cannot meaningfully separate classes.

🧮 Update Best Model Info¶

In [42]:
from termcolor import colored
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

def update_best_model(model_name, model_obj, y_train, y_test, y_train_pred, y_test_pred, hyperparameters=None):
    """
    Computes metrics internally, updates best_model_info if model outperforms current best.
    Also logs all model results.
    """
    # Evaluate performance
    metrics = {
        "train": {
            "accuracy": accuracy_score(y_train, y_train_pred),
            "precision": precision_score(y_train, y_train_pred, pos_label=positive_class, zero_division=0),
            "recall": recall_score(y_train, y_train_pred, pos_label=positive_class, zero_division=0),
            "f1": f1_score(y_train, y_train_pred, pos_label=positive_class, zero_division=0),
            "roc_auc": roc_auc_score(y_train, model_obj.predict_proba(X_train)[:, 1])
        },
        "test": {
            "accuracy": accuracy_score(y_test, y_test_pred),
            "precision": precision_score(y_test, y_test_pred, pos_label=positive_class, zero_division=0),
            "recall": recall_score(y_test, y_test_pred, pos_label=positive_class, zero_division=0),
            "f1": f1_score(y_test, y_test_pred, pos_label=positive_class, zero_division=0),
            "roc_auc": roc_auc_score(y_test, model_obj.predict_proba(X_test)[:, 1]),
            "confusion_matrix": confusion_matrix(y_test, y_test_pred),
            "classification_report": classification_report(y_test, y_test_pred, output_dict=True)
        }
    }

    # Compare with current best
    current_score = metrics["test"][success_metric]
    best_score = best_model_info["metrics"]["test"].get(success_metric, -1)
    previous_best = best_model_info["name"] or "None"

    if current_score > best_score:
        best_model_info.update({
            "name": model_name,
            "model": model_obj,
            "metrics": metrics,
            "hyperparameters": hyperparameters or {}
        })
        print(colored(
            f"✅ {model_name} just beat previous best ({previous_best}) → "
            f"{success_metric}: {best_score:.4f} → {current_score:.4f}", "green"))
        # print(f"📊 Current Test Performance:")
        # for metric in ["accuracy", "precision", "recall", "f1", "roc_auc"]:
        #     val = metrics["test"][metric]
        #     print(f"- {metric.capitalize():<9}: {val:.4f}")

    # Log all model results
    model_results[model_name] = {
        "model": model_obj,
        "metrics": metrics,
        "hyperparameters": hyperparameters or {}
    }
In [43]:
update_best_model(
    model_name="DummyClassifier",
    model_obj=dummy_clf,
    y_train=y_train,
    y_test=y_test,
    y_train_pred=y_train_pred,
    y_test_pred=y_test_pred,
    hyperparameters={"strategy": "most_frequent"}
)
✅ DummyClassifier just beat previous best (None) → f1: -inf → 0.0000
/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/Users/ashrithreddy/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [44]:
# from pprint import pprint
# pprint(best_model_info)
# pprint(model_results)

# import json
# print(json.dumps(best_model_info, indent=2, default=str))
# print(json.dumps(model_results, indent=2, default=str))

Back to the top


🔍 Algorithms¶

📖 Click to Expand - Suitability Checklist (All Models)
Criterion | 📊 Logistic Regression | 🧮 Naive Bayes | 🌳 Decision Tree | 🌲 Random Forest | 🎯 KNN (K-Nearest Neighbors) | 📈 SVM (Support Vector Machines) | 🚀 XGBoost | 🧠 Neural Network
Interpretability | ✅ Excellent – coefficients are directly interpretable | ✅ Good – conditional probabilities are intuitive | ✅ Very good – rules and splits are easily visualized and explained | ❌ Low – individual trees interpretable, but ensemble is a black box | ⚠️ Moderate – intuitive idea, but no explicit model or coefficients | ⚠️ Moderate – support vectors and margins can be visualized in 2D, but overall less intuitive | ❌ Low – complex ensemble; partial plots or SHAP values needed for insight | ❌ Very low – acts as a black box unless aided by techniques like SHAP, LIME
Linearity Expectation | ⚠️ Yes – assumes linear relationship between features and log-odds | ⚠️ Assumes feature independence → not truly linear or interactive | ✅ No – naturally captures non-linear relationships | ✅ No – captures complex non-linear relationships | ✅ No – captures non-linear patterns based on local neighborhoods | ⚠️ Depends – linear SVM assumes linear separability; kernel SVM handles non-linear | ✅ No – naturally models complex non-linear relationships | ✅ No – inherently models non-linear and complex interactions
High dimensionality | ✅ Good – performs well with many features (with regularization) | ✅ Very good – handles high-dimensional sparse data well | ⚠️ Moderate – can overfit with too many features unless pruned | ✅ Good – handles many features well via feature bagging | ❌ Poor – suffers from the curse of dimensionality; distances become less meaningful | ✅ Very good – especially effective in high-dimensional spaces (e.g., text data) | ✅ Excellent – handles many features via regularization and feature importance | ✅ Excellent – scales well with many features and large data
Handling of multicollinearity | ❌ Needs treatment – regularization helps, but still sensitive | ❌ Poor – assumes feature independence; correlated features hurt performance | ✅ Handles it – but may create instability in splits | ✅ Handles it – less sensitive due to random feature selection per tree | ❌ Problematic – redundant features distort distance metrics | ⚠️ Can be sensitive – especially in linear SVM; use regularization | ✅ Handles well – trees split on the most useful among correlated features | ✅ Handles – internal weights adjust during training, but correlated inputs may slow convergence
Handling of categorical features | ❌ Needs preprocessing – requires one-hot or ordinal encoding | ⚠️ Needs preprocessing – requires encoding, but categorical NB variants exist | ⚠️ Partial – numerical encoding needed; some implementations support direct handling | ⚠️ Requires encoding – label or one-hot encoding typically needed | ❌ Not natively supported – requires careful encoding and distance handling | ❌ Not supported – requires one-hot or other encoding | ⚠️ Needs encoding – label encoding typically used; native support improving | ❌ Not native – requires one-hot encoding or embeddings
Handling of outliers | ❌ Sensitive – can distort coefficients significantly | ❌ Very sensitive – assumes Gaussian or other strict distributional forms | ✅ Robust – splits are based on thresholds, not sensitive to extreme values | ✅ Robust – not sensitive due to median-based splits and ensembling | ❌ Sensitive – local distance-based voting easily skewed by outliers | ❌ Sensitive – margin-based optimization gets distorted by outliers | ✅ Robust – trees are insensitive to extreme values | ❌ Sensitive – can destabilize training; often mitigated with preprocessing
Handling of missing values | ❌ Not supported – requires imputation | ❌ Not supported – requires imputation | ⚠️ Limited – some implementations handle missing splits, others need imputation | ⚠️ Some support – not native in all implementations; imputation often needed | ❌ Not supported – requires complete-case or imputation preprocessing | ❌ Not supported – must impute before training | ✅ Built-in – learns optimal path for missing values during tree construction | ❌ Not supported – must be imputed before training
Scaling of features needed | ⚠️ Yes – especially important when using regularization | ⚠️ Sometimes – required if Gaussian NB is used (assumes normal distribution) | ✅ Not needed – uses raw feature values for splitting | ✅ Not needed – tree splits are scale-invariant | ✅ Yes – essential, as distance calculations are affected by feature magnitudes | ✅ Yes – essential due to reliance on distance and dot products | ✅ Not needed – tree-based, scale-invariant | ✅ Yes – critical for stable and fast convergence (e.g., standardization or normalization)
Class Imbalance problem | ⚠️ Needs adjustment – use class_weight='balanced' or resampling | ⚠️ Needs adjustment – priors can be tuned or class weights applied manually | ❌ Poor – biased toward majority class unless adjusted with class_weight or sampling | ⚠️ Needs adjustment – use class_weight='balanced' or stratified sampling | ❌ Poor – biased toward majority class due to majority voting | ⚠️ Needs adjustment – use class_weight='balanced' or tune C and margins | ✅ Handled – use scale_pos_weight, custom loss, or sampling | ⚠️ Needs care – custom loss functions, class weights, or resampling required
Handling of sparseness in data | ✅ Works fine – especially with L1 regularization for feature selection | ✅ Excellent – especially performant in text classification or bag-of-words models | ⚠️ Depends – not ideal for extremely sparse datasets (e.g., text data) | ⚠️ Moderate – not ideal for extreme sparsity (e.g., NLP bag-of-words) | ❌ Weak – sparse vectors make distance metrics ineffective | ✅ Good – works well in high-dimensional sparse spaces (esp. linear SVM) | ✅ Excellent – designed to handle sparse matrices natively | ⚠️ Depends – not ideal unless using sparse-aware architectures or embedding layers
Accuracy | Moderate – often outperformed by tree-based models for complex, non-linear patterns. | Surprisingly strong baseline for some problems (e.g., NLP); weak when feature independence assumption breaks. | Prone to overfitting if unpruned; weak alone but powerful as base learners in ensembles. | ✅ Strong – robust out-of-the-box performance with low overfitting risk. | ⚠️ Highly data-dependent – can perform well with clean, balanced, low-dimensional data. | ✅ High – strong performance on well-separated data, especially with good kernel choice. | ✅ Top-tier – one of the most accurate out-of-the-box models for tabular data. | ✅ High – can outperform other models with enough data and tuning, especially on non-tabular data.
Training speed | ✅ Fast – very efficient even on large datasets. | ✅ Extremely fast – almost instantaneous to train. | ✅ Fast – quick to train on moderate-sized datasets. | ⚠️ Slower than single models – parallelizable but can be compute-heavy. | ✅ Fast training, ❌ Slow inference – lazy learner, evaluates at prediction time. | ❌ Slow – especially on large datasets or with complex kernels. | ⚠️ Slower – faster than many ensembles, but heavier than single models; GPU support helps. | ❌ Slow – resource-intensive; requires tuning and hardware for best performance.

📊 Logistic Regression¶

📖 Click to Expand
🔍 What is Logistic Regression?

Despite the name, Logistic Regression is used for classification — not regression.
It predicts the probability that an observation belongs to a certain class (e.g., 0 or 1).
Under the hood, it fits a weighted formula to the input features, applies a sigmoid function, and outputs a value between 0 and 1.

Example:
A model might say there's a 78% chance this customer will churn.
If that crosses a certain threshold (say, 50%), we classify it as "Yes."

✅ Pros vs ❌ Cons
Pros | Cons
Fast and efficient | Assumes linear relationship (log-odds)
Easy to interpret (feature weights) | Doesn't handle complex patterns well
Works well with small datasets | Sensitive to multicollinearity
Outputs probabilities | May underperform on nonlinear data
🧠 When to Use
  • You want a quick baseline with interpretable output
  • You care about probabilities, not just labels
  • Your data is fairly linearly separable
  • The number of features is small to medium
⚠️ Pitfalls & Hacks
  • Pitfall: If features are highly correlated (multicollinearity), the model may become unstable. Use regularization (e.g., L2 penalty).
  • Hack: For imbalanced datasets, adjust the threshold or use class_weight='balanced' to avoid bias toward the majority class.
  • Tip: Standardize features before training, especially if using regularization.
📖 Click to Expand - Inner Workings
🧮 Logistic Regression – Internal Workflow

Logistic Regression builds a model to predict probabilities using a sigmoid transformation over a linear combination of features.
Even though the dataset doesn't contain any column called z (the logit), the model constructs it using weights it learns through training.

📊 Toy Dataset
Row | Hours Studied (x) | Pass? (y)
1 | 1 | 0
2 | 2 | 0
3 | 3 | 0
4 | 4 | 1
5 | 5 | 1
6 | 6 | 1
🔍 Internal Steps (Training Loop)
  1. Initialize weights: Start with arbitrary w and b (e.g., 0)
  2. Compute logit: z = w * x + b
  3. Apply sigmoid: ŷ = 1 / (1 + exp(-z)) → gives predicted probability
  4. Compute log loss: L = - [ y log(ŷ) + (1 - y) log(1 - ŷ) ]
  5. Compute gradients: Derivatives of loss w.r.t. w and b
  6. Update weights: Adjust w and b using gradient descent
  7. Repeat: Loop steps 2–6 until convergence (loss stops improving)
✅ Convergence Goal

The model searches for the final coefficients (w*, b*) that minimize total log loss across all rows. This is the training objective.
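
The loop above can be written out directly in NumPy. The following is a minimal sketch of batch gradient descent on the toy dataset, purely for illustration; it is not what sklearn's LogisticRegression does internally (that uses optimized solvers such as lbfgs), and the learning rate and iteration count are arbitrary choices.

import numpy as np

# Toy dataset from the table above: hours studied (x) and pass/fail label (y)
x = np.array([1, 2, 3, 4, 5, 6], dtype=float)
y = np.array([0, 0, 0, 1, 1, 1], dtype=float)

w, b = 0.0, 0.0   # Step 1: arbitrary starting weights
lr = 0.1          # learning rate (illustrative choice)

for _ in range(3000):
    z = w * x + b                                                      # Step 2: logit
    y_hat = 1 / (1 + np.exp(-z))                                       # Step 3: sigmoid → predicted probability
    loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))   # Step 4: mean log loss
    grad_w = np.mean((y_hat - y) * x)                                  # Step 5: gradient w.r.t. w
    grad_b = np.mean(y_hat - y)                                        # Step 5: gradient w.r.t. b
    w -= lr * grad_w                                                   # Step 6: gradient descent update
    b -= lr * grad_b                                                   # Step 7: repeat until loss stops improving

print(f"w* = {w:.3f}, b* = {b:.3f}, final log loss = {loss:.4f}")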

📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ✅ Excellent – coefficients are directly interpretable
Linearity Expectation | ⚠️ Yes – assumes linear relationship between features and log-odds
High dimensionality | ✅ Good – performs well with many features (with regularization)
Handling of multicollinearity | ❌ Needs treatment – regularization helps, but still sensitive
Handling of categorical features | ❌ Needs preprocessing – requires one-hot or ordinal encoding
Handling of outliers | ❌ Sensitive – can distort coefficients significantly
Handling of missing values | ❌ Not supported – requires imputation
Scaling of features needed | ⚠️ Yes – especially important when using regularization
Class Imbalance problem | ⚠️ Needs adjustment – use class_weight='balanced' or resampling
Handling of sparseness in data | ✅ Works fine – especially with L1 regularization for feature selection

General comment on accuracy: Moderate – often outperformed by tree-based models for complex, non-linear patterns.

General comment on training speed: ✅ Fast – very efficient even on large datasets.

In [45]:
# 1. Train model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
Out[45]:
LogisticRegression()
In [46]:
# 2. Show the learned equation
w = model.coef_[0]        # learned feature weights
b = model.intercept_[0]   # learned bias term

print("🧠 Learned Logistic Equation:\n")

terms = [f"({w[i]:+.4f})·{col}" for i, col in enumerate(X_train.columns)]
equation = " +\n    ".join(terms)
print("z =\n    " + equation)
print(f"  + ({b:+.4f})  ← bias\n")
🧠 Learned Logistic Equation:

z =
    (-0.3652)·Feature_1 +
    (+0.2297)·Feature_2 +
    (-0.5858)·Feature_3 +
    (+0.0253)·Feature_4 +
    (+0.0175)·Feature_5 +
    (+0.0402)·Feature_6 +
    (+1.2599)·Feature_7 +
    (-0.1435)·Feature_8 +
    (-0.4557)·Feature_9 +
    (-0.0027)·Feature_10
  + (-0.7569)  ← bias

In [47]:
# 3. Manually compute z, sigmoid, and prediction
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

X_sample = X_test.iloc[:10].copy()
z_vals = np.dot(X_sample, w) + b
probs = sigmoid(z_vals)
preds = (probs >= 0.5).astype(int)
In [48]:
# 4. Print table showing internals
diagnostics = X_sample.copy()
diagnostics["z = wยทx + b"] = np.round(z_vals, 4)
diagnostics["sigmoid(z) = prob"] = np.round(probs, 4)
diagnostics["prediction"] = preds
diagnostics["true_label"] = y_test.iloc[:10].values
diagnostics["log_loss_row"] = -(
    diagnostics["true_label"] * np.log(diagnostics["sigmoid(z) = prob"]) +
    (1 - diagnostics["true_label"]) * np.log(1 - diagnostics["sigmoid(z) = prob"])
).round(4)

print("\n๐Ÿ“Š Internal Breakdown (First 10 Test Rows):")
display(diagnostics)
๐Ÿ“Š Internal Breakdown (First 10 Test Rows):
Feature_1 Feature_2 Feature_3 Feature_4 Feature_5 Feature_6 Feature_7 Feature_8 Feature_9 Feature_10 z = wยทx + b sigmoid(z) = prob prediction true_label log_loss_row
595 0.178986 1.033881 0.488455 -0.407460 -0.574101 0.536414 -1.232457 0.123480 0.881295 0.907962 -2.8442 0.0550 0 0 0.0566
868 0.994921 0.202329 0.936990 1.631857 -1.269330 1.702515 -1.419999 1.818062 -0.910983 -0.733033 -3.1678 0.0404 0 0 0.0412
406 0.030370 0.967794 0.407555 0.398992 -1.271549 -1.153084 -1.200730 0.324658 1.210347 1.050049 -2.9566 0.0494 0 0 0.0507
815 1.248285 -1.087246 1.198098 -1.085825 -0.675708 0.034152 -1.850321 -1.148794 -1.069471 0.679373 -3.8832 0.0202 0 0 0.0204
762 0.070351 -0.115110 0.300255 -0.345919 -1.391958 1.704102 -0.815085 -1.121751 0.700136 0.889154 -2.1370 0.1056 0 0 0.1116
229 0.049136 0.862543 0.306325 -1.694973 -1.426827 0.069337 -0.864365 0.817306 0.804673 0.820232 -2.3964 0.0835 0 0 0.0872
445 -0.300191 0.372648 -0.264597 0.166493 -1.285599 -1.615846 0.373120 0.243475 0.334053 1.542736 -0.2111 0.4474 0 1 0.8043
691 0.370534 0.184971 0.508500 0.452756 0.576451 -1.508556 -1.016108 -0.770819 0.181994 1.707330 -2.4438 0.0799 0 0 0.0833
625 0.263054 -0.335138 0.132288 0.342338 1.987061 -0.530971 -0.022842 0.853976 -0.618068 1.554160 -0.8592 0.2975 0 0 0.3531
697 0.843541 0.978422 0.785313 0.522143 0.975312 0.515628 -1.176116 -0.330789 -0.802144 -1.103670 -2.3149 0.0899 0 0 0.0942
In [49]:
# 5. Calculate and print total log loss
from sklearn.metrics import log_loss
total_loss = log_loss(y_test, model.predict_proba(X_test)[:, 1])
print(f"\n๐Ÿ“‰ Total Log Loss: {total_loss:.4f}")
๐Ÿ“‰ Total Log Loss: 0.3407

🧮 Naive Bayes¶

📖 Click to Expand
🔍 What is Naive Bayes?

Naive Bayes is a family of probabilistic classifiers based on Bayes' Theorem.
It assumes that all features are independent of each other — which is rarely true in practice, but the model still performs surprisingly well.

It calculates the probability of each class given the input features and picks the class with the highest likelihood.

Example:
"Given these symptoms, what's the most probable disease?" — Naive Bayes is widely used in text classification, spam detection, and medical diagnosis.

✅ Pros vs ❌ Cons
Pros | Cons
Very fast and scalable | Assumes feature independence (naive)
Handles high-dimensional data well | May underperform with correlated inputs
Simple and interpretable | Struggles with numeric feature scaling
Works well with text data | Outputs are often overconfident
🧠 When to Use
  • You're working with text (e.g., spam filters, sentiment)
  • You want a fast baseline
  • You're dealing with high-dimensional, sparse features (like TF-IDF)
  • You have clean categorical or binary features
⚠️ Pitfalls & Hacks
  • Pitfall: Doesn't handle continuous features naturally — convert them to bins or use GaussianNB.
  • Hack: Apply Laplace smoothing to handle zero probabilities in unseen combinations.
  • Tip: Don't expect high accuracy on raw numeric data — it shines in text-like scenarios.
📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ✅ Good – conditional probabilities are intuitive
Linearity Expectation | ⚠️ Assumes feature independence → not truly linear or interactive
High dimensionality | ✅ Very good – handles high-dimensional sparse data well
Handling of multicollinearity | ❌ Poor – assumes feature independence; correlated features hurt performance
Handling of categorical features | ⚠️ Needs preprocessing – requires encoding, but categorical NB variants exist
Handling of outliers | ❌ Very sensitive – assumes Gaussian or other strict distributional forms
Handling of missing values | ❌ Not supported – requires imputation
Scaling of features needed | ⚠️ Sometimes – required if Gaussian NB is used (assumes normal distribution)
Class Imbalance problem | ⚠️ Needs adjustment – priors can be tuned or class weights applied manually
Handling of sparseness in data | ✅ Excellent – especially performant in text classification or bag-of-words models

General comment on accuracy: Surprisingly strong baseline for some problems (e.g., NLP); weak when feature independence assumption breaks.

General comment on training speed: ✅ Extremely fast – almost instantaneous to train.
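
Unlike the baseline and Logistic Regression sections, no training cell follows here, so below is a minimal sketch (assuming the X_train/X_test split and the evaluate_model / update_best_model helpers defined earlier) of fitting GaussianNB on the continuous features; the variable name nb_clf is illustrative.

from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the continuous features
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# Reuse the shared evaluation and tracking helpers defined earlier
y_train_pred_nb = nb_clf.predict(X_train)
y_test_pred_nb = nb_clf.predict(X_test)
evaluate_model(y_test, y_test_pred_nb, label="Naive Bayes")
update_best_model(
    model_name="GaussianNB",
    model_obj=nb_clf,
    y_train=y_train, y_test=y_test,
    y_train_pred=y_train_pred_nb, y_test_pred=y_test_pred_nb,
    hyperparameters=nb_clf.get_params()
)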

🌳 Decision Tree¶

📖 Click to Expand
🔍 What is a Decision Tree?

A Decision Tree splits data into branches based on feature values, creating a flowchart-like structure.
Each split is chosen to maximize class separation (typically using Gini impurity or entropy).
The result is a set of human-readable rules — like:
"If age < 30 and income > 50K → likely to churn."

It's intuitive and easy to explain, even to non-technical stakeholders.

✅ Pros vs ❌ Cons
Pros | Cons
Easy to visualize and interpret | Prone to overfitting on noisy data
No need for feature scaling | Can create unstable splits
Captures non-linear relationships | Doesn't generalize well on small data
Works for both numeric and categorical | Can be biased toward dominant features
🧠 When to Use
  • You need a model that's explainable (e.g., in regulated domains)
  • Your data has mixed types (numeric + categorical)
  • You want to prototype quickly and understand feature importance
  • You're okay with less predictive power in favor of interpretability
⚠️ Pitfalls & Hacks
  • Pitfall: Deep trees can memorize the training data — always prune or set max_depth.
  • Hack: Use as a weak learner inside ensembles (like Random Forest or XGBoost) to improve performance.
  • Tip: Use feature importance from trees to guide feature selection for other models.
📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ✅ Very good – rules and splits are easily visualized and explained
Linearity Expectation | ✅ No – naturally captures non-linear relationships
High dimensionality | ⚠️ Moderate – can overfit with too many features unless pruned
Handling of multicollinearity | ✅ Handles it – but may create instability in splits
Handling of categorical features | ⚠️ Partial – numerical encoding needed; some implementations support direct handling
Handling of outliers | ✅ Robust – splits are based on thresholds, not sensitive to extreme values
Handling of missing values | ⚠️ Limited – some implementations handle missing splits, others need imputation
Scaling of features needed | ✅ Not needed – uses raw feature values for splitting
Class Imbalance problem | ❌ Poor – biased toward majority class unless adjusted with class_weight or sampling
Handling of sparseness in data | ⚠️ Depends – not ideal for extremely sparse datasets (e.g., text data)

General comment on accuracy: Prone to overfitting if unpruned; weak alone but powerful as base learners in ensembles.

General comment on training speed: ✅ Fast – quick to train on moderate-sized datasets.
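
A minimal training sketch under the same assumptions as the earlier sections (existing split and helper functions); max_depth=4 and class_weight='balanced' are illustrative choices for the pruning and imbalance adjustments mentioned above, not tuned values.

from sklearn.tree import DecisionTreeClassifier

# Shallow tree with class weighting to counter the simulated imbalance
dt_clf = DecisionTreeClassifier(max_depth=4, class_weight="balanced", random_state=42)
dt_clf.fit(X_train, y_train)

evaluate_model(y_test, dt_clf.predict(X_test), label="Decision Tree")
update_best_model(
    model_name="DecisionTree",
    model_obj=dt_clf,
    y_train=y_train, y_test=y_test,
    y_train_pred=dt_clf.predict(X_train), y_test_pred=dt_clf.predict(X_test),
    hyperparameters={"max_depth": 4, "class_weight": "balanced"}
)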

🌲 Random Forest¶

📖 Click to Expand
🔍 What is a Random Forest?

Random Forest is an ensemble method that builds many decision trees and combines their outputs.
Each tree sees a random subset of the data and features, making the forest diverse and robust.

It works by aggregating the predictions of multiple trees (majority vote for classification), reducing the overfitting risk of a single decision tree.

Think of it as a crowd of weak models working together to make better predictions.

✅ Pros vs ❌ Cons
Pros | Cons
Strong performance out of the box | Less interpretable than a single tree
Handles non-linearities and interactions | Slower for real-time predictions
Resistant to overfitting | May require tuning to perform well
Works well with large feature spaces | Not ideal when interpretability is key
🧠 When to Use
  • You need a reliable general-purpose model with minimal tuning
  • You want to improve stability over a single decision tree
  • Your data is tabular and structured
  • You care more about performance than full interpretability
⚠️ Pitfalls & Hacks
  • Pitfall: May become large and slow — tune n_estimators and max_depth if needed
  • Hack: Use feature_importances_ to find influential variables
  • Tip: Avoid one-hot encoding with high-cardinality features — use label encoding instead
📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ❌ Low – individual trees interpretable, but ensemble is a black box
Linearity Expectation | ✅ No – captures complex non-linear relationships
High dimensionality | ✅ Good – handles many features well via feature bagging
Handling of multicollinearity | ✅ Handles it – less sensitive due to random feature selection per tree
Handling of categorical features | ⚠️ Requires encoding – label or one-hot encoding typically needed
Handling of outliers | ✅ Robust – not sensitive due to median-based splits and ensembling
Handling of missing values | ⚠️ Some support – not native in all implementations; imputation often needed
Scaling of features needed | ✅ Not needed – tree splits are scale-invariant
Class Imbalance problem | ⚠️ Needs adjustment – use class_weight='balanced' or stratified sampling
Handling of sparseness in data | ⚠️ Moderate – not ideal for extreme sparsity (e.g., NLP bag-of-words)

General comment on accuracy: ✅ Strong – robust out-of-the-box performance with low overfitting risk.

General comment on training speed: ⚠️ Slower than single models – parallelizable but can be compute-heavy.
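
A minimal training sketch under the same assumptions as the earlier sections; n_estimators=200 and class_weight='balanced' are illustrative defaults, not tuned values.

from sklearn.ensemble import RandomForestClassifier

# Bagged trees with class weighting for the simulated imbalance
rf_clf = RandomForestClassifier(n_estimators=200, class_weight="balanced",
                                random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)

evaluate_model(y_test, rf_clf.predict(X_test), label="Random Forest")
update_best_model(
    model_name="RandomForest",
    model_obj=rf_clf,
    y_train=y_train, y_test=y_test,
    y_train_pred=rf_clf.predict(X_train), y_test_pred=rf_clf.predict(X_test),
    hyperparameters={"n_estimators": 200, "class_weight": "balanced"}
)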

🎯 KNN (K-Nearest Neighbors)¶

📖 Click to Expand
🔍 What is K-Nearest Neighbors?

KNN is a non-parametric, instance-based learning method.
It doesn't learn a model during training — instead, it stores the data.
At prediction time, it looks at the K most similar observations (neighbors) and assigns the class based on majority vote.

Similarity is usually measured using Euclidean distance (or other distance metrics for different data types).

Example:
"To predict a label for this point, look at its 5 closest data points and choose the most common class."

✅ Pros vs ❌ Cons
Pros | Cons
Simple and intuitive | Slow at prediction time (no training step)
No training required | Struggles with high-dimensional data
Captures local patterns | Requires feature scaling
Flexible distance metrics | Memory-intensive with large datasets
🧠 When to Use
  • You have low-dimensional, clean data
  • You want to prototype quickly with minimal assumptions
  • You care about local behavior rather than global rules
  • Interpretability is less important than flexibility
⚠️ Pitfalls & Hacks
  • Pitfall: Distance metrics break down in high-dimensional space (curse of dimensionality)
  • Hack: Use StandardScaler or MinMaxScaler to normalize features before fitting
  • Tip: Tune k using cross-validation; odd numbers help avoid ties in binary classification
📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ⚠️ Moderate – intuitive idea, but no explicit model or coefficients
Linearity Expectation | ✅ No – captures non-linear patterns based on local neighborhoods
High dimensionality | ❌ Poor – suffers from the curse of dimensionality; distances become less meaningful
Handling of multicollinearity | ❌ Problematic – redundant features distort distance metrics
Handling of categorical features | ❌ Not natively supported – requires careful encoding and distance handling
Handling of outliers | ❌ Sensitive – local distance-based voting easily skewed by outliers
Handling of missing values | ❌ Not supported – requires complete-case or imputation preprocessing
Scaling of features needed | ✅ Yes – essential, as distance calculations are affected by feature magnitudes
Class Imbalance problem | ❌ Poor – biased toward majority class due to majority voting
Handling of sparseness in data | ❌ Weak – sparse vectors make distance metrics ineffective

General comment on accuracy: ⚠️ Highly data-dependent – can perform well with clean, balanced, low-dimensional data.

General comment on training speed: ✅ Fast training, ❌ Slow inference – lazy learner, evaluates at prediction time.
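
Because KNN depends on feature scale, a minimal sketch wraps StandardScaler and KNeighborsClassifier in a Pipeline on the existing split; n_neighbors=5 is simply the sklearn default, not a tuned value.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Scaling happens inside the pipeline, so the test set is transformed with train-set statistics
knn_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=5)),
])
knn_clf.fit(X_train, y_train)

evaluate_model(y_test, knn_clf.predict(X_test), label="KNN")
update_best_model(
    model_name="KNN",
    model_obj=knn_clf,
    y_train=y_train, y_test=y_test,
    y_train_pred=knn_clf.predict(X_train), y_test_pred=knn_clf.predict(X_test),
    hyperparameters={"n_neighbors": 5}
)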

📈 SVM (Support Vector Machines)¶

📖 Click to Expand
🔍 What is SVM?

Support Vector Machines (SVM) are margin-based classifiers that try to find the best boundary (hyperplane) that separates classes.
SVM focuses on support vectors — the critical data points closest to the boundary — to maximize the margin between classes.

It can handle non-linear patterns using kernel tricks (e.g., RBF kernel), making it flexible for complex data.

Think of it as drawing the widest possible gap between two classes while avoiding overlap.

✅ Pros vs ❌ Cons
Pros | Cons
Works well in high-dimensional spaces | Slow on large datasets
Effective for non-linear boundaries | Requires careful parameter tuning
Robust to overfitting (with regularization) | Not intuitive to interpret
Supports different kernels | Doesn't scale well with noisy data
🧠 When to Use
  • Your data is high-dimensional, but you want a non-linear model
  • You need a strong classifier and have time to tune hyperparameters
  • Dataset is moderate in size and reasonably clean
  • You care about maximizing margin of separation
⚠️ Pitfalls & Hacks
  • Pitfall: Doesn't output probabilities by default — use probability=True in SVC if needed
  • Hack: Use RBF kernel as a good starting point for non-linear problems
  • Tip: Always standardize features — SVM is sensitive to feature scale
📖 Click to Expand - Suitability Checklist
Criterion | Comment
Interpretability | ⚠️ Moderate – support vectors and margins can be visualized in 2D, but overall less intuitive
Linearity Expectation | ⚠️ Depends – linear SVM assumes linear separability; kernel SVM handles non-linear
High dimensionality | ✅ Very good – especially effective in high-dimensional spaces (e.g., text data)
Handling of multicollinearity | ⚠️ Can be sensitive – especially in linear SVM; use regularization
Handling of categorical features | ❌ Not supported – requires one-hot or other encoding
Handling of outliers | ❌ Sensitive – margin-based optimization gets distorted by outliers
Handling of missing values | ❌ Not supported – must impute before training
Scaling of features needed | ✅ Yes – essential due to reliance on distance and dot products
Class Imbalance problem | ⚠️ Needs adjustment – use class_weight='balanced' or tune C and margins
Handling of sparseness in data | ✅ Good – works well in high-dimensional sparse spaces (esp. linear SVM)

General comment on accuracy: ✅ High – strong performance on well-separated data, especially with good kernel choice.

General comment on training speed: ❌ Slow – especially on large datasets or with complex kernels.
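
A minimal sketch following the tips above: standardize the features and fit an RBF-kernel SVC with probability=True so the notebook's predict_proba-based helpers keep working; C and gamma are left at sklearn defaults rather than tuned.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# probability=True is required for predict_proba (used by the ROC and tracking helpers)
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42)),
])
svm_clf.fit(X_train, y_train)

evaluate_model(y_test, svm_clf.predict(X_test), label="SVM (RBF)")
update_best_model(
    model_name="SVM_RBF",
    model_obj=svm_clf,
    y_train=y_train, y_test=y_test,
    y_train_pred=svm_clf.predict(X_train), y_test_pred=svm_clf.predict(X_test),
    hyperparameters={"kernel": "rbf", "class_weight": "balanced", "probability": True}
)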

🚀 XGBoost¶

📖 Click to Expand
🔍 What is XGBoost?

XGBoost (Extreme Gradient Boosting) is a powerful boosted tree ensemble method.
Unlike Random Forest (which builds trees in parallel), XGBoost builds trees sequentially — each new tree tries to fix the errors of the previous one.

It uses gradient descent to minimize loss, with regularization to prevent overfitting.
XGBoost is known for its speed, accuracy, and efficiency, making it a go-to model in many Kaggle competitions and production systems.

✅ Pros vs ❌ Cons

| Pros | Cons |
|---|---|
| High predictive accuracy | Harder to interpret |
| Built-in regularization (less overfitting) | More complex than basic tree models |
| Fast and scalable | Requires tuning for best performance |
| Handles missing data automatically | May overfit small/noisy datasets |

🧠 When to Use
  • You need top-tier performance on structured/tabular data
  • You're working with noisy or complex relationships
  • You're okay with a black-box model in exchange for results
  • You want built-in tools for feature importance, early stopping, etc.
⚠️ Pitfalls & Hacks
  • Pitfall: Easy to overfit if n_estimators is too high — always monitor with validation
  • Hack: Use early_stopping_rounds during training to auto-pick the optimal iteration
  • Tip: Start with basic settings and use GridSearchCV or Optuna for tuning
📖 Click to Expand - Suitability Checklist

| Criterion | Comment |
|---|---|
| Interpretability | ❌ Low – complex ensemble; partial plots or SHAP values needed for insight |
| Linearity Expectation | ✅ No – naturally models complex non-linear relationships |
| High dimensionality | ✅ Excellent – handles many features via regularization and feature importance |
| Handling of multicollinearity | ✅ Handles well – trees split on the most useful among correlated features |
| Handling of categorical features | ⚠️ Needs encoding – label encoding typically used; native support improving |
| Handling of outliers | ✅ Robust – trees are insensitive to extreme values |
| Handling of missing values | ✅ Built-in – learns optimal path for missing values during tree construction |
| Scaling of features needed | ✅ Not needed – tree-based, scale-invariant |
| Class Imbalance problem | ✅ Handled – use scale_pos_weight, custom loss, or sampling |
| Handling of sparseness in data | ✅ Excellent – designed to handle sparse matrices natively |

General comment on accuracy: ✅ Top-tier – one of the most accurate out-of-the-box models for tabular data.

General comment on training speed: ⚠️ Slower – faster than many ensembles, but heavier than single models; GPU support helps.
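
The early-stopping hack above, as a minimal sketch (assuming the X_train / y_train split from earlier; the validation slice and parameter values are illustrative). Note that depending on your xgboost version, early_stopping_rounds is a constructor argument (recent releases) or a fit() argument (older ones).

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Hold out a validation slice from the training data so early stopping has something to monitor.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

xgb_clf = XGBClassifier(
    n_estimators=1000,           # generous upper bound; early stopping picks the effective count
    learning_rate=0.05,
    max_depth=4,
    eval_metric="logloss",
    early_stopping_rounds=30,    # stop once validation logloss hasn't improved for 30 rounds
)
xgb_clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print("Best iteration:", xgb_clf.best_iteration)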

🧠 Neural Network¶

📖 Click to Expand
🔍 What is a Neural Network?

A Neural Network is a layered structure of interconnected "neurons" inspired by the human brain.
Each neuron applies a weighted transformation followed by a non-linear activation, allowing the model to learn complex, non-linear patterns in the data.

Even a basic feedforward neural network (also called Multi-Layer Perceptron or MLP) can approximate intricate decision boundaries — making it powerful but harder to interpret.

Think of it as a flexible function builder that learns patterns layer by layer.

✅ Pros vs ❌ Cons

| Pros | Cons |
|---|---|
| Can model complex, non-linear relationships | Requires lots of data and tuning |
| Works well on both tabular and image/text data | Not interpretable out of the box |
| Scales with data and compute | Can overfit if not regularized |
| Highly customizable architectures | Slower to train, harder to debug |

🧠 When to Use
  • You have enough data and want to model complex interactions
  • You're comfortable with longer training and tuning
  • You care more about predictive power than explainability
  • You're building pipelines that could benefit from deep learning extensions later
⚠️ Pitfalls & Hacks
  • Pitfall: Prone to overfitting — always use dropout, regularization, or early stopping
  • Hack: Use a simple architecture (1–2 hidden layers) for structured/tabular data
  • Tip: Standardize inputs and tune the learning rate; training can otherwise stall or explode
📖 Click to Expand - Suitability Checklist

| Criterion | Comment |
|---|---|
| Interpretability | ❌ Very low – acts as a black box unless aided by techniques like SHAP, LIME |
| Linearity Expectation | ✅ No – inherently models non-linear and complex interactions |
| High dimensionality | ✅ Excellent – scales well with many features and large data |
| Handling of multicollinearity | ✅ Handles – internal weights adjust during training, but correlated inputs may slow convergence |
| Handling of categorical features | ❌ Not native – requires one-hot encoding or embeddings |
| Handling of outliers | ❌ Sensitive – can destabilize training; often mitigated with preprocessing |
| Handling of missing values | ❌ Not supported – must be imputed before training |
| Scaling of features needed | ✅ Yes – critical for stable and fast convergence (e.g., standardization or normalization) |
| Class Imbalance problem | ⚠️ Needs care – custom loss functions, class weights, or resampling required |
| Handling of sparseness in data | ⚠️ Depends – not ideal unless using sparse-aware architectures or embedding layers |

General comment on accuracy: ✅ High – can outperform other models with enough data and tuning, especially on non-tabular data.

General comment on training speed: ❌ Slow – resource-intensive; requires tuning and hardware for best performance.
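
A minimal sketch of the "keep it small and standardized" advice above (assuming the X_train / X_test split from earlier; the layer size and regularization strength are illustrative):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# One small hidden layer is usually enough for tabular data; scaling keeps training stable.
# early_stopping=True holds out part of the training data and stops when the validation score plateaus.
mlp_pipe = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(64,), alpha=1e-3, learning_rate_init=1e-3,
                  early_stopping=True, max_iter=500, random_state=42)
)
mlp_pipe.fit(X_train, y_train)
print("Neural Network test accuracy:", mlp_pipe.score(X_test, y_test))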

Back to the top


📊 Model Selection¶

📖 Click to Expand
🧠 Model Selection Table (Based on Data Characteristics)

| Target Type | Linearly Separable | Correlation | Imbalance | Recommended Models | Notes |
|---|---|---|---|---|---|
| Binary | ✅ True | Low | ✅ True | XGBoost > Random Forest | Use tree-based models with class weights or resampling. |
| Binary | ✅ True | Low | ❌ False | Logistic Regression > SVM | Start with simple linear models. Use as benchmark. |
| Binary | ✅ True | High | ✅ True | XGBoost > Random Forest | Use tree-based models with class weights or resampling. |
| Binary | ✅ True | High | ❌ False | Logistic Regression > SVM | Start with simple linear models. Use as benchmark. |
| Binary | ❌ False | Low | ✅ True | XGBoost > Random Forest | Boosting or RF with class weights to handle imbalance + complexity. |
| Binary | ❌ False | Low | ❌ False | Random Forest > Decision Tree | Simple non-linear trees likely sufficient. Avoid tuning-heavy models. |
| Binary | ❌ False | High | ✅ True | XGBoost > Random Forest | Boosting or RF with class weights to handle imbalance + complexity. |
| Binary | ❌ False | High | ❌ False | Random Forest > Decision Tree | Simple non-linear trees likely sufficient. Avoid tuning-heavy models. |
| Multiclass | ✅ True | Low | ✅ True | XGBoost > Logistic Regression | Use OvR strategy with LR/XGB. Watch for class separation. |
| Multiclass | ✅ True | Low | ❌ False | XGBoost > Logistic Regression | Use OvR strategy with LR/XGB. Watch for class separation. |
| Multiclass | ✅ True | High | ✅ True | XGBoost > Logistic Regression | Use OvR strategy with LR/XGB. Watch for class separation. |
| Multiclass | ✅ True | High | ❌ False | XGBoost > Logistic Regression | Use OvR strategy with LR/XGB. Watch for class separation. |
| Multiclass | ❌ False | Low | ✅ True | Neural Network > KNN | Use Neural Net or KNN. Prioritize decision boundary complexity. |
| Multiclass | ❌ False | Low | ❌ False | Neural Network > KNN | Use Neural Net or KNN. Prioritize decision boundary complexity. |
| Multiclass | ❌ False | High | ✅ True | XGBoost > Random Forest | Tree-based models preferred. Skip preprocessing of collinear features. |
| Multiclass | ❌ False | High | ❌ False | XGBoost > Random Forest | Tree-based models preferred. Skip preprocessing of collinear features. |
📖 Click to Expand 🧠 **Model Selection Flowchart (Based on Data Characteristics)**
🎯 Target Type (Binary / Multiclass) and 📈 Linearly Separable? (Yes / No) branch first; each branch then applies the same checks and recommendations:
  ├── 🧬 Feature Type = Categorical? ✅ → 🌲 Random Forest > 🚀 XGBoost
  ├── 📉 Correlation = High? ✅ → 🚀 XGBoost > 🌲 Random Forest
  ├── ⚠️ Missing Data? ✅ → 🚀 XGBoost > 🧠 Neural Network
  ├── ⚠️ Outliers Present?
  │     ├── ✅ and ⚖️ Imbalanced → 🚀 XGBoost > 🌲 Random Forest
  │     └── ✅ but balanced → 🌲 Random Forest > 🌳 Decision Tree
  └── ❌ None of the above → 🌲 Random Forest > 🌳 Decision Tree
📖 Click to Expand
Start
  │
  ├── 🎯 Target Type (Binary / Multiclass) → 📈 Linearly Separable? (Yes / No) → each branch then applies the same checks:
  │     ├── 🧬 Feature Type = Categorical? ✅ → 🌲 Random Forest / 🚀 XGBoost
  │     ├── 📉 Correlation = High? ✅ → ❌ Avoid NB / LR
  │     ├── ⚠️ Missing Data? ✅ → 🚀 XGBoost / CatBoost
  │     ├── ⚠️ Outliers? ✅ → 🚀 XGBoost / 🌲 Random Forest
  │     └── ❌ Clean numeric data (none of the above):
  │           ├── Linearly separable → 🤖 Logistic Regression (OvR for multiclass) / 🧭 SVM
  │           └── Not linearly separable → 🧠 Neural Network / 🎯 KNN
  └── 🧪 Evaluate Top 3 Recommended Models

🧠 Recommend Models¶

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

model_registry = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(probability=True),  # needed for ROC AUC
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "Neural Network": MLPClassifier(max_iter=1000)
}
In [51]:
def recommend_models(data_characteristics, verbose=True):
    """
    Scores and ranks models based on data characteristics.
    Prints the recommended order and rationales.
    """
    from termcolor import colored

    score = {}
    rationale = {}
    
    # Extract characteristics
    target_info = data_characteristics.get("target_variable", {})
    feature_info = data_characteristics.get("features", {})
    is_linear = data_characteristics.get("linear_separability", False)

    imbalance = target_info.get("imbalance", False)
    imbalance_severity = target_info.get("class_imbalance_severity", "")
    feature_type = feature_info.get("type", "")
    corr = feature_info.get("correlation", "")
    outliers = feature_info.get("outliers", False)

    # --- Logistic Regression ---
    score["Logistic Regression"] = 2
    rationale["Logistic Regression"] = ["Good for linearly separable, numeric data"]
    if is_linear:
        score["Logistic Regression"] += 3
        rationale["Logistic Regression"].append("Linear separability is True")
    if feature_type == "continuous":
        score["Logistic Regression"] += 1
        rationale["Logistic Regression"].append("Features are continuous")
    if outliers:
        score["Logistic Regression"] -= 1
        rationale["Logistic Regression"].append("Sensitive to outliers")

    # --- Naive Bayes ---
    score["Naive Bayes"] = 1
    rationale["Naive Bayes"] = ["Good for categorical, independent features"]
    if feature_type == "categorical":
        score["Naive Bayes"] += 2
        rationale["Naive Bayes"].append("Features are categorical")
    if corr == "low":
        score["Naive Bayes"] += 1
        rationale["Naive Bayes"].append("Feature correlation is low")
    if corr == "high":
        score["Naive Bayes"] -= 2
        rationale["Naive Bayes"].append("Correlation is high โ†’ violates independence")

    # --- Decision Tree ---
    score["Decision Tree"] = 2
    rationale["Decision Tree"] = ["Fast, flexible, handles most data types"]
    if corr == "high":
        score["Decision Tree"] += 1
        rationale["Decision Tree"].append("Can exploit feature redundancy")
    if outliers:
        score["Decision Tree"] += 1
        rationale["Decision Tree"].append("Robust to outliers")

    # --- Random Forest ---
    score["Random Forest"] = 3
    rationale["Random Forest"] = ["Strong baseline, handles imbalance + outliers"]
    if outliers:
        score["Random Forest"] += 1
        rationale["Random Forest"].append("Handles outliers well")
    if imbalance:
        score["Random Forest"] += 1
        rationale["Random Forest"].append("Bootstrap helps with imbalance")

    # --- KNN ---
    score["KNN"] = 1
    rationale["KNN"] = ["Simple, distance-based"]
    if feature_type == "continuous":
        score["KNN"] += 1
        rationale["KNN"].append("Distance-based โ†’ works better on continuous features")
    if outliers:
        score["KNN"] -= 2
        rationale["KNN"].append("Very sensitive to outliers")
    if imbalance:
        score["KNN"] -= 1
        rationale["KNN"].append("Imbalance skews neighbors")

    # --- SVM ---
    score["SVM"] = 2
    rationale["SVM"] = ["Margin-based model"]
    if is_linear:
        score["SVM"] += 2
        rationale["SVM"].append("Linear separability is True")
    if imbalance:
        score["SVM"] -= 1
        rationale["SVM"].append("Needs tuning to handle imbalance")
    if feature_type == "continuous":
        score["SVM"] += 1
        rationale["SVM"].append("Requires numeric features")

    # --- Neural Network ---
    score["Neural Network"] = 2
    rationale["Neural Network"] = ["Flexible but sensitive"]
    if imbalance_severity == "high":
        score["Neural Network"] += 1
        rationale["Neural Network"].append("Can learn from imbalance if tuned")
    if outliers:
        score["Neural Network"] -= 1
        rationale["Neural Network"].append("Can be unstable with outliers")

    # --- XGBoost ---
    score["XGBoost"] = 4
    rationale["XGBoost"] = ["Strong general-purpose model"]
    if outliers:
        score["XGBoost"] += 1
        rationale["XGBoost"].append("Robust to outliers")
    if imbalance:
        score["XGBoost"] += 1
        rationale["XGBoost"].append("scale_pos_weight helps with imbalance")
    if corr == "high":
        score["XGBoost"] += 1
        rationale["XGBoost"].append("Handles redundant features well")

    # Sort by descending score
    ranked_models = sorted(score.items(), key=lambda x: x[1], reverse=True)
    ranked_model_names = [model for model, _ in ranked_models]

    # Filter and reorder model_registry
    ranked_registry = {name: model_registry[name] for name in ranked_model_names if name in model_registry}

    if verbose:
        print("๐Ÿง  Recommended Model Evaluation Order:\n")
        for i, name in enumerate(ranked_model_names, 1):
            if name in model_registry:
                prefix = colored(f"{i}. {name} (Score: {score[name]})", "green") if i <= 3 else f"{i}. {name} (Score: {score[name]})"
                print(prefix)
                for reason in rationale[name]:
                    print(f"   โ†ช {reason}")
        print()

    return ranked_model_names, ranked_registry
In [52]:
_, model_registry = recommend_models(data_characteristics)

# model_registry
๐Ÿง  Recommended Model Evaluation Order:

1. XGBoost (Score: 6)
   โ†ช Strong general-purpose model
   โ†ช Robust to outliers
   โ†ช scale_pos_weight helps with imbalance
2. Random Forest (Score: 5)
   โ†ช Strong baseline, handles imbalance + outliers
   โ†ช Handles outliers well
   โ†ช Bootstrap helps with imbalance
3. Decision Tree (Score: 3)
   โ†ช Fast, flexible, handles most data types
   โ†ช Robust to outliers
4. Logistic Regression (Score: 2)
   โ†ช Good for linearly separable, numeric data
   โ†ช Features are continuous
   โ†ช Sensitive to outliers
5. Naive Bayes (Score: 2)
   โ†ช Good for categorical, independent features
   โ†ช Feature correlation is low
6. SVM (Score: 2)
   โ†ช Margin-based model
   โ†ช Needs tuning to handle imbalance
   โ†ช Requires numeric features
7. Neural Network (Score: 1)
   โ†ช Flexible but sensitive
   โ†ช Can be unstable with outliers
8. KNN (Score: -1)
   โ†ช Simple, distance-based
   โ†ช Distance-based โ†’ works better on continuous features
   โ†ช Very sensitive to outliers
   โ†ช Imbalance skews neighbors

📈 Model Comparison¶

In [53]:
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, roc_auc_score, confusion_matrix, log_loss
)

top_k = 3
for name in list(model_registry.keys())[:top_k]:
    # We evaluate only the top 3 recommended models (ranked earlier) for focused comparison.
    print(f"\n๐Ÿ”ง Training: {name}")

    # Fit and predict
    model = model_registry[name]
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluation summary
    evaluate_model(y_test, y_test_pred, label=name)
    plot_confusion(y_test, y_test_pred, model_name=name)
    plot_roc_auc(model, X_test, y_test, model_name=name)

    # Track and log best model
    update_best_model(
        model_name=name,
        model_obj=model,
        y_train=y_train,
        y_test=y_test,
        y_train_pred=y_train_pred,
        y_test_pred=y_test_pred
    )

    print("โ€”" * 80)  # horizontal line
๐Ÿ”ง Training: XGBoost

๐Ÿ“Š XGBoost โ€” Performance Summary:
- Accuracy  :  93.00% โ†’ Overall correctness.
- Precision :  89.66% โ†’ Of predicted '1', how many were right.
- Recall    :  86.67% โ†’ Of actual '1', how many we caught.
- F1 Score  :  88.14% โ†’ Balance of precision & recall.

๐Ÿ“Œ Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.88
๐Ÿ”น ROC AUC Score for XGBoost: 0.9554
๐Ÿ“Œ Interpretation: Model is doing a good job distinguishing between classes.
โœ… XGBoost just beat previous best (DummyClassifier) โ†’ f1: 0.0000 โ†’ 0.8814
โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”

๐Ÿ”ง Training: Random Forest

๐Ÿ“Š Random Forest โ€” Performance Summary:
- Accuracy  :  93.00% โ†’ Overall correctness.
- Precision :  92.59% โ†’ Of predicted '1', how many were right.
- Recall    :  83.33% โ†’ Of actual '1', how many we caught.
- F1 Score  :  87.72% โ†’ Balance of precision & recall.

๐Ÿ“Œ Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.88
๐Ÿ”น ROC AUC Score for Random Forest: 0.9605
๐Ÿ“Œ Interpretation: Model is doing a good job distinguishing between classes.
โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”

๐Ÿ”ง Training: Decision Tree

๐Ÿ“Š Decision Tree โ€” Performance Summary:
- Accuracy  :  86.00% โ†’ Overall correctness.
- Precision :  70.51% โ†’ Of predicted '1', how many were right.
- Recall    :  91.67% โ†’ Of actual '1', how many we caught.
- F1 Score  :  79.71% โ†’ Balance of precision & recall.

๐Ÿ“Œ Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.80
๐Ÿ”น ROC AUC Score for Decision Tree: 0.8762
๐Ÿ“Œ Interpretation: Model is doing a good job distinguishing between classes.
โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
In [54]:
# from pprint import pprint
# pprint(best_model_info)
# pprint(model_results)
In [55]:
# Print current best model based on success_metric
print(f"\n🏆 Best model so far: {best_model_info['name']} "
      f"({success_metric.upper()} = {best_model_info['metrics']['test'][success_metric]:.4f})")

print(f"\n📊 Model Ranking by {success_metric.upper()}:\n")
ranked = sorted(
    model_results.items(),
    key=lambda x: x[1]["metrics"]["test"][success_metric],
    reverse=True
)

for i, (name, result) in enumerate(ranked, 1):
    score = result["metrics"]["test"][success_metric]
    print(f"{i}. {name:<20} {success_metric}: {score:.4f}")
๐Ÿ† Best model so far: XGBoost (F1 = 0.8814)

๐Ÿ“Š Model Ranking by F1:

1. XGBoost              f1: 0.8814
2. Random Forest        f1: 0.8772
3. Decision Tree        f1: 0.7971
4. DummyClassifier      f1: 0.0000
In [56]:
import plotly.graph_objects as go
import plotly.subplots as sp
import pandas as pd

# Extract test metrics
df_results = pd.DataFrame({
    model_name: data["metrics"]["test"]
    for model_name, data in model_results.items()
}).T

# Original metrics you'd like to plot
desired_metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'specificity']

# Filter only those that exist in df_results
metrics = [m for m in desired_metrics if m in df_results.columns]

# Create subplot layout
rows = (len(metrics) + 1) // 2
fig = sp.make_subplots(rows=rows, cols=2, subplot_titles=[m.upper() for m in metrics])

# Plot each available metric
for i, metric in enumerate(metrics):
    row, col = divmod(i, 2)
    fig.add_trace(
        go.Bar(
            x=df_results.index,
            y=df_results[metric],
            name=metric,
            text=pd.to_numeric(df_results[metric], errors="coerce").round(3),
            textposition="auto"
        ),
        row=row+1, col=col+1
    )

fig.update_layout(
    height=300 * rows,
    width=1000,
    title_text="Model Comparison by Metric",
    showlegend=False
)

fig.show()

📊 Feature Importance¶

📖 Click to Expand

Feature importance tells us which variables the model relied on most to make predictions.
It's like asking, "What factors influenced the decision the most?"

In tree-based models like Random Forest or XGBoost, it's calculated based on how often and how effectively a feature was used to split the data.

This is useful for:

  • Understanding the model's decision logic
  • Identifying key business drivers
  • Eliminating irrelevant features
In [57]:
# best_model_info
In [58]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importance(model=None, feature_names=None, top_n=10, model_name=None):
    """
    Plots top N feature importances.
    Defaults to best_model_info['model'] unless overridden.
    Optionally takes a model_name for the plot title.
    """
    if model is None:
        model = best_model_info["model"]
        model_name = best_model_info.get("name", "Best Model") if model_name is None else model_name
    else:
        model_name = model_name or "Selected Model"

    if feature_names is None:
        feature_names = X_train.columns

    if not hasattr(model, "feature_importances_"):
        raise ValueError("Model does not support feature_importances_")

    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": model.feature_importances_
    }).sort_values(by="Importance", ascending=False).head(top_n)

    plt.figure(figsize=(8, 5))
    plt.barh(importance_df["Feature"][::-1], importance_df["Importance"][::-1])
    for i, (feature, importance) in enumerate(zip(importance_df["Feature"][::-1], importance_df["Importance"][::-1])):
        plt.text(importance + 0.005, i, f"{importance:.3f}", va='center')
    plt.title(f"Top Feature Importances ({model_name})")
    plt.xlabel("Importance Score")
    plt.tight_layout()
    plt.show()

    return list(importance_df["Feature"])

# โœ… Default: plot for best model
imp_ranked = plot_feature_importance()

# ๐Ÿ› ๏ธ Optional: override model + title
# alt_model = model_results["Random Forest"]["model"]
# imp_ranked = plot_feature_importance(model=alt_model, model_name="Random Forest")

🧬 SHAP Values¶

📖 Click to Expand

SHAP (SHapley Additive exPlanations) values explain how much each feature contributed to a specific prediction — positively or negatively.

It's like breaking down a credit score:
"Age added +12 points, income removed -5 points…"

SHAP is model-agnostic and gives local explanations (for individual predictions) and global insights (feature impact across all predictions).

Useful for:

  • Auditing high-stakes predictions
  • Building trust with stakeholders
  • Diagnosing model behavior case-by-case
In [59]:
import shap

def plot_shap_summary_tree(model=None, X=None, model_name=None):
    """
    Plot SHAP summary for tree-based models (RandomForest, XGBoost).
    Defaults to best_model_info['model'] and X_test.
    """
    if model is None:
        model = best_model_info["model"]
        model_name = model_name or best_model_info.get("name", "Best Model")
    else:
        model_name = model_name or "Selected Model"

    if X is None:
        X = X_test

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    # For binary classification, use shap_values[1]
    if isinstance(shap_values, list) and len(shap_values) == 2:
        shap_values = shap_values[1]

    shap.summary_plot(shap_values, X)

    print(f"\n📌 SHAP Summary for {model_name}:")
    print("- Each bar shows how much that feature influences the model's decision.")
    print("- Features at the top are the most impactful across all predictions.")
    print("- Blue/red indicate direction: does the feature push prediction up or down?")
    print("- Helps us understand *why* the model is confident — not just *what* it predicts.")

    shap_df = pd.DataFrame(np.abs(shap_values), columns=X.columns).mean().sort_values(ascending=False)
    return list(shap_df.index)

# โœ… Default: SHAP for best model
shap_ranked = plot_shap_summary_tree()

# ๐Ÿ› ๏ธ Optional: SHAP for any other model
# alt_model = model_results["Random Forest"]["model"]
# shap_ranked = plot_shap_summary_tree(model=alt_model, model_name="Random Forest")
[23:15:09] WARNING: /Users/runner/work/xgboost/xgboost/src/c_api/c_api.cc:1240: Saving into deprecated binary model format, please consider using `json` or `ubj`. Model format will default to JSON in XGBoost 2.2 if not specified.
📌 SHAP Summary for XGBoost:
- Each bar shows how much that feature influences the model's decision.
- Features at the top are the most impactful across all predictions.
- Blue/red indicate direction: does the feature push prediction up or down?
- Helps us understand *why* the model is confident — not just *what* it predicts.
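
The summary above is a global view. Because SHAP also gives local explanations, the same TreeExplainer output can be sliced row by row to see which features pushed one specific prediction up or down. A minimal sketch (assuming best_model_info and X_test from earlier; row 0 is an arbitrary example):

import numpy as np
import pandas as pd
import shap

explainer = shap.TreeExplainer(best_model_info["model"])
shap_values = explainer.shap_values(X_test)
if isinstance(shap_values, list) and len(shap_values) == 2:
    shap_values = shap_values[1]  # binary case: contributions toward class 1

i = 0  # arbitrary row to explain
local = pd.Series(shap_values[i], index=X_test.columns, name="shap_value")
print(f"Top drivers for row {i} (positive values push toward class 1):")
print(local.sort_values(key=np.abs, ascending=False).head(5))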

Back to the top


🛠️ Fine-Tune¶

📖 Click to Expand
  • Fine-tuning helps unlock the model's full potential by finding better hyperparameter values.
  • It improves accuracy, recall, and other metrics without changing the model type.
  • We typically tune the best-performing model from the baseline round (XGBoost in our case).
  • Two common methods: Grid Search (exhaustive) and Randomized Search (faster, approximate).

🧪 Feature Selection – RFE¶

In [60]:
if False:
    from sklearn.feature_selection import RFE

    # Use full X_train
    X_full = X_train.copy()
    model = best_model_info["model"]

    # Choose how many features to keep (optional: all, top 50%, or fixed)
    n_to_select = max(1, X_full.shape[1] // 2)  # or change to any value

    # Run RFE
    selector = RFE(estimator=model, n_features_to_select=n_to_select, step=1)
    selector.fit(X_full, y_train)

    # Final selected features
    selected_features = list(X_full.columns[selector.support_])
    print(f"โœ… RFE selected features (no filtering): {selected_features}")

🧪 Feature Selection – RFE + SHAP¶

In [61]:
from sklearn.base import clone
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
import shap
import numpy as np

def shap_guided_backward_elimination(
    model, X, y,
    shap_ranked=None,
    metric_name=None,
    drop_threshold=0.005,
    min_features=1,
    verbose=True
):
    """
    SHAP-guided backward elimination with early stopping.
    Drops features by SHAP rank until performance drops significantly or hits min_features.
    """
    model_base = clone(model)
    X_curr = X.copy()

    # Determine metric
    metric_name = metric_name or success_metric
    metric_func = {
        "f1": f1_score,
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "roc_auc": roc_auc_score
    }.get(metric_name)

    if metric_func is None:
        raise ValueError(f"Unsupported metric: {metric_name}")

    # Get SHAP-ranked features if not provided
    if shap_ranked is None:
        explainer = shap.TreeExplainer(model_base.fit(X_curr, y))
        shap_values = explainer.shap_values(X_curr)
        if isinstance(shap_values, list) and len(shap_values) == 2:
            shap_values = shap_values[1]
        shap_importance = np.abs(shap_values).mean(axis=0)
        shap_ranked = X_curr.columns[np.argsort(shap_importance)[::-1]].tolist()
    else:
        shap_ranked = shap_ranked.copy()

    # Initialize tracking
    score_history = []
    previous_score = None

    while len(shap_ranked) >= min_features:
        model = clone(model_base)
        model.fit(X_curr[shap_ranked], y)
        y_pred = model.predict(X_curr[shap_ranked])
        # zero_division only applies to precision/recall/f1; accuracy and roc_auc don't accept it
        if metric_name in ("f1", "precision", "recall"):
            score = metric_func(y, y_pred, zero_division=0)
        else:
            score = metric_func(y, y_pred)
        score_history.append((len(shap_ranked), score, shap_ranked.copy()))

        if verbose:
            feat_list = ", ".join(shap_ranked)
            print(f"✅ {len(shap_ranked)} features → {metric_name}: {score:.4f} → [{feat_list}]")

        # Early stop if score drops significantly
        if previous_score is not None and (previous_score - score) > drop_threshold:
            if verbose:
                print(f"🛑 Stopping early: {metric_name} dropped from {previous_score:.4f} to {score:.4f}")
            break

        previous_score = score
        shap_ranked.pop()  # Drop lowest-ranked SHAP feature

    if not score_history:
        raise ValueError("No elimination steps executed โ€” shap_ranked too short or invalid inputs.")

    # Best configuration
    tolerance = 0.01  # Accept within 1% drop of best score
    best_score = max(score_history, key=lambda x: x[1])[1]
    # Keep all configs that are within tolerance
    candidates = [cfg for cfg in score_history if (best_score - cfg[1]) <= tolerance]
    # Pick one with the fewest features
    best_config = min(candidates, key=lambda x: x[0])
    print(f"\n🎯 Best config: {len(best_config[2])} features → {metric_name}: {best_config[1]:.4f}")
    return best_config[2], score_history
In [62]:
final_features, history = shap_guided_backward_elimination(
    model=best_model_info["model"],
    X=X_train,
    y=y_train,
    shap_ranked=shap_ranked
)
final_features

X_train_full = X_train.copy() # retaining copies for future reference
X_test_full = X_test.copy() # retaining copies for future reference
X_train = X_train[final_features]
X_test  = X_test[final_features]
✅ 10 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5, Feature_8, Feature_10, Feature_1, Feature_3, Feature_4]
✅ 9 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5, Feature_8, Feature_10, Feature_1, Feature_3]
✅ 8 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5, Feature_8, Feature_10, Feature_1]
✅ 7 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5, Feature_8, Feature_10]
✅ 6 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5, Feature_8]
✅ 5 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2, Feature_5]
✅ 4 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6, Feature_2]
✅ 3 features → f1: 1.0000 → [Feature_7, Feature_9, Feature_6]
✅ 2 features → f1: 0.9896 → [Feature_7, Feature_9]
🛑 Stopping early: f1 dropped from 1.0000 to 0.9896

🎯 Best config: 3 features → f1: 1.0000

🔎 Grid Search¶

📖 Click to Expand
🔍 What is Grid Search?

Grid Search tests all possible combinations of hyperparameters across a fixed grid.
It's exhaustive, simple, and works best when the number of hyperparameters is small.

  • Pros: Comprehensive, easy to understand
  • Cons: Very slow when the search space is large
In [63]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# ๐Ÿ”ง Complete and default-aware param grid
param_grids = {
    "RandomForestClassifier": {
        "n_estimators": [100, 200],              # default: 100
        "max_depth": [None, 5, 10],              # default: None
        "min_samples_split": [2, 5],             # default: 2
        "min_samples_leaf": [1, 2],              # default: 1
        "max_features": ["sqrt", "log2"]         # default: "sqrt"
    },
    "DecisionTreeClassifier": {
        "max_depth": [None, 5, 10],              # default: None
        "min_samples_split": [2, 5],             # default: 2
        "min_samples_leaf": [1, 2],              # default: 1
        "criterion": ["gini", "entropy"]         # default: "gini"
    },
    "GaussianNB": {
        # Note: Naive Bayes (GaussianNB) has limited tunable parameters โ€” only var_smoothing is exposed
        "var_smoothing": [1e-9, 1e-8, 1e-7]      # default: 1e-9
    },
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10],                 # default: 1
        "penalty": ["l2"],                       # default: "l2"
        "solver": ["lbfgs"],                     # default: "lbfgs"
        "max_iter": [100, 500]                   # default: 100
    },
    "SVC": {
        "C": [0.1, 1, 10],                       # default: 1
        "kernel": ["linear", "rbf"],             # default: "rbf"
        "gamma": ["scale", "auto"],              # default: "scale"
        "probability": [True]                    # default: False (forced True for AUC)
    },
    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7],                # default: 5
        "weights": ["uniform", "distance"],      # default: "uniform"
        "metric": ["euclidean", "manhattan", "minkowski"]  # default: "minkowski"
    },
    "MLPClassifier": {
        "hidden_layer_sizes": [(50,), (100,)],  # default: (100,)
        "activation": ["relu", "tanh"],          # default: "relu"
        "alpha": [0.0001, 0.001],                # default: 0.0001
        "learning_rate": ["constant", "adaptive"],  # default: "constant"
        "max_iter": [200, 500]                   # default: 200
    },
    "XGBClassifier": {
        "n_estimators": [100, 200],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.1],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "scale_pos_weight": [1, 2]  # useful for class imbalance
    }
}
In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    roc_auc_score, confusion_matrix, log_loss
)

# โš™๏ธ Resolve model name and corresponding grid
model_name = best_model_info["model"].__class__.__name__  # โœ… fixed here
param_grid = param_grids.get(model_name)

if param_grid is None:
    raise ValueError(f"No param grid defined for model: {model_name}")

print(f"\n๐Ÿ”ง Running Grid Search for: {model_name}")

# ๐Ÿงช Run Grid Search
model_instance = best_model_info["model"].__class__()

grid_search = GridSearchCV(
    estimator=model_instance,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_tuned_model = grid_search.best_estimator_

print("โœ… Best Parameters Found:")
print(grid_search.best_params_)

# ๐Ÿ“ˆ Evaluate tuned model
y_test_pred = best_tuned_model.predict(X_test)

if hasattr(best_tuned_model, "predict_proba"):
    y_scores = best_tuned_model.predict_proba(X_test)[:, 1]
elif hasattr(best_tuned_model, "decision_function"):
    y_scores = best_tuned_model.decision_function(X_test)
else:
    y_scores = y_test_pred

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# Metrics
precision = precision_score(y_test, y_test_pred, zero_division=0)
recall = recall_score(y_test, y_test_pred, zero_division=0)
f1 = f1_score(y_test, y_test_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_test_pred)
auc = roc_auc_score(y_test, y_scores)
specificity = tn / (tn + fp)
logloss = log_loss(y_test, y_scores)

# Add to model_results under a new key, keeping the nested metrics schema used by the earlier entries
model_results[f"{model_name} (Tuned)"] = {
    "model": best_tuned_model,
    "metrics": {
        "test": {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": auc,
            "specificity": specificity,
            "log_loss": logloss
        }
    }
}

# Evaluation summary
evaluate_model(y_test, y_test_pred, label=f"{model_name} (Tuned)")
plot_confusion(y_test, y_test_pred, model_name=f"{model_name} (Tuned)")
plot_roc_auc(best_tuned_model, X_test, y_test, model_name=f"{model_name} (Tuned)")
🔧 Running Grid Search for: XGBClassifier
Fitting 5 folds for each of 96 candidates, totalling 480 fits
✅ Best Parameters Found:
{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}

📊 XGBClassifier (Tuned) — Performance Summary:
- Accuracy  :  93.00% → Overall correctness.
- Precision :  94.23% → Of predicted '1', how many were right.
- Recall    :  81.67% → Of actual '1', how many we caught.
- F1 Score  :  87.50% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.87
🔹 ROC AUC Score for XGBClassifier (Tuned): 0.9700
📌 Interpretation: Model is doing a good job distinguishing between classes.
In [65]:
# best_model_info

🎲 Randomized Search¶

📖 Click to Expand
🔍 What is Randomized Search?

Randomized Search selects a random subset of combinations to test, rather than all of them.
It's faster and often just as effective — especially when only a few hyperparameters really matter.

  • Pros: Much faster than grid search, good for large spaces
  • Cons: May miss the optimal combo if unlucky
In [66]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import (
    precision_score, recall_score, f1_score, accuracy_score,
    roc_auc_score, confusion_matrix, log_loss
)

# ๐Ÿ” Use same param grid as defined earlier
model_name = best_model_info["model"].__class__.__name__
param_dist = param_grids.get(model_name)

if param_dist is None:
    raise ValueError(f"No param distribution defined for model: {model_name}")

print(f"\n๐ŸŽฒ Running Randomized Search for: {model_name}")

# Create a new instance of the model
model_instance = best_model_info["model"].__class__()

# ๐Ÿ” Run randomized search
random_search = RandomizedSearchCV(
    estimator=model_instance,
    param_distributions=param_dist,
    n_iter=15,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)
best_random_model = random_search.best_estimator_

print("โœ… Best Parameters Found:")
print(random_search.best_params_)

# ๐Ÿ”Ž Evaluate tuned model
y_test_pred = best_random_model.predict(X_test)

if hasattr(best_random_model, "predict_proba"):
    y_scores = best_random_model.predict_proba(X_test)[:, 1]
elif hasattr(best_random_model, "decision_function"):
    y_scores = best_random_model.decision_function(X_test)
else:
    y_scores = y_test_pred

cm = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = cm.ravel()

# Metrics
precision = precision_score(y_test, y_test_pred, zero_division=0)
recall = recall_score(y_test, y_test_pred, zero_division=0)
f1 = f1_score(y_test, y_test_pred, zero_division=0)
accuracy = accuracy_score(y_test, y_test_pred)
auc = roc_auc_score(y_test, y_scores)
specificity = tn / (tn + fp)
logloss = log_loss(y_test, y_scores)

# Store results, keeping the nested metrics schema used by the earlier entries
model_results[f"{model_name} (RandomSearch)"] = {
    "model": best_random_model,
    "metrics": {
        "test": {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": auc,
            "specificity": specificity,
            "log_loss": logloss
        }
    }
}

# Visual eval
evaluate_model(y_test, y_test_pred, label=f"{model_name} (RandomSearch)")
plot_confusion(y_test, y_test_pred, model_name=f"{model_name} (RandomSearch)")
plot_roc_auc(best_random_model, X_test, y_test, model_name=f"{model_name} (RandomSearch)")
🎲 Running Randomized Search for: XGBClassifier
Fitting 5 folds for each of 15 candidates, totalling 75 fits
✅ Best Parameters Found:
{'subsample': 0.8, 'scale_pos_weight': 1, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}

📊 XGBClassifier (RandomSearch) — Performance Summary:
- Accuracy  :  92.00% → Overall correctness.
- Precision :  89.29% → Of predicted '1', how many were right.
- Recall    :  83.33% → Of actual '1', how many we caught.
- F1 Score  :  86.21% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.86
🔹 ROC AUC Score for XGBClassifier (RandomSearch): 0.9594
📌 Interpretation: Model is doing a good job distinguishing between classes.
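
The cell above reuses the discrete grid, which RandomizedSearchCV simply samples from. The randint / uniform imports hint at the other option: continuous distributions, so the search can land between grid points. A minimal sketch (assuming X_train / y_train from earlier; the ranges are illustrative, not tuned recommendations):

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_dist_continuous = {
    "n_estimators": randint(100, 500),        # integers sampled from [100, 500)
    "max_depth": randint(3, 9),
    "learning_rate": uniform(0.01, 0.19),     # floats sampled from [0.01, 0.20)
    "subsample": uniform(0.7, 0.3),           # floats sampled from [0.7, 1.0)
    "colsample_bytree": uniform(0.7, 0.3),
}

random_search_cont = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric="logloss"),
    param_distributions=param_dist_continuous,
    n_iter=30, scoring="f1", cv=5, n_jobs=-1, random_state=42,
)
random_search_cont.fit(X_train, y_train)
print("Best params:", random_search_cont.best_params_)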

Back to the top


🔀 Ensemble Methods (Templates)¶

📖 Click to Expand
🔀 When Should You Use Ensemble Methods?

Ensembles are useful when:

  • Single models plateau and can't capture all patterns
  • You want to boost performance by combining strengths of multiple models
  • You observe inconsistent results across base models (e.g., one is good at recall, another at precision)
  • You need more robust and stable predictions across different datasets

Use ensembles after benchmarking individual models — they add complexity but often yield better generalization.

🗳️ Voting Classifier¶

📖 Click to Expand
🗳️ What is a Voting Classifier?

A Voting Classifier combines predictions from multiple different models and makes a final decision based on majority vote (for classification) or average prediction (for regression).

There are two main types:

  • Hard Voting: Chooses the class predicted by the most models.
  • Soft Voting: Averages predicted probabilities and chooses the most likely class.

It's like consulting multiple doctors and going with the consensus.

In [67]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Define voting type: 'hard' or 'soft'
voting_type = 'hard'  # change to 'soft' for probability-averaged voting (also enables the ROC plot below)

# Define the ensemble
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('dt', DecisionTreeClassifier()),
        ('nb', GaussianNB())
    ],
    voting=voting_type
)

# Train the ensemble
print(f"🔧 Training: Voting Classifier ({voting_type})")
voting_clf.fit(X_train, y_train)

# Predict labels
y_pred_voting = voting_clf.predict(X_test)

# Evaluate
plot_confusion(y_test, y_pred_voting, model_name=f"Voting Classifier ({voting_type})")

# Only plot ROC if model supports probability estimates
if voting_type == 'soft':
    plot_roc_auc(voting_clf, X_test, y_test, model_name=f"Voting Classifier ({voting_type})")

evaluate_model(y_test, y_pred_voting, label=f"Voting Classifier ({voting_type})")
🔧 Training: Voting Classifier (hard)
📊 Voting Classifier (hard) - Performance Summary:
- Accuracy  :  88.00% → Overall correctness.
- Precision :  90.91% → Of predicted '1', how many were right.
- Recall    :  66.67% → Of actual '1', how many we caught.
- F1 Score  :  76.92% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is comparatively weak here; the hard-voting ensemble misses about a third of the true positives.
- F1 Score shows overall tradeoff quality: 0.77
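The run above used hard voting, so no ROC curve was plotted. Below is a minimal soft-voting sketch, assuming the same train/test split and the plot/evaluate helpers defined earlier; the weights are illustrative, not tuned.

In [ ]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Soft voting averages predict_proba across base models, so ROC/AUC is available.
# The weights just tilt the average toward the linear model (illustrative only).
soft_voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('dt', DecisionTreeClassifier()),
        ('nb', GaussianNB())
    ],
    voting='soft',
    weights=[2, 1, 1]
)

soft_voting_clf.fit(X_train, y_train)
y_pred_soft = soft_voting_clf.predict(X_test)

plot_confusion(y_test, y_pred_soft, model_name="Voting Classifier (soft)")
plot_roc_auc(soft_voting_clf, X_test, y_test, model_name="Voting Classifier (soft)")
evaluate_model(y_test, y_pred_soft, label="Voting Classifier (soft)")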

🧬 Stacking Classifier¶

📖 Click to Expand
🧬 What is Stacking?

Stacking involves training multiple models (called base models), and then using a meta-model to learn how to best combine their outputs.

Example:

  • Base models: logistic regression, decision tree, SVM
  • Meta-model: another model that learns which base model to trust more for each kind of input

It's like having specialists give their opinions, and then a generalist makes the final call based on their inputs.

In [68]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Define base models
base_estimators = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('dt', DecisionTreeClassifier()),
    ('nb', GaussianNB())
]

# Define meta-model (final estimator)
meta_model = LogisticRegression()

# Build stacking classifier
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    passthrough=False,  # set to True if you want raw features included in meta-model input
    cv=5                # internal cross-validation
)

# Train the ensemble
print("๐Ÿ”ง Training: Stacking Classifier")
stacking_clf.fit(X_train, y_train)

# Predict labels
y_pred_stack = stacking_clf.predict(X_test)

# Evaluate
plot_confusion(y_test, y_pred_stack, model_name="Stacking Classifier")
plot_roc_auc(stacking_clf, X_test, y_test, model_name="Stacking Classifier")
evaluate_model(y_test, y_pred_stack, label="Stacking Classifier")
🔧 Training: Stacking Classifier
🔹 ROC AUC Score for Stacking Classifier: 0.9429
📌 Interpretation: Model is doing a good job distinguishing between classes.

📊 Stacking Classifier - Performance Summary:
- Accuracy  :  93.50% → Overall correctness.
- Precision :  88.52% → Of predicted '1', how many were right.
- Recall    :  90.00% → Of actual '1', how many we caught.
- F1 Score  :  89.26% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.89
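To see which base model the meta-model leans on, you can inspect the fitted final estimator. A small sketch, assuming stacking_clf from the cell above has been fit; with a binary target each base model contributes one probability column, so there is one coefficient per base estimator.

In [ ]:
# Peek at the logistic-regression meta-model's coefficients (one per base model here).
meta = stacking_clf.final_estimator_
for (name, _), coef in zip(base_estimators, meta.coef_[0]):
    print(f"{name}: meta-model coefficient = {coef:.3f}")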

🪵 Bagging¶

📖 Click to Expand
🪵 What is Bagging?

Bagging (Bootstrap Aggregating) builds multiple versions of the same model (e.g., decision trees), each trained on a different random sample of the data.

Then it combines their outputs (usually by voting or averaging) to reduce overfitting and variance.

Random Forest is a popular example of bagging.

It's like asking the same expert multiple times under different conditions and averaging their answers.

In [69]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier            # fast, default
from sklearn.linear_model import LogisticRegression        # works well with linear patterns
from sklearn.neighbors import KNeighborsClassifier         # unstable, benefits a lot from bagging
from sklearn.svm import SVC                                # slow with bagging, use carefully
from sklearn.naive_bayes import GaussianNB                 # rare with bagging (already stable)
from sklearn.ensemble import RandomForestClassifier        # not recommended; it's already bagged

# Example usage:
# base_estimator = LogisticRegression(max_iter=1000)
# base_estimator = KNeighborsClassifier()
# base_estimator = SVC(probability=True)
# base_estimator = GaussianNB()

# Define bagging classifier with decision trees
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),   # 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2
    n_estimators=50,              # number of trees
    max_samples=0.8,              # bootstrap sample size
    max_features=1.0,             # use all features
    random_state=42,
    n_jobs=-1                     # parallel processing
)

# Train the ensemble
print("๐Ÿ”ง Training: Bagging Classifier")
bagging_clf.fit(X_train, y_train)

# Predict
y_pred_bag = bagging_clf.predict(X_test)

# Evaluate
plot_confusion(y_test, y_pred_bag, model_name="Bagging Classifier")
plot_roc_auc(bagging_clf, X_test, y_test, model_name="Bagging Classifier")
evaluate_model(y_test, y_pred_bag, label="Bagging Classifier")
🔧 Training: Bagging Classifier
🔹 ROC AUC Score for Bagging Classifier: 0.9685
📌 Interpretation: Model is doing a good job distinguishing between classes.

📊 Bagging Classifier - Performance Summary:
- Accuracy  :  92.00% → Overall correctness.
- Precision :  87.93% → Of predicted '1', how many were right.
- Recall    :  85.00% → Of actual '1', how many we caught.
- F1 Score  :  86.44% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.86
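Because each tree only sees a bootstrap sample, the rows it never saw can act as a free validation set. A minimal out-of-bag (OOB) sketch is below, assuming scikit-learn 1.2+ (which uses estimator= rather than base_estimator=) and the same X_train/y_train as above.

In [ ]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Score each tree on the rows left out of its bootstrap draw,
# giving a validation-like accuracy without a separate holdout set.
bagging_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    max_samples=0.8,
    oob_score=True,            # requires bootstrap=True (the default)
    random_state=42,
    n_jobs=-1
)
bagging_oob.fit(X_train, y_train)
print(f"OOB accuracy estimate: {bagging_oob.oob_score_:.4f}")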

🚀 Boosting¶

📖 Click to Expand
🚀 What is Boosting?

Boosting trains models sequentially: each new model focuses on correcting the mistakes of the previous one.

It gives more weight to errors and slowly builds a strong overall model by combining many weak ones.

Popular examples: XGBoost, AdaBoost, Gradient Boosting

Think of it as building knowledge step by step, learning from past failures to get better over time.

In [70]:
from sklearn.ensemble import GradientBoostingClassifier     # used below; ships with scikit-learn
from xgboost import XGBClassifier                           # optional alternative (requires xgboost)
from lightgbm import LGBMClassifier                         # optional alternative (requires lightgbm)
from catboost import CatBoostClassifier                     # optional alternative (requires catboost)

# Define the boosting classifier
boosting_clf = GradientBoostingClassifier(
    n_estimators=100,        # number of boosting rounds
    learning_rate=0.1,       # step size shrinkage
    max_depth=3,             # depth of each weak learner
    subsample=1.0,           # can be <1.0 for stochastic gradient boosting
    random_state=42
)

# Train the ensemble
print("๐Ÿ”ง Training: Boosting Classifier")
boosting_clf.fit(X_train, y_train)

# Predict
y_pred_boost = boosting_clf.predict(X_test)

# Evaluate
plot_confusion(y_test, y_pred_boost, model_name="Boosting Classifier")
plot_roc_auc(boosting_clf, X_test, y_test, model_name="Boosting Classifier")
evaluate_model(y_test, y_pred_boost, label="Boosting Classifier")
🔧 Training: Boosting Classifier
🔹 ROC AUC Score for Boosting Classifier: 0.9738
📌 Interpretation: Model is doing a good job distinguishing between classes.

📊 Boosting Classifier - Performance Summary:
- Accuracy  :  95.00% → Overall correctness.
- Precision :  93.10% → Of predicted '1', how many were right.
- Recall    :  90.00% → Of actual '1', how many we caught.
- F1 Score  :  91.53% → Balance of precision & recall.

📌 Interpretation:
- Precision looks acceptable; false positives under control.
- Recall is strong; model is catching true cases well.
- F1 Score shows overall tradeoff quality: 0.92
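The "step by step" nature of boosting is easy to see by scoring the model after each round. A small sketch, assuming boosting_clf from the cell above has been fit with n_estimators=100 and that X_test/y_test are the same split used earlier.

In [ ]:
from sklearn.metrics import accuracy_score

# staged_predict yields the ensemble's predictions after each boosting round,
# so we can trace how test accuracy improves as rounds accumulate.
staged_acc = [
    accuracy_score(y_test, y_stage)
    for y_stage in boosting_clf.staged_predict(X_test)
]
print(f"Accuracy after  10 rounds: {staged_acc[9]:.4f}")
print(f"Accuracy after  50 rounds: {staged_acc[49]:.4f}")
print(f"Accuracy after 100 rounds: {staged_acc[-1]:.4f}")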

Back to the top


📦 Export & Deployment (Optional)¶

📖 Click to Expand
  • Save the final trained model to disk (e.g., .pkl, .joblib)
  • Export final evaluation metrics (e.g., to .json or .csv)
  • Package preprocessing steps if applicable (e.g., scalers, encoders)
  • Useful for handing off, sharing, or production integration

🧊 Pickling (Model Export)¶

In [71]:
import joblib
import json
import os

export = False  # set to True to write the model and metrics to disk
if export:
    # 📦 Create export folder if it doesn't exist
    os.makedirs("export", exist_ok=True)

    # 💾 Save the best model
    joblib.dump(best_model, "export/best_model.joblib")

    # 🧮 Prepare and save the evaluation metrics (exclude the model object)
    metrics_copy = {k: v for k, v in model_results[best_model_name].items() if k != "model"}
    with open("export/metrics.json", "w") as f:
        json.dump(metrics_copy, f, indent=2)

    print("✅ Model and metrics exported to /export/")

📊 Monitoring Hooks (Production Logging)¶

📖 Click to Expand

In real-world deployments, it's crucial to track how your model behaves once it's live.

What to log in production:

  • ✅ Number of predictions served
  • ✅ Confidence scores / prediction probabilities
  • ✅ Class distribution over time
  • ✅ Drift in input features
  • ✅ Model response latency
  • ❌ Ground truth (usually delayed or unavailable)

You can integrate this with tools like:

  • Prometheus + Grafana
  • AWS CloudWatch
  • Datadog
  • MLflow, EvidentlyAI, or WhyLabs
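As a concrete starting point, here is a minimal sketch of a prediction wrapper that logs the first few items from the list above (volume, confidence, class mix, latency). It assumes a fitted scikit-learn-style model with predict_proba; in practice the log line would be routed to whichever monitoring backend you use.

In [ ]:
import logging
import time
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("model_monitor")

def predict_with_logging(model, X_batch):
    """Serve predictions while logging volume, confidence, class mix, and latency."""
    start = time.perf_counter()
    proba = model.predict_proba(X_batch)[:, 1]          # positive-class probabilities
    preds = (proba >= 0.5).astype(int)
    latency_ms = (time.perf_counter() - start) * 1000

    logger.info(
        "served=%d | mean_confidence=%.3f | positive_rate=%.3f | latency_ms=%.1f",
        len(preds),
        float(np.mean(np.maximum(proba, 1 - proba))),   # confidence in the predicted class
        float(np.mean(preds)),                          # class-distribution drift signal
        latency_ms,
    )
    return preds

# Example call (assumes best_model and X_test from earlier cells):
# _ = predict_with_logging(best_model, X_test[:100])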

Back to the top