Classification Overview
Classification is a type of supervised machine learning where the goal is to predict a categorical label for an observation. Given a set of features (input data), the model tries to assign the observation to one of several predefined classes. Common examples of classification problems include:
- **Spam detection**: Classifying emails as spam or not.
- **Customer churn prediction**: Classifying customers as likely to leave (churn) or stay based on their activity.
- **Image recognition**: Classifying images into categories, such as identifying animals or vehicles.

In classification, the output is discrete (e.g., 'spam' vs. 'not spam', 'churn' vs. 'no churn'). This contrasts with regression, where the output is continuous (e.g., predicting a house price).
Key Points
- Supervised learning approach.
- Used for predicting categories.
- Output is discrete (binary or multiclass).
- Examples: email classification, disease diagnosis, fraud detection.
Data Setup
Libraries
# Data handling and manipulation
import pandas as pd
import numpy as np
# Machine Learning and Model Evaluation
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, TimeSeriesSplit, KFold
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Statistical and Other Utilities
from scipy.stats import zscore
from termcolor import colored
# Visualization
import matplotlib.pyplot as plt
Sample Data
In this section, we prepare the dataset. For simplicity, we'll use a simulated classification dataset generated with the `make_classification` function from `sklearn`, which creates a synthetic dataset suitable for practicing classification tasks. We will simulate a dataset with the following properties:
- 1000 samples (observations)
- 10 features (predictors)
- 2 informative features (ones that help in prediction)
- 2 classes (binary classification problem)
Let's generate and take a look at the data.
# Simulating a classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=2, n_classes=2, random_state=42)
# Converting to a DataFrame for easier handling
df = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(1, 11)])
df['Target'] = y
# Display the first few rows of the dataset
df.head()
| | Feature_1 | Feature_2 | Feature_3 | Feature_4 | Feature_5 | Feature_6 | Feature_7 | Feature_8 | Feature_9 | Feature_10 | Target |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.964799 | -0.066449 | 0.986768 | -0.358079 | 0.997266 | 1.181890 | -1.615679 | -1.210161 | -0.628077 | 1.227274 | 0 |
| 1 | -0.916511 | -0.566395 | -1.008614 | 0.831617 | -1.176962 | 1.820544 | 1.752375 | -0.984534 | 0.363896 | 0.209470 | 1 |
| 2 | -0.109484 | -0.432774 | -0.457649 | 0.793818 | -0.268646 | -1.836360 | 1.239086 | -0.246383 | -1.058145 | -0.297376 | 1 |
| 3 | 1.750412 | 2.023606 | 1.688159 | 0.006800 | -1.607661 | 0.184741 | -2.619427 | -0.357445 | -1.473127 | -0.190039 | 0 |
| 4 | -0.224726 | -0.711303 | -0.220778 | 0.117124 | 1.536061 | 0.597538 | 0.348645 | -0.939156 | 0.175915 | 0.236224 | 1 |
Data Characteristics Dictionary
This section initializes the data characteristics dictionary, which will store various metadata about the dataset, including details about the target variable, features, data size, and linear separability.
The dictionary contains the following key sections:
**Target Variable:**
- Type: Specifies whether the target variable is binary or multiclass.
- Imbalance: Indicates whether the target variable has class imbalance.
- Class Imbalance Severity: Specifies the severity of the imbalance (e.g., high, low).

**Features:**
- Type: Describes the type of features in the dataset (e.g., categorical, continuous, or mixed).
- Correlation: Indicates the correlation between features (e.g., low, medium, high).
- Outliers: Flag to indicate whether outliers are detected in the features.
- Missing Data: Tracks the percentage of missing data or flags missing values.

**Data Size:**
- Size: Contains the number of samples (rows) and number of features (columns).

**Linear Separability:**
- Linear Separability: States whether the classes are linearly separable (True or False).
This dictionary will be updated dynamically as we analyze the dataset in subsequent steps. It serves as a summary of key dataset properties to help guide further analysis and modeling decisions.
# Initialize the data characteristics dictionary
data_characteristics = {
    "target_variable": {
        "type": None,                      # "binary", "multiclass"
        "imbalance": None,                 # True if imbalanced, False otherwise
        "class_imbalance_severity": None   # e.g., "high", "low"
    },
    "features": {
        "type": None,          # "categorical", "continuous", "mixed"
        "correlation": None,   # "low", "medium", "high"
        "outliers": None,      # True if outliers detected, False otherwise
        "missing_data": None   # Percentage of missing data or boolean
    },
    "data_size": None,             # Size of dataset (samples, features)
    "linear_separability": None    # True if classes are linearly separable
}
EDA
In this section, we perform Exploratory Data Analysis (EDA) specifically to populate the data characteristics dictionary. The goal is to gather essential insights about the dataset and store them in the dictionary, which will later help with modeling and data preprocessing. The analysis includes:
**Analyzing the Target Variable:**
- We assess the target variable (e.g., its type and class imbalance) and store the information in the dictionary.

**Identifying Feature Types:**
- We determine whether the features are categorical, numerical, or mixed, and update the dictionary accordingly.

**Feature Correlations:**
- We analyze the correlations between features and record whether any are highly correlated.

**Identifying Outliers:**
- We check for outliers in the dataset and record whether any are detected.

**Missing Data Analysis:**
- We analyze missing data in the dataset and record the percentage of missing values for each feature.

**Data Size:**
- We gather the data size (number of samples and features) and store it in the dictionary.

**Linear Separability:**
- We determine whether the classes are linearly separable based on a PCA projection and update the dictionary.
Analyzing Target Variable
This section focuses on analyzing the target variable (assumed to be named 'Target' in the dataset). The analysis includes:
**Identifying the Type of Target:**
- Binary Classification: If there are 2 unique classes (e.g., yes/no, true/false).
- Multiclass Classification: If there are more than 2 unique classes.

**Class Distribution:**
- Imbalance Check: Compares the ratio of the smallest to largest class.
- If the ratio is below 0.2, the data is considered highly imbalanced.

**Imbalance Severity:**
- Highlights whether the imbalance is high or low.
- A high imbalance may require special handling, such as SMOTE or adjusting class weights during training.
This analysis is crucial for selecting the appropriate classification technique and handling potential issues due to class imbalance.
def analyze_target_variable(df, data_characteristics):
    target_column = 'Target'  # Assuming 'Target' is the target column name
    unique_classes = df[target_column].nunique()

    print(colored(f"Analyzing Target Variable: {target_column}"))
    print(f"Found {unique_classes} unique classes in the target variable.")

    # Check whether the target is binary or multiclass
    if unique_classes == 2:
        data_characteristics["target_variable"]["type"] = "binary"
        print(colored("The target variable is binary (2 classes).", 'green'))
    else:
        data_characteristics["target_variable"]["type"] = "multiclass"
        print(colored(f"The target variable is multiclass ({unique_classes} classes)."))

    # Check for class imbalance via the smallest-to-largest class ratio
    class_counts = df[target_column].value_counts()
    imbalance_ratio = class_counts.min() / class_counts.max()
    print(f"Class distribution: {class_counts.to_dict()}")

    if imbalance_ratio < 0.2:
        data_characteristics["target_variable"]["imbalance"] = True
        data_characteristics["target_variable"]["class_imbalance_severity"] = "high"
        print(colored(f"Target variable is imbalanced with a min/max class ratio of {imbalance_ratio:.2f} (high imbalance).", 'red'))
    else:
        data_characteristics["target_variable"]["imbalance"] = False
        data_characteristics["target_variable"]["class_imbalance_severity"] = "low"
        print(colored(f"Target variable is balanced with a min/max class ratio of {imbalance_ratio:.2f} (low imbalance).", 'green'))

    return data_characteristics
data_characteristics = analyze_target_variable(df, data_characteristics)
Analyzing Target Variable: Target
Found 2 unique classes in the target variable.
The target variable is binary (2 classes).
Class distribution: {0: 501, 1: 499}
Target variable is balanced with a min/max class ratio of 1.00 (low imbalance).
Analyzing Feature Types
In this section, we analyze the types of features in the dataset (categorical vs. numerical). The process includes:
**Identifying Categorical Features:**
- Features with data types like 'object' or 'category' are considered categorical.
- These features represent categories or labels (e.g., color, gender).

**Identifying Numerical Features:**
- Features with data types 'float64' or 'int64' are considered numerical.
- These features contain quantitative data (e.g., age, height, price).

**Dataset Type:**
- If the dataset contains both categorical and numerical features, it's labeled mixed.
- If only categorical features are present, the dataset is labeled categorical.
- If only numerical features are present, the dataset is labeled continuous.
This analysis is crucial for selecting the appropriate preprocessing techniques for the features (e.g., encoding for categorical features, scaling for numerical features).
def analyze_feature_types(df, data_characteristics):
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    # Note: this dtype scan includes the 'Target' column; drop it first if you
    # want feature-only lists
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

    print(colored("Analyzing Feature Types"))
    print(f"Categorical features: {categorical_cols.tolist()}")
    print(f"Numerical features: {numerical_cols.tolist()}")

    if len(categorical_cols) > 0 and len(numerical_cols) > 0:
        data_characteristics["features"]["type"] = "mixed"
        print(colored("The dataset contains both categorical and numerical features (mixed)."))
    elif len(categorical_cols) > 0:
        data_characteristics["features"]["type"] = "categorical"
        print(colored(f"The dataset contains only categorical features: {categorical_cols.tolist()}.", 'green'))
    else:
        data_characteristics["features"]["type"] = "continuous"
        print(colored(f"The dataset contains only continuous features: {numerical_cols.tolist()}.", 'green'))

    return data_characteristics
data_characteristics = analyze_feature_types(df, data_characteristics)
Analyzing Feature Types
Categorical features: []
Numerical features: ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Target']
The dataset contains only continuous features: ['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Target'].
Analyzing Feature Correlations
This section analyzes the correlation matrix of the features in the dataset, excluding the target variable. The process includes:
**Calculating the Correlation Matrix:**
- The correlation matrix shows how each feature is related to every other feature in the dataset.
- A high correlation between features indicates that they move together in a similar pattern (e.g., both increase or decrease together).

**Identifying Highly Correlated Pairs:**
- A correlation threshold of 0.9 is used to identify highly correlated feature pairs.
- If a correlation exceeds this threshold, it may signal redundant features.

**High Correlation Found:**
- Any pairs above the threshold are flagged as highly correlated.
- Highly correlated features can be removed to avoid multicollinearity issues during modeling.

**Low Correlation:**
- If no feature pairs exceed the threshold, there is no significant redundancy among the features.
This analysis helps to ensure that the features used in the model are independent and do not overlap too much.
def analyze_feature_correlations(df, data_characteristics):
    # Calculate the correlation matrix (excluding the target variable)
    corr_matrix = df.drop('Target', axis=1).corr()
    high_corr_threshold = 0.9  # Correlation threshold for "high"

    print(colored("Analyzing Feature Correlations"))
    print(f"Correlation matrix:\n{corr_matrix}")

    # Count highly correlated pairs on the upper triangle only, so the diagonal
    # is excluded and each symmetric pair is counted once
    upper = corr_matrix.abs().where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
    high_corr = int((upper > high_corr_threshold).sum().sum())
    print(f"Number of highly correlated feature pairs: {high_corr}")

    if high_corr > 0:
        data_characteristics["features"]["correlation"] = "high"
        print(colored(f"Found {high_corr} pairs of features with high correlation (above {high_corr_threshold}).", 'red'))
    else:
        data_characteristics["features"]["correlation"] = "low"
        print(colored(f"No feature pairs found with high correlation (above {high_corr_threshold}).", 'green'))

    return data_characteristics
data_characteristics = analyze_feature_correlations(df, data_characteristics)
Analyzing Feature Correlations
Correlation matrix:
Feature_1 Feature_2 Feature_3 Feature_4 Feature_5 Feature_6 \
Feature_1 1.000000 0.058172 0.935882 0.012282 0.027394 0.016197
Feature_2 0.058172 1.000000 0.062426 0.022266 -0.031579 -0.026735
Feature_3 0.935882 0.062426 1.000000 0.014551 0.012872 -0.000606
Feature_4 0.012282 0.022266 0.014551 1.000000 -0.016467 -0.002439
Feature_5 0.027394 -0.031579 0.012872 -0.016467 1.000000 -0.010388
Feature_6 0.016197 -0.026735 -0.000606 -0.002439 -0.010388 1.000000
Feature_7 -0.779008 -0.059525 -0.949966 -0.015008 0.001379 0.015440
Feature_8 -0.028237 0.037916 -0.013006 0.032063 -0.039584 -0.017981
Feature_9 -0.701895 -0.024690 -0.405943 -0.002441 -0.045036 -0.043242
Feature_10 -0.035538 -0.048878 -0.025878 -0.043532 0.041455 -0.036535
Feature_7 Feature_8 Feature_9 Feature_10
Feature_1 -0.779008 -0.028237 -0.701895 -0.035538
Feature_2 -0.059525 0.037916 -0.024690 -0.048878
Feature_3 -0.949966 -0.013006 -0.405943 -0.025878
Feature_4 -0.015008 0.032063 -0.002441 -0.043532
Feature_5 0.001379 -0.039584 -0.045036 0.041455
Feature_6 0.015440 -0.017981 -0.043242 -0.036535
Feature_7 1.000000 -0.001887 0.100172 0.014548
Feature_8 -0.001887 1.000000 0.046952 -0.039575
Feature_9 0.100172 0.046952 1.000000 0.039866
Feature_10 0.014548 -0.039575 0.039866 1.000000
Number of highly correlated feature pairs: 2
Found 2 pairs of features with high correlation (above 0.9).
Analyzing Outliers
This section focuses on identifying outliers in the dataset using Z-scores. The process includes:
**Calculating Z-scores:**
- Z-scores are computed for each feature to measure how far each data point lies from the mean, in standard deviations.

**Identifying Outliers:**
- Outliers are defined as data points with a Z-score greater than 3 or less than -3 (i.e., more than 3 standard deviations from the mean).

**Detecting Outliers:**
- If outliers are detected, they are flagged and counted.
- The total number of outliers is displayed, which can help decide whether further action is needed, such as data transformation or removal.

**No Significant Outliers:**
- If no significant outliers are detected, the data follows a more typical distribution and may not require adjustments.
This analysis helps to ensure that extreme values do not disproportionately affect the model performance.
def analyze_outliers(df, data_characteristics):
    numeric_data = df.select_dtypes(include=['float64', 'int64'])

    print(colored("Analyzing Outliers"))

    # Calculate Z-scores for each numerical column (note: this includes 'Target')
    z_scores = zscore(numeric_data)
    print(f"Z-scores for numerical features:\n{z_scores}")

    # Count values more than 3 standard deviations above the mean
    # (use np.abs(z_scores) > 3 to also flag the low tail)
    outliers = (z_scores > 3).sum().sum()  # Sum across both axes to get a scalar value

    print(f"Total outliers detected: {outliers}")

    if outliers > 0:
        data_characteristics["features"]["outliers"] = True
        print(colored(f"Detected {outliers} outliers across the features.", 'red'))
    else:
        data_characteristics["features"]["outliers"] = False
        print(colored("No significant outliers detected.", 'green'))

    return data_characteristics
data_characteristics = analyze_outliers(df, data_characteristics)
Analyzing Outliers
Z-scores for numerical features:
Feature_1 Feature_2 Feature_3 Feature_4 Feature_5 Feature_6 \
0 1.089170 -0.051238 1.180508 -0.332625 0.996738 1.167833
1 -1.102121 -0.550766 -1.259751 0.864132 -1.099597 1.815262
2 -0.162122 -0.417257 -0.585947 0.826109 -0.223822 -1.891890
3 2.004226 2.037069 2.038277 0.034419 -1.514866 0.156982
4 -0.296352 -0.695553 -0.296264 0.145399 1.516231 0.575451
.. ... ... ... ... ... ...
995 -1.627580 1.476187 -1.438674 -0.264598 -0.363413 0.002544
996 -1.799079 -3.203167 -1.617012 0.425526 -0.473879 1.341364
997 1.915962 1.768620 1.913529 0.046090 -1.425014 0.295715
998 -0.941497 -0.819835 -0.264546 1.131311 0.208820 -0.306981
999 0.254793 -0.987445 0.414563 -0.305740 1.375175 -1.393991
Feature_7 Feature_8 Feature_9 Feature_10 Target
0 -1.135314 -1.075348 -0.438634 1.199727 -0.998002
1 1.264860 -0.860151 0.312022 0.183985 1.002002
2 0.899075 -0.156119 -0.764079 -0.321835 1.002002
3 -1.850614 -0.262048 -1.078108 -0.214715 -0.998002
4 0.264521 -0.816870 0.169771 0.210684 1.002002
.. ... ... ... ... ...
995 1.117428 1.441595 1.313323 1.045308 1.002002
996 1.282769 -1.524301 1.397637 0.009141 1.002002
997 -1.706853 0.739579 -1.101357 0.199747 -0.998002
998 -0.363899 -0.395490 1.907391 0.840370 -0.998002
999 -0.511903 0.816374 0.177201 1.388482 -0.998002
[1000 rows x 11 columns]
Total outliers detected: 16
Detected 16 outliers across the features.
Analyzing Missing Data
This section focuses on analyzing missing data in the dataset. The process includes:
**Calculating Missing Data Percentage:**
- For each feature, the percentage of missing data is calculated.
- Missing data refers to cells where the value is null (NaN).

**Features with Missing Data:**
- Features with more than 0% missing data are flagged.
- If a feature has missing data, it may need to be addressed with techniques like imputation or removal.

**No Missing Data:**
- If there are no missing values, the dataset is considered complete and no action is required.
This analysis helps determine whether missing data should be imputed, ignored, or handled in another way during preprocessing.
def analyze_missing_data(df, data_characteristics):
    print(colored("Analyzing Missing Data"))

    # Calculate the percentage of missing data for each feature
    missing_data = df.isnull().mean() * 100
    print(f"Missing data percentage:\n{missing_data}")

    if missing_data.max() > 0:
        data_characteristics["features"]["missing_data"] = missing_data
        print(f"Features with missing data: {missing_data[missing_data > 0]}")
    else:
        data_characteristics["features"]["missing_data"] = 0
        print(colored("No missing data detected.", 'green'))

    return data_characteristics
data_characteristics = analyze_missing_data(df, data_characteristics)
Analyzing Missing Data
Missing data percentage:
Feature_1 0.0
Feature_2 0.0
Feature_3 0.0
Feature_4 0.0
Feature_5 0.0
Feature_6 0.0
Feature_7 0.0
Feature_8 0.0
Feature_9 0.0
Feature_10 0.0
Target 0.0
dtype: float64
No missing data detected.
Analyzing Data Size
This section focuses on analyzing the data size of the dataset. The process includes:
**Number of Samples (Rows):**
- The number of samples is the total number of rows in the dataset, representing the number of data points.

**Number of Features (Columns):**
- The number of features reported here is the total number of columns (which, in this notebook, includes the target column).
- Features represent the variables or characteristics of the data (e.g., age, income, product type).
This analysis helps to understand the structure of the dataset, which is important for selecting appropriate machine learning algorithms and models.
def analyze_data_size(df, data_characteristics):
    print(colored("Analyzing Data Size"))

    # Get the number of rows (samples) and columns (note: includes the target column)
    num_samples, num_features = df.shape
    data_characteristics["data_size"] = {"samples": num_samples, "features": num_features}
    print(f"Data Size: {num_samples} samples, {num_features} features.")

    return data_characteristics

data_characteristics = analyze_data_size(df, data_characteristics)
Analyzing Data Size
Data Size: 1000 samples, 11 features.
Analyzing Linear Separability
This section focuses on visualizing the linear separability of the dataset using Principal Component Analysis (PCA). The process includes:
**PCA for Dimensionality Reduction:**
- PCA is used to reduce the dataset to 2 dimensions (PC1 and PC2) for easier visualization.
- This allows us to plot the features in 2D space, making it easier to visually assess whether the classes are linearly separable.

**Visualizing Separability:**
- The PCA projection is plotted, with color coding for the classes of the target variable.
- If the classes are well separated in the 2D plot, the dataset is considered linearly separable.

**Manual Judgment:**
- After inspecting the plot, linear separability can be updated manually based on the visual separation.
- You can mark the dataset as 'separable', 'non-separable', or 'unknown' based on your judgment.
This analysis helps to understand if the dataset can be separated using a linear model (e.g., Logistic Regression, Linear SVM).
def analyze_linear_separability(df, data_characteristics):
    print(colored("Analyzing Linear Separability"))

    X = df.drop('Target', axis=1)
    y = df['Target']

    # Use PCA to reduce to 2D for visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # Plot the PCA projection, colored by class
    plt.figure(figsize=(8, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='coolwarm', alpha=0.7)
    plt.title('PCA Projection of Features')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.colorbar(label='Target')
    plt.show()

    # Manually set linear separability after inspecting the plot, e.g.:
    # data_characteristics["linear_separability"] = "separable"      # classes look separable
    # data_characteristics["linear_separability"] = "non-separable"  # classes overlap heavily
    # data_characteristics["linear_separability"] = "unknown"        # unclear from the plot

    return data_characteristics
data_characteristics = analyze_linear_separability(df, data_characteristics)
# Assuming you have already visualized the PCA plot
# Manually set linear separability based on your observation
data_characteristics["linear_separability"] = "separable" # Or "non-separable" or "unknown"
Analyzing Linear Separability
[PCA scatter plot of PC1 vs. PC2, colored by Target]
Print the populated dictionary
# from IPython.display import HTML
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
# from IPython.display import display, HTML
# import json
# display(HTML(f"<pre style='color:teal; font-size:14px;'>{json.dumps(data_characteristics, indent=4)}</pre>"))
data_characteristics
{'target_variable': {'type': 'binary', 'imbalance': False, 'class_imbalance_severity': 'low'},
 'features': {'type': 'continuous', 'correlation': 'high', 'outliers': True, 'missing_data': 0},
 'data_size': {'samples': 1000, 'features': 11},
 'linear_separability': 'separable'}
Data Treatment
In this section, we apply various data treatments to prepare the dataset for model training. The following treatments will be applied:
- Handling Missing Values
- Feature Engineering
- Handling Categorical Data
- Scaling the Data
Handling Missing Values
This step handles the missing values in the dataset. There are two common approaches:
**Dropping Missing Data:**
- Rows or columns containing missing values (NaN) can be dropped.
- This is usually done when the missing data is relatively small and dropping it won't cause significant information loss.

**Imputing Missing Data:**
- For features with missing values, you can impute the missing data using techniques such as:
  - Mean: For numerical columns, replace missing values with the mean of that column.
  - Median: For numerical columns, replace missing values with the median.
  - Mode: For categorical columns, replace missing values with the most frequent value.

Imputation helps prevent losing valuable data when missing values are widespread.
Example:
- Imputation with Mean: Replace missing values in a numerical column with the mean of that column.
def handle_missing_values(df):
    """
    Handle missing values in the dataset.
    Placeholder: decide whether to impute or drop rows/columns with missing data.
    """
    # Placeholder - example approaches:
    # df = df.dropna()  # Drop rows with missing values
    # OR
    # df['column_name'] = df['column_name'].fillna(df['column_name'].mean())  # Impute with the mean
    pass
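For a concrete illustration, here is a minimal runnable sketch of mean imputation using scikit-learn's `SimpleImputer`. The small `age`/`income` frame is made up for the example; the simulated dataset in this notebook has no missing values.

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy frame with a few missing values (hypothetical columns, for illustration only)
toy = pd.DataFrame({'age': [25, np.nan, 40, 31],
                    'income': [50000, 62000, np.nan, 58000]})

# Replace each NaN with the column mean; strategy='median' or 'most_frequent' also work
imputer = SimpleImputer(strategy='mean')
toy_imputed = pd.DataFrame(imputer.fit_transform(toy), columns=toy.columns)
print(toy_imputed)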
Feature Engineering
Feature engineering involves creating new features or modifying existing features to improve the model's performance. Here are some common techniques:

**Interaction Features:**
- Combine existing features to create interaction features. For example, multiplying two features may capture a relationship between them.

**Temporal Features:**
- Convert date/time data into features like year, month, or day of the week so models can use temporal patterns.

**Aggregated Features:**
- Create aggregated features, such as the mean or sum of several columns, to reduce noise or create summary features.
Example:
- Creating Interaction Features: Multiply two features to create a new feature that represents the interaction between them.
def feature_engineering(df):
    """
    Perform feature engineering to create or modify features in the dataset.
    Placeholder: create new features or transform existing ones based on insights.
    """
    # Placeholder - example:
    # df['new_feature'] = df['feature1'] * df['feature2']  # Interaction feature
    pass
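As a quick sketch of these techniques, here is an example on a toy frame; the column names (`feature1`, `feature2`, `signup_date`) are hypothetical, chosen only for the illustration.

import pandas as pd

# Hypothetical frame with two numeric features and a date column, for illustration only
toy = pd.DataFrame({
    'feature1': [1.0, 2.0, 3.0],
    'feature2': [10.0, 20.0, 30.0],
    'signup_date': pd.to_datetime(['2021-01-15', '2021-06-01', '2022-03-20']),
})

# Interaction feature: product of two existing features
toy['feature1_x_feature2'] = toy['feature1'] * toy['feature2']

# Temporal features extracted from the date column
toy['signup_year'] = toy['signup_date'].dt.year
toy['signup_dayofweek'] = toy['signup_date'].dt.dayofweek
print(toy)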
Handling Categorical Data
This step handles the categorical data in the dataset. Machine learning algorithms require numerical data, so categorical variables need to be converted into numeric values. Common encoding techniques include:
**One-Hot Encoding:**
- Each category in a categorical feature is transformed into binary columns (1 or 0), representing the presence or absence of each category.

**Label Encoding:**
- Assign a unique numeric label to each category in a categorical feature. This method is useful for ordinal features, where the order matters (e.g., low, medium, high).
Example:
- One-Hot Encoding: Convert categorical values into binary columns, one for each category.
def handle_categorical_data(df):
    """
    Handle categorical data in the dataset.
    Placeholder: apply encoding techniques like One-Hot Encoding or Label Encoding.
    """
    # Placeholder - example approaches:
    # df = pd.get_dummies(df, columns=['categorical_column'])  # One-Hot Encoding
    # OR
    # from sklearn.preprocessing import LabelEncoder
    # le = LabelEncoder()
    # df['encoded_column'] = le.fit_transform(df['categorical_column'])  # Label Encoding
    pass
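Here is a minimal sketch of both encodings on a made-up frame (the `color`/`size` columns are hypothetical). One caveat worth noting: `LabelEncoder` assigns codes alphabetically, so for truly ordinal features you may want to map an explicit order instead.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Hypothetical categorical columns, for illustration only
toy = pd.DataFrame({'color': ['red', 'blue', 'red'],
                    'size': ['low', 'high', 'medium']})

# One-Hot Encoding: one binary indicator column per category
encoded = pd.get_dummies(toy, columns=['color'])

# Label Encoding: integer codes (alphabetical order, not ordinal order)
le = LabelEncoder()
encoded['size_encoded'] = le.fit_transform(toy['size'])
print(encoded)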
Scaling the Data
Scaling is important for algorithms that are sensitive to the magnitude of features, like SVM, KNN, and Logistic Regression. There are two main scaling methods:
**Standardization (Z-score Scaling):**
- Transforms the data so that each feature has a mean of 0 and a standard deviation of 1. This is useful when features have different scales.

**Normalization:**
- Transforms the data to fit within a fixed range, typically 0 to 1, and is useful when features have different units or scales.
Example:
- Standardization: Standardize numerical features so they have a mean of 0 and standard deviation of 1.
def scale_data(df):
    """
    Scale the numerical features in the dataset.
    Placeholder: normalize or standardize numerical features if needed.
    """
    # Placeholder - example:
    # from sklearn.preprocessing import StandardScaler
    # scaler = StandardScaler()
    # df[['numerical_column1', 'numerical_column2']] = scaler.fit_transform(
    #     df[['numerical_column1', 'numerical_column2']])
    pass
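A minimal sketch contrasting the two methods on random toy data (the toy array here is generated just for the example):

import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

rng = np.random.default_rng(0)
toy = rng.normal(loc=50, scale=10, size=(6, 2))  # toy numerical data, two columns

standardized = StandardScaler().fit_transform(toy)  # each column: mean 0, std 1
normalized = MinMaxScaler().fit_transform(toy)      # each column rescaled to [0, 1]
print("Standardized:\n", standardized.round(2))
print("Normalized:\n", normalized.round(2))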
Data Splitting
In this section, we explore strategies for splitting the data into training and testing sets. Proper splitting ensures the model is evaluated fairly and helps prevent overfitting and biased performance estimates.
We will cover the following data splitting techniques:
- Basic Train-Test Split: A straightforward 80-20 split.
- Stratified Train-Test Split: Ensures balanced class distribution.
- Time-series Split: Respects the temporal order for time-series data.
- K-Fold Cross-Validation: A more robust evaluation technique.
Basic Train-Test Split
The basic train-test split is the simplest method, where the dataset is divided into two parts: 80% for training and 20% for testing. This basic split helps evaluate model performance but may not always work well for imbalanced or time-series data.
def basic_train_test_split(df, target_column, test_size=0.2, random_state=42):
    """
    Perform a basic train-test split (80-20 by default) on the dataset.

    Parameters:
    - df: DataFrame containing the dataset.
    - target_column: The column name of the target variable.
    - test_size: The proportion of the dataset to include in the test split (default 0.2).
    - random_state: Controls the randomness of the split (default 42).

    Returns:
    - X_train, X_test, y_train, y_test
    """
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]               # Target variable

    # Perform the train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
Stratified Train-Test Split
For imbalanced datasets, it's crucial to keep the class distribution similar in both the training and testing sets. A stratified split ensures that each set has the same proportions of classes as the original dataset.
def stratified_train_test_split(df, target_column, test_size=0.2, random_state=42):
    """
    Perform a stratified train-test split to maintain the class distribution.

    Parameters:
    - df: DataFrame containing the dataset.
    - target_column: The column name of the target variable.
    - test_size: The proportion of the dataset to include in the test split (default 0.2).
    - random_state: Controls the randomness of the split (default 42).

    Returns:
    - X_train, X_test, y_train, y_test
    """
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]               # Target variable

    # stratify=y keeps the class proportions of y in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test
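To see what stratification buys, here is a quick sketch (using the `df` and the two split helpers defined above) comparing class proportions in the resulting test sets:

# Compare test-set class proportions: the stratified split tracks the full
# dataset's ratio almost exactly, while the basic split can drift
_, _, _, y_te_basic = basic_train_test_split(df, target_column='Target')
_, _, _, y_te_strat = stratified_train_test_split(df, target_column='Target')

print("Full data:      ", df['Target'].value_counts(normalize=True).to_dict())
print("Basic test:     ", y_te_basic.value_counts(normalize=True).to_dict())
print("Stratified test:", y_te_strat.value_counts(normalize=True).to_dict())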
Time-series Split
For time-series data, we need to respect the temporal order. This means that the model should always be trained on data from the past and tested on data from the future. Using a time-series split prevents the model from having access to future data during training.
def time_series_split(df, target_column, date_column='date_column', n_splits=5):
    """
    Perform a time-series split, so the model is trained on past data and tested on future data.

    Parameters:
    - df: DataFrame containing the dataset.
    - target_column: The column name of the target variable.
    - date_column: The column to sort by chronologically (this synthetic dataset
      has no date column; the default name is a placeholder).
    - n_splits: The number of splits (default 5).

    Returns:
    - X_train, X_test, y_train, y_test from the final fold.
    """
    # Sort chronologically BEFORE extracting features; sorting afterwards would
    # have no effect on X and y
    df = df.sort_values(date_column)
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]               # Target variable

    # Initialize TimeSeriesSplit: each fold trains on the past, tests on the future
    tscv = TimeSeriesSplit(n_splits=n_splits)
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        print(f"Train samples: {len(X_train)} | Test samples: {len(X_test)}")

    return X_train, X_test, y_train, y_test
K-Fold Cross-Validation
In K-fold cross-validation, the data is split into K equal-sized folds. The model is trained K times, each time using a different fold as the test set and the remaining folds as the training set. This technique provides a more robust estimate of model performance by evaluating it on different portions of the data.
def k_fold_cross_validation(df, target_column, n_splits=5):
    """
    Perform K-Fold Cross-Validation on the dataset.

    Parameters:
    - df: DataFrame containing the dataset.
    - target_column: The column name of the target variable.
    - n_splits: The number of splits (default 5).

    Returns:
    - X_train, X_test, y_train, y_test from the final fold (in practice you would
      train and evaluate inside the loop, once per fold).
    """
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]               # Target variable

    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        print(f"Train samples: {len(X_train)} | Test samples: {len(X_test)}")

    return X_train, X_test, y_train, y_test
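Since the loop above only returns the final fold, one convenient option for an aggregate estimate is scikit-learn's `cross_val_score`, sketched here with the `DummyClassifier` already imported above:

from sklearn.model_selection import cross_val_score

# One accuracy score per fold; the mean is a more stable estimate than any single split
X_all, y_all = df.drop('Target', axis=1), df['Target']
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(DummyClassifier(strategy='most_frequent'),
                         X_all, y_all, cv=cv, scoring='accuracy')
print(f"Per-fold accuracy: {scores.round(3)} | mean: {scores.mean():.3f}")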
Handling Data Leakage
Data leakage happens when the model inadvertently gets access to information it wouldn't have in a real-world scenario, such as future data or features derived from the target variable. This results in overly optimistic performance metrics.
We need to ensure that the data split respects temporal and feature constraints to prevent leakage.
def check_for_data_leakage(df, target_column):
    """
    Check for potential data leakage in the dataset.

    Parameters:
    - df: DataFrame containing the dataset.
    - target_column: The column name of the target variable.
    """
    # Placeholder logic to check for potential data leakage
    pass
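The checker above is left as a placeholder. As an illustration, here is a minimal sketch of two naive heuristics; the 0.95 correlation threshold is an arbitrary choice for the example, and this is by no means a complete leakage audit:

def naive_leakage_checks(df, target_column, corr_threshold=0.95):
    # 1) A feature almost perfectly correlated with the target often encodes the label itself
    corrs = df.drop(target_column, axis=1).corrwith(df[target_column]).abs()
    suspicious = corrs[corrs > corr_threshold]
    print("Suspiciously target-correlated features:", suspicious.to_dict())

    # 2) Duplicate rows can land on both sides of a random split and leak information
    print("Duplicate rows in the dataset:", int(df.duplicated().sum()))

naive_leakage_checks(df, 'Target')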
X_train, X_test, y_train, y_test = basic_train_test_split(df, target_column='Target')
Baseline Model
In this section, we define the baseline model for the classification task. The baseline model is typically a dummy model that can be used to compare against more sophisticated models. Here, we use the DummyClassifier, which predicts the majority class, to set a baseline performance.
The baseline model will help us assess if more advanced models (e.g., Random Forest, SVM) are making meaningful improvements over a simple strategy.
Model Diagnostics
This section covers the basic diagnostic metrics commonly used to evaluate model performance. These metrics are crucial for understanding how well a model performs, especially when dealing with classification tasks. These metrics are derived from the Confusion Matrix, which is shown below:
Confusion Matrix (CM) for Binary Classification
| | Predicted Positive | Predicted Negative |
|---|---|---|
| Actual Positive | True Positive (TP) | False Negative (FN) |
| Actual Negative | False Positive (FP) | True Negative (TN) |
Metric Formulas
**Accuracy (Overall Accuracy):**
- Definition: The ratio of correct predictions to total predictions.
- Formula: $$ \text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} $$
- Interpretation: Measures the overall correctness of the model. However, accuracy can be misleading for imbalanced datasets where one class dominates.

**Precision (Positive Predictive Value, PPV):**
- Definition: The proportion of positive predictions that are actually correct.
- Formula: $$ \text{Precision} = \text{PPV} = \frac{TP}{TP + FP} $$
- Interpretation: Useful when the cost of false positives is high (e.g., fraud detection).

**Recall (True Positive Rate; AKA Sensitivity, TPR, Hit Rate):**
- Definition: The proportion of actual positive instances that are correctly identified by the model.
- Formula: $$ \text{Recall} = \text{TPR} = \text{Sensitivity} = \frac{TP}{TP + FN} $$
- Interpretation: Important when the cost of false negatives is high (e.g., medical diagnosis).

**F1-Score (AKA F-Measure):**
- Definition: The harmonic mean of precision and recall.
- Formula: $$ \text{F1} = \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $$
- Interpretation: Combines precision and recall into a single balanced score. It is useful when you need a balance between false positives and false negatives.

**False Positive Rate (FPR; AKA Fall-out):**
- Definition: The proportion of actual negative instances that are incorrectly predicted as positive.
- Formula: $$ \text{FPR} = \frac{FP}{FP + TN} $$
- Interpretation: A low FPR is desirable, as it means fewer negative instances are misclassified as positive.

**True Negative Rate (TNR; AKA Specificity):**
- Definition: The proportion of actual negative instances that are correctly identified as negative.
- Formula: $$ \text{TNR} = \text{Specificity} = \frac{TN}{TN + FP} $$
- Interpretation: Useful when the goal is to correctly identify the negative class.

**False Negative Rate (FNR; AKA Miss Rate):**
- Definition: The proportion of actual positive instances that are incorrectly predicted as negative.
- Formula: $$ \text{FNR} = \frac{FN}{FN + TP} $$
- Interpretation: Indicates how well the model avoids missing positive instances.
These metrics are crucial for assessing the performance of classification models, especially when dealing with imbalanced datasets or when the cost of false positives and false negatives varies significantly.
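As a sanity check on the formulas above, here is a minimal sketch that derives each metric from scikit-learn's `confusion_matrix` for the binary case; the tiny label arrays are made up for the example.

from sklearn.metrics import confusion_matrix

def basic_metrics_from_cm(y_true, y_hat):
    # ravel() on a 2x2 confusion matrix yields tn, fp, fn, tp (labels sorted as [0, 1])
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return {
        'accuracy': (tp + tn) / (tp + tn + fp + fn),
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'fpr': fp / (fp + tn) if (fp + tn) else 0.0,
        'tnr': tn / (tn + fp) if (tn + fp) else 0.0,
        'fnr': fn / (fn + tp) if (fn + tp) else 0.0,
    }

print(basic_metrics_from_cm([0, 1, 1, 0, 1], [0, 1, 0, 0, 1]))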
Advanced Model Diagnostics
This section covers the advanced diagnostic metrics for evaluating model performance, especially in more complex models. These metrics are valuable in assessing the ability of models to handle different challenges, including imbalanced datasets, class distributions, and performance under different thresholds.
1. AUC-ROC (Area Under the Curve - Receiver Operating Characteristic)
- Definition: Measures the area under the ROC curve, which plots True Positive Rate (TPR) vs False Positive Rate (FPR) at various thresholds. AUC quantifies the model's ability to distinguish between the positive and negative classes.
- Formula: $$ \text{AUC} = \int_0^1 \text{TPR}(\text{FPR}) d\text{FPR} $$
- Interpretation: An AUC value of 1.0 indicates perfect model performance, while a value of 0.5 indicates a model with no discriminatory power (equivalent to random guessing).
2. Precision-Recall AUC (PR AUC)
- Definition: Similar to AUC-ROC, but the curve is plotted with Precision vs Recall rather than TPR and FPR. This metric is more informative when dealing with imbalanced classes.
- Formula: $$ \text{PR AUC} = \int_0^1 \text{Precision}(\text{Recall}) d\text{Recall} $$
- Interpretation: This metric focuses on the precision-recall trade-off, especially useful when the data is imbalanced. Higher values represent better performance.
3. Matthews Correlation Coefficient (MCC)
- Definition: A metric that combines True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN) to provide a balanced measure of the quality of the binary classification, especially in imbalanced datasets.
- Formula: $$ \text{MCC} = \frac{TP \times TN - FP \times FN}{\sqrt{(TP + FP)(TP + FN)(TN + FP)(TN + FN)}} $$
- Interpretation: MCC ranges from -1 (completely wrong prediction) to 1 (perfect prediction), with 0 indicating a random classifier. It's a more balanced metric when dealing with imbalanced datasets.
4. Logarithmic Loss (Log Loss)
- Definition: Measures the uncertainty of the model's predictions based on the probability outputs. It is useful for models that predict probabilities, such as logistic regression and neural networks.
- Formula: $$ \text{Log Loss} = - \frac{1}{N} \sum_{i=1}^N \left[ y_i \log(p_i) + (1 - y_i) \log(1 - p_i) \right] $$
- Interpretation: Lower log loss indicates better performance, as it penalizes incorrect classifications more heavily based on confidence.
5. Cohen's Kappa
- Definition: A metric that compares the observed accuracy with the expected accuracy (random chance). It accounts for the possibility of agreement occurring by chance.
- Formula:
$$ \text{Kappa} = \frac{P_o - P_e}{1 - P_e} $$
where:
- $P_o$ is the observed accuracy
- $P_e$ is the expected accuracy by chance.
- Interpretation: Kappa ranges from -1 (total disagreement) to 1 (perfect agreement), and values closer to 0 indicate agreement similar to chance.
6. Cross-Entropy
- Definition: Similar to Log Loss, but more general. Cross-entropy measures the performance of a classification model whose output is a probability value between 0 and 1.
- Formula: $$ \text{Cross-Entropy} = - \sum_{i=1}^n y_i \log(p_i) $$
- Interpretation: A lower cross-entropy indicates better performance. It is often used in multi-class classification.
7. Kullback-Leibler Divergence (KL Divergence)
- Definition: Measures the difference between two probability distributions โ the true distribution of the labels and the predicted distribution.
- Formula: $$ D_{KL}(P \parallel Q) = \sum_i P(x_i) \log \left(\frac{P(x_i)}{Q(x_i)}\right) $$
- Interpretation: KL Divergence is used to understand how much information is lost when the predicted distribution is used instead of the true distribution. Lower values are better.
8. Hamming Loss
- Definition: The fraction of labels that are incorrectly predicted. It is typically used in multi-label classification tasks.
- Formula: $$ \text{Hamming Loss} = \frac{1}{N} \sum_{i=1}^{N} \left| y_i - \hat{y}_i \right| $$
- Interpretation: A lower Hamming loss is preferable, as it indicates fewer incorrect predictions.
9. F2 Score
- Definition: A variation of the F1-Score, but with more emphasis on recall over precision (by using a weight of 2).
- Formula: $$ \text{F2} = \frac{5 \times \text{Precision} \times \text{Recall}}{4 \times \text{Precision} + \text{Recall}} $$
- Interpretation: The F2 score gives more importance to recall than precision and is useful in cases where missing positive instances is more costly.
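Most of these metrics are available directly in `sklearn.metrics`. Here is a minimal sketch on made-up labels and probabilities; note that `average_precision_score` is one common way to summarize the precision-recall curve.

import numpy as np
from sklearn.metrics import (roc_auc_score, average_precision_score, matthews_corrcoef,
                             log_loss, cohen_kappa_score, hamming_loss, fbeta_score)

# Toy labels and predicted probabilities, for illustration only
y_true = np.array([0, 0, 1, 1, 1, 0])
y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])  # P(class = 1)
y_hat = (y_prob >= 0.5).astype(int)                  # hard predictions at a 0.5 threshold

print("AUC-ROC:      ", roc_auc_score(y_true, y_prob))
print("PR AUC (AP):  ", average_precision_score(y_true, y_prob))
print("MCC:          ", matthews_corrcoef(y_true, y_hat))
print("Log loss:     ", log_loss(y_true, y_prob))
print("Cohen's kappa:", cohen_kappa_score(y_true, y_hat))
print("Hamming loss: ", hamming_loss(y_true, y_hat))
print("F2 score:     ", fbeta_score(y_true, y_hat, beta=2))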
import numpy as np
import pandas as pd
def generate_worst_case_metrics(df, target_column):
    # Get the number of unique classes in the target column
    n_classes = df[target_column].nunique()

    # Initialize worst-case metrics (a model that predicts everything incorrectly)
    worst_case_metrics = {
        'model_name': 'Worst Case Model',
        'accuracy': 0.0,        # No correct predictions
        'metrics': {
            'precision': 0.0,   # No correct positive predictions
            'recall': 0.0,      # No positives recovered
            'f1_score': 0.0,    # Both precision and recall are 0
            'tpr': 0.0,         # True Positive Rate (recall), 0 in the worst case
            'fpr': 1.0,         # False Positive Rate, 1 (all negatives predicted positive)
            'tnr': 0.0,         # True Negative Rate, 0 (all negatives misclassified)
            'fnr': 1.0,         # False Negative Rate, 1 (all positives missed)
        },
        # Dynamically generate an NxN confusion matrix for multi-class problems,
        # initialized with NaN values
        'confusion_matrix': np.full((n_classes, n_classes), np.nan).tolist(),
    }

    # Worst-case confusion matrix: diagonal elements are 0 (no true positives);
    # off-diagonal elements stay NaN (the model misclassifies everything, but the
    # exact counts are unspecified)
    for i in range(n_classes):
        worst_case_metrics['confusion_matrix'][i][i] = 0

    # Advanced diagnostics (commented out for now; enable if needed):
    # 'roc_auc': 0.0,        # Area Under the ROC Curve
    # 'log_loss': 1.0,       # Log Loss / Cross-Entropy Loss (high in the worst case)
    # 'mcc': 0.0,            # Matthews Correlation Coefficient
    # 'cohen_kappa': 0.0,    # Cohen's Kappa
    # 'pr_auc': 0.0,         # Precision-Recall AUC
    # 'brier_score': 1.0,    # Brier Score (1 in the worst case)
    # 'lift': 0.0,           # Lift score
    # 'gain': 0.0,           # Gain score
    # 'calibration_plot': 'Not available',  # For probabilistic models
    # 'hamming_loss': 1.0,   # Hamming Loss (1 in the worst case, multi-label)

    return worst_case_metrics

# Example usage, assuming df is your DataFrame and 'Target' is the label column
worst_case = generate_worst_case_metrics(df, 'Target')
print(worst_case)
{'model_name': 'Worst Case Model', 'accuracy': 0.0, 'metrics': {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'tpr': 0.0, 'fpr': 1.0, 'tnr': 0.0, 'fnr': 1.0}, 'confusion_matrix': [[0, nan], [nan, 0]]}
Dummy Classifier
In this step, we train a Dummy Classifier using the majority class strategy, where the model always predicts the most frequent class in the dataset.
- Strategy: The Dummy Classifier uses the 'most_frequent' strategy to predict the majority class for every observation.
- This serves as our baseline model for comparison.
By using this model, we can gauge the minimum performance we should expect from any more complex model. If an advanced model does not outperform this baseline, it likely indicates that the model is not capturing meaningful patterns in the data.
def dummy_classifier_baseline(X_train, y_train, X_test, y_test):
    """
    Dummy classifier predicting the majority class, used as the baseline.
    """
    # Initialize the Dummy Classifier to predict the most frequent class
    baseline_model = DummyClassifier(strategy='most_frequent', random_state=42)

    # Fit the model on the training data
    baseline_model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = baseline_model.predict(X_test)

    # Return predictions for evaluation
    return y_pred
y_pred = dummy_classifier_baseline(X_train, y_train, X_test, y_test)
# # 3. Evaluate the baseline model's performance
# accuracy = accuracy_score(y_test, y_pred)
# clf_report = classification_report(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)
# # 4. Print the evaluation metrics
# print(f"Baseline Model Accuracy: {accuracy:.4f}")
# print("\nClassification Report:\n", clf_report)
# print("Confusion Matrix:\n", conf_matrix)
Metrics to Benchmark
After training the baseline model, we evaluate its performance using key classification metrics:
Accuracy: Measures the proportion of correct predictions out of the total predictions.
Classification Report: Includes metrics like:
- Precision: The percentage of relevant instances among the retrieved instances.
- Recall: The percentage of relevant instances that have been retrieved.
- F1-Score: The harmonic mean of precision and recall.
Confusion Matrix: Provides a detailed breakdown of the True Positives, True Negatives, False Positives, and False Negatives to understand where the model is making errors.
These metrics provide a benchmark for evaluating more advanced models. Any model you train later should outperform the baseline model on these metrics to demonstrate its effectiveness.
def evaluate_baseline_model(y_test, y_pred):
    """
    Evaluate the baseline model using key classification metrics.
    """
    # Accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Classification report (precision, recall, F1-score)
    clf_report = classification_report(y_test, y_pred)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print all evaluation metrics
    print(f"Baseline Model Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", clf_report)
    print("Confusion Matrix:\n", conf_matrix)
evaluate_baseline_model(y_test, y_pred)
Baseline Model Accuracy: 0.4450

Classification Report:
               precision    recall  f1-score   support

           0       0.45      1.00      0.62        89
           1       0.00      0.00      0.00       111

    accuracy                           0.45       200
   macro avg       0.22      0.50      0.31       200
weighted avg       0.20      0.45      0.27       200

Confusion Matrix:
 [[ 89   0]
 [111   0]]
/Users/ar/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result)) (warning repeated three times, once per averaging mode)