import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import numpy as np

# Set a seed for reproducibility
np.random.seed(42)

# Number of rows in the dummy dataset
n_rows = 100

# Define possible values for categorical columns
stock_codes = ['85123A', '71053', '84406B', '84406G', '84406E']
descriptions = [
    'WHITE HANGING HEART T-LIGHT HOLDER',
    'WHITE METAL LANTERN',
    'CREAM CUPID HEARTS COAT HANGER',
    'PARTY BUNTING',
    'RED WOOLLY HOTTIE'
]
countries = ['United Kingdom', 'France', 'Germany', 'Spain', 'Netherlands']
customer_ids = [np.nan, 12346, 12347, 12348, 12349]

# Create dummy retail dataset and assign it to df
df = pd.DataFrame({
    'InvoiceNo': np.random.randint(10000, 20000, n_rows),
    'StockCode': np.random.choice(stock_codes, n_rows),
    'Description': np.random.choice(descriptions, n_rows),
    'Quantity': np.random.randint(1, 10, n_rows),
    'InvoiceDate': pd.date_range('2020-01-01', periods=n_rows, freq='D'),
    'UnitPrice': np.round(np.random.uniform(0.5, 20.0, n_rows), 2),
    'CustomerID': np.random.choice(customer_ids, n_rows),
    'Country': np.random.choice(countries, n_rows)
})

# Introduce additional random missingness into numeric columns (around 10% missing)
missing_mask = np.random.rand(n_rows) < 0.1  # roughly 10% missing
df.loc[missing_mask, 'Quantity'] = np.nan
missing_mask = np.random.rand(n_rows) < 0.1
df.loc[missing_mask, 'UnitPrice'] = np.nan

# Display a preview of the dummy dataset
print("Dummy Retail Dataset Preview:")
display(df.head())

# Optionally, show dataset info for a quick summary
print("\nDataset Info:")
df.info()

Dummy Retail Dataset Preview:

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   InvoiceNo    100 non-null    int64         
 1   StockCode    100 non-null    object        
 2   Description  100 non-null    object        
 3   Quantity     87 non-null     float64       
 4   InvoiceDate  100 non-null    datetime64[ns]
 5   UnitPrice    94 non-null     float64       
 6   CustomerID   79 non-null     float64       
 7   Country      100 non-null    object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(3)
memory usage: 6.4+ KB

import matplotlib.pyplot as plt
import seaborn as sns

def missing_values_heatmap(df):
    """
    Display a heatmap of missing values for the entire dataframe with enhanced x-axis labels.
    
    Parameters:
        df (pd.DataFrame): Input dataframe.
        
    Behavior:
        - The x-axis displays column names with missing percentages in parentheses (if non-zero).
        - The y-axis labels (row indices) are hidden to reduce clutter.
    """
    # Compute missing percentages per column
    missing_percent = 100 * df.isnull().sum() / len(df)
    
    # Create custom x-axis labels: append missing percentage for non-zero values
    x_labels = [
        f"{col} ({round(missing_percent[col], 1)}%)" if missing_percent[col] > 0 else col 
        for col in df.columns
    ]
    
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), cbar=False, yticklabels=False)
    plt.title("Missing Values Heatmap")
    plt.xlabel("Columns")
    # Set custom x-axis tick labels with rotation for better readability
    plt.xticks(ticks=plt.xticks()[0], labels=x_labels, rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Example usage:
missing_values_heatmap(df)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def detect_missing_by_column(df, visualize=False, plot_type='heatmap', ordering=False):
    """
    Generate a table of missing values by column and display a bar chart.
    
    Parameters:
        df (pd.DataFrame): Input dataframe.
        visualize (bool): If True, displays a bar chart.
        plot_type (str): Ignored (always bar chart is used).
        ordering (bool): If True, orders the columns by descending missing count.
    
    Returns:
        pd.DataFrame: A DataFrame with columns 'Column_Name', 'Missing_Count', and 'Missing_Percentage'.
    """
    # Calculate missing counts and percentages.
    missing_counts = df.isnull().sum()
    missing_percent = 100 * missing_counts / len(df)
    
    # Create summary table with no index.
    summary = pd.DataFrame({
        'Missing_Count': missing_counts,
        'Missing_Percentage': missing_percent
    }).reset_index()
    summary.rename(columns={'index': 'Column_Name'}, inplace=True)
    
    # Order the summary if requested.
    if ordering:
        summary = summary.sort_values(by='Missing_Count', ascending=False).reset_index(drop=True)
    
    # Visualization: Bar chart only.
    if visualize:
        plt.figure(figsize=(10, 4))
        plt.title("Missing Values By Column")
        
        # Build custom x-axis labels: append missing percentage (if non-zero).
        column_labels = [
            f"{col} ({round(perc, 1)}%)" if perc > 0 else col
            for col, perc in zip(summary['Column_Name'], summary['Missing_Percentage'])
        ]
        
        bars = plt.bar(range(len(summary)), summary['Missing_Count'], color='skyblue')
        plt.xticks(range(len(summary)), column_labels, rotation=45, ha='right')
        plt.ylabel("Missing Count")
        
        # Annotate each bar with missing count and percentage.
        for i, (count, perc) in enumerate(zip(summary['Missing_Count'], summary['Missing_Percentage'])):
            if count > 0:
                annotation = f"{int(count)} ({round(perc, 1)}%)"
                plt.text(i, count, annotation, ha='center', va='bottom')
                
        plt.tight_layout()
        plt.show()
    
    return summary

# Example usage (the function call remains unchanged):
detect_missing_by_column(df, visualize=True, plot_type='heatmap', ordering=True)

import pandas as pd
import matplotlib.pyplot as plt

def missing_count_by_row(df, visualize=False):
    """
    Compute and return a table of missing counts for each row.
    
    Parameters:
        df (pd.DataFrame): Input dataframe.
        visualize (bool): If True, displays a bar chart of missing counts in the original (unsorted) order,
                          with x-axis ticks only at multiples of 5.
        
    Returns:
        pd.DataFrame: A DataFrame with two columns:
            - Row_ID: The original row index of df.
            - Missing_Count: The number of missing values in that row.
        The DataFrame is ordered by Missing_Count in descending order.
    """
    # Compute missing values count per row (unsorted)
    missing_counts_unsorted = df.isnull().sum(axis=1)
    
    # Create the result table and a sorted version.
    result = pd.DataFrame({
        "Row_ID": df.index,
        "Missing_Count": missing_counts_unsorted
    })
    result_sorted = result.sort_values(by="Missing_Count", ascending=False).reset_index(drop=True)
    
    # Visualization: Plot the unsorted missing counts (original order) with x-axis ticks only at multiples of 5.
    if visualize:
        plt.figure(figsize=(10, 4))
        plt.bar(range(len(missing_counts_unsorted)), missing_counts_unsorted, color="salmon")
        
        # Define tick positions: multiples of 5.
        tick_positions = list(range(0, len(df), 5))
        tick_labels = [df.index[i] for i in tick_positions]
        
        plt.xticks(tick_positions, tick_labels, rotation=45, ha="right")
        plt.ylabel("Missing Count")
        plt.title("Missing Count by Row (Unordered)")
        plt.tight_layout()
        plt.show()
    
    return result_sorted

# Example usage:
row_missing_counts = missing_count_by_row(df, visualize=True)
row_missing_counts.head()

def deletion_imputation(df, target='rows'):
    """
    Delete rows or columns with missing values.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        target (str): Determines whether to delete rows or columns with missing values.
                      Use 'rows' to remove rows (default) or 'columns' to remove columns.

    Returns:
        pd.DataFrame: DataFrame with the specified rows or columns removed.
    """
    if target == 'rows':
        return df.dropna(axis=0)
    elif target == 'columns':
        return df.dropna(axis=1)
    else:
        raise ValueError("target must be 'rows' or 'columns'")

# Example usage:
# To delete rows with missing values:
# df_imputed = deletion_imputation(df, target='rows')
#
# To delete columns with missing values:
# df_imputed = deletion_imputation(df, target='columns')

def simple_imputation(df, column, method='mean'):
    """
    Impute missing values in a single specified column using a simple method.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        column (str): Column name to impute.
        method (str): Imputation method; must be 'mean', 'median', or 'mode'.
    
    Returns:
        pd.DataFrame: DataFrame with missing values imputed in the specified column.
    """
    if method == 'mean':
        df[column].fillna(df[column].mean(), inplace=True)
    elif method == 'median':
        df[column].fillna(df[column].median(), inplace=True)
    elif method == 'mode':
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        raise ValueError("Method must be 'mean', 'median', or 'mode'")
    return df

# Example usage:
# df_imputed = df.copy()
# for col in ['col1', 'col2']:
#     df_imputed = simple_imputation(df_imputed, column=col, method='mean')

def knn_imputation(df, n_neighbors=5):
    """
    Impute missing values using KNN imputation on numeric columns.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        n_neighbors (int): Number of neighbors for KNN imputation.

    Returns:
        pd.DataFrame: DataFrame with numeric missing values imputed via KNN.
    """
    from sklearn.impute import KNNImputer

    numeric_cols = df.select_dtypes(include=['number']).columns
    imputer = KNNImputer(n_neighbors=n_neighbors)
    # Replace the numeric columns in the original dataframe with imputed values.
    df[numeric_cols] = pd.DataFrame(imputer.fit_transform(df[numeric_cols]),
                                    columns=numeric_cols,
                                    index=df.index)
    return df

# Example usage:
# df_imputed = knn_imputation(df, n_neighbors=5)

def iterative_imputation(df, random_state=42, max_iter=100):
    """
    Impute missing values using iterative (model-based) imputation on numeric columns.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        random_state (int): Random state for reproducibility.
        max_iter (int, optional): Maximum number of imputation rounds (default: 100).
    
    Returns:
        pd.DataFrame: DataFrame with missing values imputed iteratively.
    """
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer

    numeric_cols = df.select_dtypes(include=['number']).columns
    imputer = IterativeImputer(random_state=random_state, max_iter=max_iter)
    df[numeric_cols] = pd.DataFrame(imputer.fit_transform(df[numeric_cols]),
                                    columns=numeric_cols,
                                    index=df.index)
    return df

# Example usage:
df_imputed = iterative_imputation(df, random_state=42)

def timeseries_imputation(df, method='ffill', target_col=None):
    """
    Impute missing values in a time series dataframe (or column) using forward fill or backward fill.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        method (str): The imputation method to use. Options:
            - 'ffill': Forward Fill – propagates the last valid observation forward to fill missing values.
            - 'bfill': Backward Fill – uses the next valid observation to fill missing values.
            These methods assume that adjacent time steps are similar.
        target_col (str or None): Specifies a single column to impute. If None, imputation is applied to the entire dataframe.


    Returns:
        pd.DataFrame: DataFrame with missing values imputed using the selected method.
    """
    if method not in ['ffill', 'bfill']:
        raise ValueError("Method must be 'ffill' or 'bfill'")
    
    if target_col:
        df[target_col].fillna(method=method, inplace=True)
    else:
        df.fillna(method=method, inplace=True)
    
    return df

# Example usage:
# To impute the entire dataframe:
# df_imputed = timeseries_imputation(df, method='ffill')
#
# To impute only a specific column, e.g., 'col1':
# df_imputed = timeseries_imputation(df, method='bfill', target_col='col1')

def linear_regression_imputation(df, target_col, predictor_cols):
    """
    Impute missing values in a target column using linear regression based on other predictor columns.

    Parameters:
        df (pd.DataFrame): Input dataframe.
        target_col (str): The column to impute.
        predictor_cols (list of str): List of column names used as predictors.
    
    Returns:
        pd.DataFrame: DataFrame with missing values in the target column imputed.
    """
    from sklearn.linear_model import LinearRegression

    # Split the data into rows with non-missing and missing values in the target column.
    train_data = df[df[target_col].notna()]
    test_data = df[df[target_col].isna()]
    
    # If there are no missing values to impute, return the original dataframe.
    if test_data.empty:
        return df
    
    # Train the regression model using the non-missing rows.
    model = LinearRegression()
    model.fit(train_data[predictor_cols], train_data[target_col])
    
    # Predict missing values for the target column.
    predicted_values = model.predict(test_data[predictor_cols])
    df.loc[test_data.index, target_col] = predicted_values
    
    return df

# Example usage:
# df_imputed = linear_regression_imputation(df, target_col='target', predictor_cols=['col1', 'col2'])

missing_values_heatmap(df_imputed)

detect_missing_by_column(df_imputed, visualize=True, plot_type='bar', ordering=True)

missing_count_by_row(df_imputed, visualize=True)

	Row_ID	Missing_Count
0	77	2
1	74	2
2	0	1
3	54	1
4	31	1

	Row_ID	Missing_Count
0	0	0
1	63	0
2	73	0
3	72	0
4	71	0
...	...	...
95	30	0
96	29	0
97	28	0
98	27	0
99	99	0

	InvoiceNo	StockCode	Description	Quantity	InvoiceDate	UnitPrice	CustomerID	Country
0	17270	84406G	WHITE METAL LANTERN	5.0	2020-01-01	NaN	12349.0	France
1	10860	85123A	CREAM CUPID HEARTS COAT HANGER	8.0	2020-01-02	9.56	NaN	United Kingdom
2	15390	84406G	WHITE HANGING HEART T-LIGHT HOLDER	1.0	2020-01-03	13.17	12348.0	Germany
3	15191	71053	WHITE HANGING HEART T-LIGHT HOLDER	5.0	2020-01-04	1.44	NaN	Netherlands
4	15734	85123A	PARTY BUNTING	3.0	2020-01-05	19.01	NaN	France

	Column_Name	Missing_Count	Missing_Percentage
0	CustomerID	21	21.0
1	Quantity	13	13.0
2	UnitPrice	6	6.0
3	InvoiceNo	0	0.0
4	StockCode	0	0.0
5	Description	0	0.0
6	InvoiceDate	0	0.0
7	Country	0	0.0

📖 Missing Values¶

🗂️ Data Setup¶

📚 Import Libraries¶

📄 Generate Dataset¶

🔍 Missing Value Detection ¶

🔥 Missing Values Heatmap ¶

📊 Column-wise ¶

🔄 Row-wise ¶

🧪 Missing Value Treatment¶

🔧 Deletion¶

🔍 Simple Imputation¶

🤖 Algorithm / Inner-Workings:¶

✅ When to Use:¶

⚖️ Trade-Offs:¶

👍 Pros:¶

👎 Cons:¶

🤖 KNN Imputation¶

🤖 Algorithm / Inner-Workings:¶

✅ When to Use:¶

⚖️ Trade-Offs:¶

👍 Pros:¶

👎 Cons:¶

🔄 Iterative Imputation¶

🤖 Algorithm / Inner-Workings:¶

✅ When to Use:¶

⚖️ Trade-Offs:¶

👍 Pros:¶

👎 Cons:¶

⏱️ Time Series Imputation (Smoothing / Carry-Forward/Backward)¶

🤖 Algorithm / Inner-Workings:¶

✅ When to Use:¶

⚖️ Trade-Offs:¶

👍 Pros:¶

👎 Cons:¶

📈 Linear Regression Imputation¶

🤖 Algorithm / Inner-Workings:¶

✅ When to Use:¶

⚖️ Trade-Offs:¶

👍 Pros:¶

👎 Cons:¶

📊 Post Imputation Checks¶