
📖 Hypothesis Testing¶

🗂️ Data Setup

  • ⚙️ Define Test Configuration
  • 🧪 Generate Data from Config

📈 EDA

  • 👥 Group Count from Data
  • 📏 Sample Size from Data
  • 📊 Outcome Type from Data
  • 🔍 Check Distribution (Normality)
  • 📏 Check Variance Equality
  • 🔍 Visual Check

🛠️ Test Setup

  • 📏 Infer Parametric Flag
  • 🧪 Validate Configuration Dictionary

🧪 Hypothesis Testing

  • 🧭 Determine Test To Run
  • 🧠 Print Hypothesis Statement
  • 🧪 Run Hypothesis Test

🗂️ Data Setup¶

📖 Notebook User Guide (click to expand)
▶️ Run order¶
  • Run cells top to bottom.
  • After changing config, re-run from Define Test Configuration onward.
✅ Valid config¶
  • outcome_type: continuous | binary | categorical | count
  • group_count: one-sample | two-sample | multi-sample
  • group_relationship: independent | paired
  • ❌ Invalid examples: multi-sample + paired, one-sample + count, one-sample + categorical, paired + count
    → Validate Configuration will raise on invalid combos.
1️⃣ One-sample¶
  • Set config['population_mean'] = <value> (e.g. 0) in the config cell.
  • Do this before running Validate Configuration.
📊 Data¶
  • Use Generate Data from Config to create df, or load your own CSV.
  • Own data must match the expected data structure for your design.
🔄 Pipeline (in order)¶
  1. Config → Validate
  2. Generate data
  3. EDA (group count, sample size, outcome, distribution, variance, visuals)
  4. Validate again
  5. Determine test → Hypothesis statement → Run test

⚙️ Define Test Configuration¶

In [20]:
# Display Settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML
import warnings
import json
%load_ext autoreload
%autoreload 2

# UDF
my_seed = 1995
from ht_utils import *
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [ ]:
config = {
    # ----------------------------
    # Experiment Inputs
    # ----------------------------
    'outcome_type': 'continuous',        # 'continuous', 'binary', 'categorical', 'count'
    'group_relationship': 'independent', # 'independent', 'paired'. One-sample is independent by definition.
    'group_count': 'two-sample',         # 'one-sample', 'two-sample', 'multi-sample'
    'distribution': None,                # None, 'normal', 'non-normal'
    'variance_equal': None,              # None, 'equal', 'unequal'
    
    # For one-sample only: set reference value (e.g. 0) → uncomment and set before Validate:
    # 'population_mean': 0,
    
    # ----------------------------
    # Simulated Data
    # ----------------------------
    'sample_size': 100,                  # per group
    'effect_size': 0.5,                  # for generating synthetic difference
    
    # ----------------------------
    # Assumption Flags (Inferred later)
    # ----------------------------
    'tail_type': 'two-tailed',           # 'one-tailed', 'two-tailed'
    'parametric': None,                  # True or False → to be inferred later
    'alpha': 0.05,                       # significance level
    
    # ----------------------------
    # Test Decision
    # ----------------------------
    'test_name': None,                   # e.g. two_sample_ttest_welch. → to be inferred later
    'H_0': None,                         # Null Hypo → to be filled later   
    'H_a': None                          # Alt Hypo → to be filled later    
}
In [ ]:
print_config_summary(config)
📋 Hypothesis Test Configuration Summary

🔸 Outcome Type         : continuous
🔸 Group Relationship   : independent
🔸 Group Count          : two-sample
🔸 Distribution         : None
🔸 Variance Equal       : None
🔸 Tail Type            : two-tailed
🔸 Sample Size          : 100
🔸 Effect Size          : 0.5
🔸 Parametric           : None
🔸 Alpha                : 0.05
🔸 Test Name            : None
🔸 H 0                  : None
🔸 H A                  : None
In [23]:
validate_config(config)
✅ Config validated successfully.

🧪 Generate Data from Config¶

📐 Expected data structure (click to expand)

One-sample — one column: value

value
1.23
-0.45
2.10
0.67
1.88

Two-sample independent — columns: group, value

group value
A 0.52
A 1.11
B 1.03
B 0.89
A 0.41

Two-sample paired — columns: group_A, group_B (optional: id)

id group_A group_B
0 0.12 0.58
1 1.02 1.49
2 -0.33 0.21
3 0.77 1.22
4 0.45 0.91

Multi-sample independent — columns: group, value

group value
A 0.31
B 0.95
C 1.12
A 0.44
B 0.78

Column names must match exactly: value, group, group_A, group_B.

In [24]:
df = generate_data_from_config(config)
# df = pd.read_csv("hypothesis_testing_data.csv")
df
Out[24]:
group value
0 A -1.240633
1 A -1.470579
2 A 2.101191
3 A -1.464822
4 A 0.817922
... ... ...
195 B 0.816507
196 B 0.218070
197 B 0.151804
198 B 0.072379
199 B -0.048232

200 rows × 2 columns


Back to the top

📈 EDA¶

👥 Group Count from Data¶

📖 Group Count Check (Click to Expand)
🧠 What Does group_count Represent?¶

group_count determines the structural design of the hypothesis test:

  • One-sample → A single group compared to a reference value
  • Two-sample → Two groups being compared
  • Multi-sample → More than two independent groups

This controls which family of tests is eligible (t-test, ANOVA, etc.).


🔍 How Do We Infer It from Data?¶

The logic depends on the study structure:

✅ Independent Design¶

If the dataset contains a group column:

  • 1 unique group → one-sample
  • 2 unique groups → two-sample
  • More than 2 groups → multi-sample

If there is no group column, we assume a one-sample test.


🔁 Paired Design¶

If the dataset contains:

  • group_A and group_B columns

We treat this as:

→ two-sample (paired)

Paired multi-sample designs are not supported in this module.


⚠️ Why This Matters¶

The number of groups directly determines:

  • Whether we use t-test vs ANOVA
  • Whether variance assumptions are checked
  • Whether post-hoc testing might be required

Incorrect group structure leads to incorrect test selection.
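The rules above can be sketched in a few lines. This is a hypothetical reimplementation (`sketch_group_count` is not the `ht_utils` function, whose internals may differ):

```python
# Hypothetical sketch of the group-count rules described above.
import pandas as pd

def sketch_group_count(df: pd.DataFrame) -> str:
    cols = set(df.columns)
    if {"group_A", "group_B"} <= cols:   # paired layout → two-sample (paired)
        return "two-sample"
    if "group" in cols:
        k = df["group"].nunique()        # count distinct group labels
        if k == 1:
            return "one-sample"
        if k == 2:
            return "two-sample"
        return "multi-sample"
    return "one-sample"                  # no group column → one-sample

df_demo = pd.DataFrame({"group": ["A", "A", "B"], "value": [0.1, 0.2, 0.3]})
print(sketch_group_count(df_demo))  # two-sample
```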


In [25]:
config['group_count'] = infer_group_count_from_data(config, df)
🔄 Step: Infer Group Count from Dataset
📊 Detected 2 group(s) → group_count = 'two-sample'

📏 Sample Size from Data¶

📖 Sample Size Check (Click to Expand)
🧠 What Does sample_size Represent?¶

It depends on the study design:

  • One-sample test → Total number of observations
  • Two-sample (independent) → Per-group sample size
    • We use the minimum group size (conservative approach)
  • Two-sample (paired) → Number of paired observations
  • Multi-sample → Total number of rows (can be extended later)

⚖️ Why Use Minimum Group Size for Independent Tests?¶

If group sizes are unequal:

  • Many test assumptions behave best under balanced design.
  • Using the smallest group size is statistically conservative.
  • Prevents overstating effective sample size.
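A minimal sketch of the same rules (hypothetical; `infer_sample_size_from_data` in `ht_utils` may handle more cases):

```python
# Hypothetical sketch of the sample-size rules described above.
import pandas as pd

def sketch_sample_size(df: pd.DataFrame, group_relationship: str) -> int:
    cols = set(df.columns)
    if {"group_A", "group_B"} <= cols:                     # paired → number of pairs
        return len(df)
    if "group" in cols and group_relationship == "independent":
        return int(df.groupby("group").size().min())       # conservative: smallest group
    return len(df)                                         # one-sample / fallback: total rows

df_demo = pd.DataFrame({"group": ["A"] * 100 + ["B"] * 80,
                        "value": range(180)})
print(sketch_sample_size(df_demo, "independent"))  # 80
```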

In [26]:
config['sample_size'] = infer_sample_size_from_data(config, df)
📊 Synced sample_size → 100

📊 Outcome Type from Data¶

📖 Outcome Type Check (Click to Expand)
🧠 What Does outcome_type Represent?¶

outcome_type defines the nature of the dependent variable being tested.

It determines:

  • Which statistical tests are eligible
  • Whether normality assumptions apply
  • Whether variance equality is relevant
  • Whether parametric logic is meaningful

Correct classification is essential for correct test selection.


🔍 How Do We Infer It from Data?¶

We inspect:

  • Data type (dtype)
  • Number of unique values
  • Value patterns (e.g., only 0/1)

🟢 Binary¶

If values are only {0, 1} →
outcome_type = 'binary'

Used for:

  • Proportion tests
  • McNemar
  • Two-proportion z-test

🟣 Categorical¶

If values are strings or non-numeric categories →
outcome_type = 'categorical'

Used for:

  • Chi-square tests

🟡 Count¶

If values are:

  • Integers
  • Non-negative
  • More than two unique levels

→ outcome_type = 'count'

Used for:

  • Poisson-based tests

🔵 Continuous¶

If values are numeric (float or many unique numeric levels) →
outcome_type = 'continuous'

Used for:

  • t-tests
  • ANOVA
  • Non-parametric rank tests
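The classification rules above can be sketched as a single function (hypothetical; the real `infer_outcome_type_from_data` may treat edge cases differently):

```python
# Hypothetical sketch of the outcome-type rules described above.
import pandas as pd

def sketch_outcome_type(values: pd.Series) -> str:
    vals = values.dropna()
    if not pd.api.types.is_numeric_dtype(vals):
        return "categorical"                      # strings / non-numeric labels
    if set(vals.unique()) <= {0, 1}:
        return "binary"                           # only 0/1
    if (pd.api.types.is_integer_dtype(vals)
            and (vals >= 0).all()
            and vals.nunique() > 2):
        return "count"                            # non-negative integers, >2 levels
    return "continuous"                           # floats / many numeric levels

print(sketch_outcome_type(pd.Series([0, 1, 1, 0])))      # binary
print(sketch_outcome_type(pd.Series([0.5, 1.2, -0.3])))  # continuous
```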

In [27]:
config['outcome_type'] = infer_outcome_type_from_data(config, df)
🔄 Step: Infer Outcome Type from Dataset
📊 Inferred outcome_type → 'continuous'

🔍 Check Distribution (Normality)¶

📖 Normality Check (Click to Expand)
📘 Why Are We Checking for Normality?¶

Many hypothesis tests (like the t-test) rely on the assumption that the outcome variable is normally distributed.
This is particularly important when working with continuous outcome variables in small to moderate-sized samples.

🧪 Test Used: Shapiro-Wilk¶

The Shapiro-Wilk test checks whether the sample comes from a normal distribution.

  • Null Hypothesis (H₀): The data follows a normal distribution
  • Alternative Hypothesis (H₁): The data does not follow a normal distribution
🧠 Interpretation:¶
  • p > 0.05 → Fail to reject H₀ → The data is likely normally distributed ✅
  • p < 0.05 → Reject H₀ → The data is likely non-normal ⚠️

We check this per group, and only if the outcome variable is continuous.

❗Note:¶
  • No need to check normality for binary, categorical, or count data.
  • For paired tests, we assess normality on the differences between paired observations.
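A minimal per-group Shapiro-Wilk check, as described above. This is a sketch on synthetic data, not the notebook's `infer_distribution_from_data` (which adds logging and paired-difference handling):

```python
# Per-group Shapiro-Wilk normality check (sketch with synthetic data).
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
groups = {"A": rng.normal(0.0, 1.0, 100),
          "B": rng.normal(0.5, 1.0, 100)}

alpha = 0.05
all_normal = True
for name, x in groups.items():
    stat, p = stats.shapiro(x)
    all_normal &= p > alpha          # fail to reject H0 → treat group as normal
    print(f"Group {name}: Shapiro-Wilk p = {p:.4f}")

distribution = "normal" if all_normal else "non-normal"
print(distribution)
```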
In [28]:
config['distribution'] = infer_distribution_from_data(config, df)
🔍 Step: Infer Distribution of Outcome Variable
📘 Checking if the outcome variable follows a normal distribution
   Using Shapiro-Wilk Test
   H₀: Data comes from a normal distribution
   H₁: Data does NOT come from a normal distribution

• Two-sample (independent) case → testing both groups
• Group A → Shapiro-Wilk p = 0.4534 → Fail to reject H₀ ✅ (likely a normal distribution)
• Group B → Shapiro-Wilk p = 0.7145 → Fail to reject H₀ ✅ (likely a normal distribution)
✅ Both groups are likely drawn from normal distributions
📦 Final Decision → config['distribution'] = `normal`
📖 Normality Check using KS Test (Click to Expand)
📘 Why Use the Kolmogorov–Smirnov Test?¶

The Kolmogorov–Smirnov (KS) test is another method to assess whether a sample follows a specified distribution — in this case, a normal distribution.

Unlike Shapiro-Wilk (which is specifically designed for normality),
the KS test compares the sample distribution to a theoretical normal distribution fitted using the sample’s mean and standard deviation.

🧪 Test Used: Kolmogorov–Smirnov (KS)¶

The KS test evaluates the maximum distance between:

  • The empirical distribution function (EDF) of the sample

  • The cumulative distribution function (CDF) of the fitted normal distribution

  • Null Hypothesis (H₀): The data follows a normal distribution

  • Alternative Hypothesis (H₁): The data does not follow a normal distribution

🧠 Interpretation:¶
  • p > 0.05 → Fail to reject H₀ → The data is likely normally distributed ✅
  • p < 0.05 → Reject H₀ → The data is likely non-normal ⚠️

We apply this test per group, and only if the outcome variable is continuous.

⚠️ Important Notes:¶
  • The KS test assumes the comparison distribution is fully specified.
    Since we estimate mean and standard deviation from the data, this is technically an approximation.
  • KS is generally less powerful than Shapiro-Wilk for detecting departures from normality in small samples.
  • For large samples, even minor deviations from normality may become statistically significant.
  • Visual diagnostics (like Q-Q plots) should complement formal tests.
❗Reminder:¶
  • No need to check normality for binary, categorical, or count data.
  • For paired tests, assess normality on the differences between paired observations.
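The KS variant described above, on synthetic data. Note the caveat: fitting the mean and standard deviation from the sample makes this an approximation. A sketch, not the notebook's `infer_distribution_from_data_ks`:

```python
# KS test against a normal distribution fitted from the sample (sketch).
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
x = rng.normal(loc=0.0, scale=1.0, size=100)

mu, sigma = x.mean(), x.std(ddof=1)
stat, p = stats.kstest(x, "norm", args=(mu, sigma))  # H0: sample ~ N(mu, sigma)
print(f"KS statistic = {stat:.4f}, p = {p:.4f}")
verdict = "normal" if p > 0.05 else "non-normal"
```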
In [29]:
# config['distribution'] = infer_distribution_from_data_ks(config, df)
# pretty_json(config_2)
# print_config_summary(config_2)
📖 Q-Q Plot for Normality (Click to Expand)
📘 Why Use a Q-Q Plot?¶

A Q-Q (Quantile–Quantile) plot visually compares the distribution of your sample data
to a theoretical normal distribution.

It helps identify:

  • Skewness
  • Heavy tails
  • Outliers
  • Systematic deviations from normality
🧪 How It Works¶

The plot compares:

  • Observed quantiles (from your data)
  • Expected quantiles (from a normal distribution)

If the data is normally distributed, the points should fall approximately along a straight line.

🧠 Interpretation:¶
  • Points closely follow the line → Data is likely normal ✅
  • Systematic curvature or tail deviations → Data may be non-normal ⚠️
❗Why Use This Alongside Statistical Tests?¶
  • Formal tests (Shapiro, KS) can be overly sensitive in large samples.
  • Q-Q plots help understand where deviations occur.
  • Mild deviations may not meaningfully impact parametric tests.

For paired tests, assess the distribution of differences.
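A quick Q-Q plot using `scipy.stats.probplot`, which also reports the correlation of the points with the reference line. This is a sketch on synthetic data; `qq_plot_normality` in `ht_utils` handles groups and paired differences:

```python
# Q-Q plot against a normal reference line (sketch with synthetic data).
import numpy as np
import matplotlib
matplotlib.use("Agg")                      # render off-screen
import matplotlib.pyplot as plt
from scipy import stats

rng = np.random.default_rng(1995)
x = rng.normal(size=100)

fig, ax = plt.subplots()
(osm, osr), (slope, intercept, r) = stats.probplot(x, dist="norm", plot=ax)
ax.set_title(f"Q-Q plot (line fit r = {r:.3f})")   # r near 1 → points hug the line
plt.close(fig)
print(f"Correlation with the reference line: r = {r:.3f}")
```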

In [30]:
qq_plot_normality(config, df)
📊 Step: Visual Normality Check using Q-Q Plot
If points fall approximately along the straight line → data is likely normal.

(Q-Q plot rendered here)

📏 Check Variance Equality¶

📖 Equal Variance Check (Click to Expand)
📘 Why Are We Checking for Equal Variance?¶

When comparing two independent groups using a parametric test (like a two-sample t-test),
we assume that both groups have equal variances — this is called the homogeneity of variance assumption.

Failing to meet this assumption can lead to incorrect conclusions if the wrong test is applied.


🧪 Test Used: Levene’s Test¶

Levene’s Test checks whether the spread (variance) of values is roughly the same in both groups.

  • Null Hypothesis (H₀): Variance in Group A = Variance in Group B
  • Alternative Hypothesis (H₁): Variances are different between the groups

🧠 Interpretation:¶
  • p > 0.05 → Fail to reject H₀ → Variances are likely equal ✅
  • p < 0.05 → Reject H₀ → Variances are likely unequal ⚠️

✅ When Should You Check This?¶
  • ✔️ Check only when:

    • You’re comparing two groups
    • The groups are independent
    • The outcome is continuous
  • ❌ Don’t check if:

    • The test is one-sample
    • The groups are paired (since variance of differences is what matters)
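Levene's test as described above, sketched on synthetic data (not the notebook's `infer_variance_equality`):

```python
# Levene's test for equal variance between two groups (sketch).
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.5, 1.0, 100)

stat, p = stats.levene(a, b, center="median")   # median-centered → robust version
variance_equal = "equal" if p > 0.05 else "unequal"
print(f"Levene statistic = {stat:.4f}, p = {p:.4f} → {variance_equal}")
```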
In [31]:
config['variance_equal'] = infer_variance_equality(config, df)
📏 **Step: Infer Equality of Variance Across Groups**
📘 We're checking if the spread (variance) of the outcome variable is similar across groups A and B.
   This is important for choosing between a **pooled t-test** vs **Welch’s t-test**.
🔬 Test Used: Levene’s Test for Equal Variance
   H₀: Variance in Group A = Variance in Group B
   H₁: Variances are different

📊 Levene’s Test Result:
• Test Statistic = 1.0751
• p-value        = 0.3011
✅ Fail to reject H₀ → Variances appear equal across groups

📦 Final Decision → config['variance_equal'] = `equal`

📏 Alternative Variance Tests (FYI)¶

📖 Other Tests for Equal Variance (Click to Expand)
🧪 1️⃣ Bartlett’s Test¶
  • Assumes data is normally distributed
  • More powerful than Levene’s under strict normality
  • Sensitive to non-normal data

Use when:

  • Normality assumption is strongly satisfied.

🧪 2️⃣ Fligner–Killeen Test¶
  • Non-parametric
  • Does not assume normality
  • More robust to skewness and outliers

Use when:

  • Data is clearly non-normal
  • You want a distribution-free alternative.

📝 Note¶

Levene’s test is generally preferred in practice because it is more robust to non-normality.
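Both alternatives are available in scipy; here they are side by side on synthetic groups with deliberately unequal spread (an illustrative sketch):

```python
# Bartlett vs Fligner-Killeen on groups with clearly different variance.
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.0, 2.0, 100)          # deliberately wider spread

bt_stat, bt_p = stats.bartlett(a, b)   # assumes normality
fk_stat, fk_p = stats.fligner(a, b)    # distribution-free
print(f"Bartlett p = {bt_p:.4f}, Fligner-Killeen p = {fk_p:.4f}")
```

Both should flag the unequal spread here; they diverge mainly when the data is non-normal, where Bartlett over-rejects.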


🔍 Visual Check¶

📖 Distribution (Click to Expand)
In [32]:
visualize_distribution(config, df)
visualize_variance_boxplot_annotated(config, df)
📊 Step: Visual Distribution Overview (Side-by-Side)

(side-by-side distribution plots rendered here)
📊 Visual Check: Spread Comparison Between Groups
   (Spread = how much values vary within each group)

📋 Spread Summary:
         std_dev  variance    median       IQR
group
A       1.070873  1.146769 -0.146503  1.562018
B       0.964252  0.929781  0.317251  1.363176
(annotated boxplot rendered here)
🧠 Business Interpretation:
✅ The spread of values across groups is very similar.
   Variability does not appear meaningfully different.

Back to the top

🛠️ Test Setup¶

📖 Test Settings Explanation (Click to Expand)
📊 Test Type (test_type)¶

This setting defines the type of test you want to perform.

  • one_sample: Comparing the sample mean against a known value (e.g., a population mean).
  • two_sample: Comparing the means of two independent groups (e.g., A vs B).
  • paired: Comparing means from the same group at two different times (before vs after).
  • proportions: Comparing proportions (e.g., the conversion rates of two groups).

Example: You might want to test if the mean age of two groups of people (Group A and Group B) differs, or if the proportion of people who converted in each group is different.

📏 Tail Type (tail_type)¶

This setting determines whether you are performing a one-tailed or two-tailed test.

  • one_tailed: You are testing if the value is greater than or less than the reference value (directional).
  • two_tailed: You are testing if the value is different from the reference value, either higher or lower (non-directional).

Example:

  • One-tailed: Testing if new treatment increases sales (you only care if it's greater).
  • Two-tailed: Testing if there is any difference in sales between two treatments (it could be either an increase or decrease).
🧮 Parametric (parametric)¶

This setting indicates whether the test is parametric or non-parametric.

  • True (Parametric): This means we assume that the data follows a certain distribution, often a normal distribution. The most common parametric tests are t-tests and z-tests. Parametric tests are generally more powerful if the assumptions are met.
  • False (Non-Parametric): Non-parametric tests don’t assume any specific distribution. These are used when the data doesn’t follow a normal distribution or when the sample size is small. Examples include Mann-Whitney U (alternative to the t-test) and Wilcoxon Signed-Rank (alternative to paired t-test).

Why does this matter?
Parametric tests tend to be more powerful because they make assumptions about the distribution of the data (e.g., normality). Non-parametric tests are more flexible and can be used when these assumptions are not met, but they may be less powerful.

📊 Equal Variance (equal_variance)¶

This setting is used specifically for two-sample t-tests.

  • True: Assumes that the two groups have equal variances (i.e., the spread of data is the same in both groups). This is used for the pooled t-test.
  • False: Assumes the two groups have different variances. This is used for the Welch t-test, which is more robust when the assumption of equal variances is violated.

Why is this important?
If the variances are not equal, using a pooled t-test (which assumes equal variance) can lead to incorrect conclusions. The Welch t-test is safer when in doubt about the equality of variances.
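In scipy, this switch maps directly onto the `equal_var` argument of `ttest_ind` (pooled when `True`, Welch when `False`). A sketch on synthetic data; with equal group sizes the two statistics coincide, though the degrees of freedom (and hence p-values) differ:

```python
# Pooled vs Welch's two-sample t-test via scipy's equal_var flag.
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.5, 1.0, 100)

t_pooled, p_pooled = stats.ttest_ind(a, b, equal_var=True)    # pooled t-test
t_welch,  p_welch  = stats.ttest_ind(a, b, equal_var=False)   # Welch's t-test
print(f"pooled: t = {t_pooled:.3f}, p = {p_pooled:.4f}")
print(f"Welch : t = {t_welch:.3f}, p = {p_welch:.4f}")
```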

🔑 Significance Level (alpha)¶

The alpha level is your threshold for statistical significance.

  • Commonly set at 0.05, this means that you are willing to accept a 5% chance of wrongly rejecting the null hypothesis (i.e., a 5% chance of a Type I error).
  • If the p-value (calculated from your test) is less than alpha, you reject the null hypothesis. If it's greater than alpha, you fail to reject the null hypothesis.

Example:

  • alpha = 0.05 means there’s a 5% risk of concluding that a treatment has an effect when it actually doesn’t.
🎯 Putting It All Together¶

For instance, let's say you're testing if a new feature (Group A) increases user engagement compared to the existing feature (Group B). Here’s how each configuration works together:

  • test_type = 'two_sample': You're comparing two independent groups (A vs B).
  • tail_type = 'two_tailed': You’re testing if there’s any difference (increase or decrease) in engagement.
  • parametric = True: You assume the data is normally distributed, so a t-test will be appropriate.
  • equal_variance = True: You assume the two groups have equal variance, so you’ll use a pooled t-test.
  • alpha = 0.05: You’re using a 5% significance level for your hypothesis test.

📏 Infer Parametric Flag¶

📖 Parametric vs Non-Parametric (Click to Expand)
📘 What Does "Parametric" Mean?¶

A parametric test assumes that the data follows a known distribution — typically a normal distribution.

These tests also often assume:

  • Equal variances between groups (for two-sample cases)
  • Independent samples

When those assumptions are met, parametric tests are more powerful (i.e., they detect real effects more easily).


🔁 What Happens If Assumptions Don’t Hold?¶

You should use a non-parametric test — these don’t rely on strong distributional assumptions and are more robust, especially for small sample sizes or skewed data.

Examples:

| Parametric Test | Non-Parametric Alternative |
| --- | --- |
| Two-sample t-test | Mann-Whitney U test |
| Paired t-test | Wilcoxon Signed-Rank test |
| ANOVA | Kruskal-Wallis test |

🧠 How We Decide Here¶

In our pipeline, a test is parametric only if:

  • The outcome variable is continuous
  • The data is normally distributed
  • The variance is equal, if applicable (or marked "NA" for paired designs)

If these aren’t all true, we default to a non-parametric test.
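The core of that rule fits in a few lines. This is a hypothetical sketch keyed off the same config fields; the notebook's `infer_parametric_flag` also consults `variance_equal` where applicable:

```python
# Hypothetical sketch of the parametric / non-parametric decision above.
def sketch_parametric_flag(config: dict) -> bool:
    # Continuous outcome + normal distribution → parametric;
    # anything else falls back to a non-parametric test.
    return (config.get("outcome_type") == "continuous"
            and config.get("distribution") == "normal")

print(sketch_parametric_flag({"outcome_type": "continuous",
                              "distribution": "normal"}))       # True
print(sketch_parametric_flag({"outcome_type": "continuous",
                              "distribution": "non-normal"}))   # False
```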

In [33]:
config['parametric'] = infer_parametric_flag(config)
📏 Step: Decide Between Parametric vs Non-Parametric Approach
🔍 Distribution of outcome = `normal`
✅ Normal distribution → Proceeding with a parametric test

📦 Final Decision → config['parametric'] = `True`

🧪 Validate Configuration Dictionary¶

In [34]:
# pretty_json(config)
print_config_summary(config)
📋 Hypothesis Test Configuration Summary

🔸 Outcome Type         : continuous
🔸 Group Relationship   : independent
🔸 Group Count          : two-sample
🔸 Distribution         : normal
🔸 Variance Equal       : equal
🔸 Tail Type            : two-tailed
🔸 Sample Size          : 100
🔸 Effect Size          : 0.5
🔸 Parametric           : True
🔸 Alpha                : 0.05
🔸 Test Name            : None
🔸 H 0                  : None
🔸 H A                  : None
In [35]:
validate_config(config)
✅ Config validated successfully.

Back to the top

🧪 Hypothesis Testing¶

🧭 Determine Test To Run¶

📖 How We Select the Right Statistical Test (Click to Expand)
🧠 What Are We Doing Here?¶

Based on all the configuration values you’ve either set or inferred (outcome_type, group_relationship, distribution, etc),
we determine which statistical test is most appropriate for your hypothesis.

This is the decision engine of the pipeline.


⚙️ How the Logic Works¶

We go through structured rules based on:

| Config Field | What it Affects |
| --- | --- |
| outcome_type | Binary / continuous / categorical |
| group_count | One-sample / two-sample / multi |
| group_relationship | Independent or paired |
| distribution | Normal or non-normal |
| variance_equal | Determines pooled vs Welch’s t-test |
| parametric | Whether to use parametric approach |

🧪 Example Mappings:¶
| Scenario | Selected Test |
| --- | --- |
| Continuous, 2 groups, normal, equal variance | Two-sample t-test (pooled) |
| Continuous, 2 groups, non-normal | Mann-Whitney U |
| Binary, 2 groups, independent | Proportions z-test |
| Continuous, paired, non-normal | Wilcoxon Signed-Rank |
| 3+ groups, categorical outcome | Chi-square test |
📖 Test Selection Matrix (Click to Expand)
| # | 💼 Example Business Problem | 📊 Outcome Variable Type | 📈 Outcome Distribution | 👥 Group Count | 🔗 Groups Type | ✅ Recommended Test | 📝 Notes |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Is average order value different from $50? | Continuous | Normal | One-Sample | Not Applicable | One-sample t-test | - |
| 2 | Do users who saw recs spend more time on site? | Continuous | Normal | Two-Sample | Independent | Two-sample t-test | Use if variances are equal; Welch’s t-test if not |
| 3 | Did users spend more after redesign? | Continuous | Normal | Two-Sample | Paired | Paired t-test | Use only if paired differences are roughly Normal |
| 4 | Does time spent differ across A/B/C? | Continuous | Normal | Multi-Sample (3+) | Independent | ANOVA | Use Welch ANOVA if group variances differ |
| 5 | Is average order value different from $50 (skewed)? | Continuous | Non-Normal | One-Sample | Not Applicable | Wilcoxon Signed-Rank Test | Use when normality is violated; tests median. Sign Test is an alternative with fewer assumptions |
| 6 | Is revenue different between coupon A vs B? | Continuous | Non-Normal | Two-Sample | Independent | Mann-Whitney U test | Use when data is skewed or has outliers |
| 7 | Did time on site change (skewed)? | Continuous | Non-Normal | Two-Sample | Paired | Wilcoxon signed-rank test | For paired non-normal distributions |
| 8 | Does spend differ across segments? | Continuous | Non-Normal | Multi-Sample (3+) | Independent | Kruskal-Wallis test | Non-parametric version of ANOVA |
| 9 | Is conversion rate different from 10%? | Binary | Not Applicable | One-Sample | Not Applicable | One-proportion z-test | Use binomial exact test if sample size is small |
| 10 | Does new CTA improve conversion? | Binary | Not Applicable | Two-Sample | Independent | Proportions z-test | Use when counts are raw; chi-square for independence. Fisher’s Exact if expected counts are low |
| 11 | Do users convert more after badges? | Binary | Not Applicable | Two-Sample | Paired | McNemar’s test | Used for 2×2 paired binary outcomes |
| 12 | Do plan choices differ across layout options? | Categorical | Not Applicable | Multi-Sample (3+) | Independent | Chi-square test | Requires expected frequency ≥5 in each cell. Use Fisher’s Exact if assumption fails |
| 13 | Do users add more items to cart? | Count | Poisson | Two-Sample | Independent | Poisson / NB test | Use Negative Binomial if variance > mean |
| 14 | Is effect still significant after adjusting for device & region? | Any | Not Applicable | Any | Any | Regression (linear / logistic) | Use to control for covariates / confounders |
| 15 | What’s the probability that B beats A? | Any | Not Applicable | Two-Sample | Any | Bayesian A/B test | Posterior probability; no p-value |
| 16 | Is observed lift statistically rare? | Any | Not Applicable | Two-Sample | Any | Permutation / Bootstrap | Use when parametric assumptions are violated |
📖 Test Selection FlowChart (Click to Expand)
[What is your outcome variable type?]  
  |
  +--> 📏 Continuous
  |     |
  |     +--> Is the outcome distribution normal?
  |           |
  |           +--> ✅ Yes
  |           |     |
  |           |     +--> 👥 Group Count = One-Sample ----------> 🧪 One-sample t-test
  |           |     +--> 👥 Group Count = Two-Sample
  |           |     |           +-->  🔗 Groups Type = Independent
  |           |     |           |            |
  |           |     |           |            +--> Are variances equal?
  |           |     |           |            |
  |           |     |           |            +--> ✅ Yes ------> 🧪 Two-sample t-test (pooled)
  |           |     |           |            +--> ❌ No  ------> 🧪 Welch’s t-test
  |           |     |           +--> 🔗 Groups Type = Paired --> 🧪 Paired t-test
  |           |     +--> 👥 Group Count = Multi-Sample (3+)
  |           |                              |
  |           |                              +--> Are variances equal?
  |           |                              |
  |           |                              +--> ✅ Yes ------> 🧪 ANOVA
  |           |                              +--> ❌ No  ------> 🧪 Welch ANOVA
  |           |
  |           +--> ❌ No (Non-Normal)
  |                 |
  |                 +--> 👥 Group Count = One-Sample ----------> 🧪 Wilcoxon Signed-Rank Test
  |                 +--> 👥 Group Count = Two-Sample
  |                 |     +--> 🔗 Groups Type = Independent ---> 🧪 Mann-Whitney U Test
  |                 |     +--> 🔗 Groups Type = Paired --------> 🧪 Wilcoxon Signed-Rank Test
  |                 +--> 👥 Group Count = Multi-Sample (3+) ---> 🧪 Kruskal-Wallis Test
  |
  +--> ⚪ Binary
  |     |
  |     +--> 👥 Group Count = One-Sample -----------------------> 🧪 One-proportion z-test
  |     +--> 👥 Group Count = Two-Sample
  |     |     +--> 🔗 Groups Type = Independent ---------------> 🧪 Proportions z-test
  |     |     +--> 🔗 Groups Type = Paired --------------------> 🧪 McNemar’s Test
  |
  +--> 🟪 Categorical
  |     |
  |     +--> 👥 Group Count = Multi-Sample (3+) ---------------> 🧪 Chi-square Test
  |
  +--> 🔢 Count
  |     |
  |     +--> Distribution = Poisson
  |           +--> 👥 Group Count = Two-Sample ---------------> 🧪 Poisson or Negative Binomial Test
  |
  +--> 🧠 Any
        |
        +--> Want to control for covariates? -------------> 📉 Regression (Linear or Logistic)
        +--> Prefer probability over p-values? -----------> 📊 Bayesian A/B Test
        +--> Assumptions violated / custom metric? -------> 🔁 Permutation or Bootstrap
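The continuous branch of the flowchart can be sketched as a small decision function. This is a hypothetical condensed version: the test-name strings `two_sample_ttest_pooled` and `two_sample_ttest_welch` match the notebook's output, the other names and the function itself are illustrative, and `determine_test_to_run` covers all branches:

```python
# Hypothetical sketch of the decision engine (continuous branch only).
def sketch_determine_test(config: dict) -> str:
    if config["outcome_type"] != "continuous":
        return "see_full_matrix"            # binary/categorical/count paths omitted
    normal = config["distribution"] == "normal"
    count = config["group_count"]
    if count == "one-sample":
        return "one_sample_ttest" if normal else "wilcoxon_signed_rank"
    if count == "two-sample":
        if config["group_relationship"] == "paired":
            return "paired_ttest" if normal else "wilcoxon_signed_rank"
        if not normal:
            return "mann_whitney_u"
        return ("two_sample_ttest_pooled"
                if config["variance_equal"] == "equal"
                else "two_sample_ttest_welch")
    return "anova" if normal else "kruskal_wallis"   # multi-sample

demo = {"outcome_type": "continuous", "group_count": "two-sample",
        "group_relationship": "independent", "distribution": "normal",
        "variance_equal": "equal"}
print(sketch_determine_test(demo))  # two_sample_ttest_pooled
```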
In [36]:
config['test_name'] = determine_test_to_run(config)
🧭 Step: Determine Which Statistical Test to Use
📦 Inputs:
• Outcome Type         = `continuous`
• Group Count          = `two-sample`
• Group Relationship   = `independent`
• Distribution         = `normal`
• Equal Variance       = `equal`
• Parametric Flag      = `True`

🔍 Matching against known test cases...

✅ Selected Test: `two_sample_ttest_pooled`

🧠 Print Hypothesis Statement¶

📖 Hypothesis Structure & Interpretation (Click to Expand)
📘 What Are We Doing Here?¶

This step generates a formal hypothesis statement for the selected test.

Every statistical test is built around two competing ideas:

  • H₀ (Null Hypothesis) → The “status quo”. There is no effect, no difference, or no association.
  • H₁ (Alternative Hypothesis) → There is an effect, difference, or relationship.

🧪 Examples of Hypothesis Pairs¶
| Test Type | Null Hypothesis (H₀) | Alternative (H₁) |
| --- | --- | --- |
| One-sample t-test | The average value equals the reference value | The average value is different (or higher / lower) |
| Two-sample t-test | The average outcome is the same in both groups | The average outcome differs between the groups |
| Welch’s t-test | The average outcome is the same in both groups | The average outcome differs between the groups |
| Paired t-test | The average change between before and after is zero | The average change is not zero |
| ANOVA | The average outcome is the same across all groups | At least one group has a different average outcome |
| Welch ANOVA | The average outcome is the same across all groups | At least one group has a different average outcome |
| Wilcoxon Signed-Rank Test | The typical value equals the reference (or no typical change observed) | The typical value differs (or typical change exists) |
| Mann-Whitney U Test | The outcome pattern is the same in both groups | One group generally has higher values than the other |
| Kruskal-Wallis Test | The outcome pattern is the same across all groups | At least one group differs in outcome pattern |
| One-proportion z-test | The conversion rate equals the reference rate | The conversion rate differs from the reference rate |
| Proportions z-test | The conversion rate is the same in both groups | The conversion rate differs between groups |
| McNemar’s test | The proportion of “Yes” responses is the same before and after | The proportion changes after the intervention |
| Chi-square test | Category preferences are the same across groups | Category preferences differ across groups |
| Poisson test | The average event rate is the same in both groups | The event rate differs between groups |
| Negative Binomial test | The average event rate is the same in both groups | The event rate differs between groups |
| Regression (linear/logistic) | The variable being tested has no meaningful impact on the outcome | The variable being tested impacts the outcome |
| Bayesian A/B test | Version B is not more likely to outperform Version A | Version B is more likely to outperform Version A |
| Permutation / Bootstrap | The observed difference is consistent with random chance | The observed difference is unlikely due to random chance |
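Mechanically, this step is a lookup from the selected test name to an (H₀, H₁) pair. A hypothetical two-entry sketch (the first pair matches the notebook's output; `print_hypothesis_statement` holds the full mapping and tail-type handling):

```python
# Hypothetical sketch: map test name → (H0, H1) statement pair.
HYPOTHESES = {
    "two_sample_ttest_pooled": (
        "The outcome (mean/proportion) is the same across groups A and B.",
        "The outcome differs between groups."),
    "mann_whitney_u": (
        "The outcome pattern is the same in both groups.",
        "One group generally has higher values than the other."),
}

h0, h1 = HYPOTHESES["two_sample_ttest_pooled"]
print("H0:", h0)
print("H1:", h1)
```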

In [37]:
config['H_0'], config['H_a'] = print_hypothesis_statement(config)
🧠 Step: Generate Hypothesis Statement
🔍 Selected Test        : `two_sample_ttest_pooled`
🔍 Tail Type            : `two-tailed`

📜 Hypothesis Statement:
• H₀: The outcome (mean/proportion) is the same across groups A and B.
• H₁: The outcome differs between groups.

🧪 Run Hypothesis Test¶

📖 Running the Hypothesis Test (Click to Expand)
🧠 What Happens in This Step?¶

This function takes in your final config + dataset and executes the appropriate test — outputting:

  • The test statistic (e.g., t, z, chi², U, F)
  • The p-value
  • Whether the result is statistically significant
🧪 Interpreting the Output¶
| Field | What It Means |
| --- | --- |
| statistic | The test result (e.g., t-score, chi-square, etc.) |
| p_value | Probability of a result at least this extreme by chance, assuming H₀ is true |
| significant | True if p < alpha (reject H₀), else False |
| alpha | The pre-defined significance threshold (typically 0.05) |
📏 Significance Logic¶
  • If p < alpha → reject the null hypothesis
  • If p ≥ alpha → fail to reject the null
⚠️ Robustness¶

The function handles different test types:

  • Parametric (e.g., t-tests, ANOVA)
  • Non-parametric (e.g., Wilcoxon, Mann-Whitney)
  • Binary proportions (e.g., z-test, McNemar)
  • Multi-group (e.g., ANOVA, chi-square)
  • A graceful test_not_found fallback for unmatched configurations
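The significance logic itself is the same for every test. A sketch wrapping it around a pooled t-test on synthetic data (the `result` field names follow the table above; `run_hypothesis_test` adds test dispatch and reporting):

```python
# Significance decision around a single test's statistic and p-value (sketch).
import numpy as np
from scipy import stats

rng = np.random.default_rng(1995)
a = rng.normal(0.0, 1.0, 100)
b = rng.normal(0.5, 1.0, 100)

alpha = 0.05
stat, p_value = stats.ttest_ind(a, b, equal_var=True)   # pooled two-sample t-test
result = {"statistic": round(float(stat), 4),
          "p_value": round(float(p_value), 4),
          "significant": bool(p_value < alpha),          # reject H0 iff p < alpha
          "alpha": alpha}
print(result)
```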
In [38]:
_ = run_hypothesis_test(config, df)
📜 Hypothesis Being Tested:
• H₀: The outcome (mean/proportion) is the same across groups A and B.
• H₁: The outcome differs between groups.


🧪 Step: Run Hypothesis Test
✅ Selected Test        : `two_sample_ttest_pooled`
🔍 Significance Threshold (α) : 0.05

🚀 Executing statistical test...

📊 Test Summary: Two Sample Ttest Pooled

🧪 Technical Result
• Test Statistic (t-statistic) = -3.0942
• P-value            = 0.0023
• Alpha (α)          = 0.05
(test result plot rendered here)
• Conclusion         = ✅ Statistically significant → Reject H₀

📈 Interpretation
• The observed difference is unlikely due to random variation.

💼 Business Insight
• Group A mean = -0.08
• Group B mean = 0.36
• Lift = 0.45 (-528.26%)
🏆 Group B outperforms Group A — and the effect is statistically significant.

Back to the top