Table of Contents
- Sample Data
- Experiment Setup
- Other Experiment Parameters
- Central Control Panel
- Simple Randomization
- Stratified Sampling
- Block Randomization
- Match Pair Randomization
- Cluster Randomization
- CUPED
- Network Effects
- Outcome Similarity Test
- Sample Ratio Mismatch
- AA Test Visualization
- Type I Error Simulation
- Setup Inputs + Config
- Baseline Estimation from Data
- Minimum Detectable Effect
- Required Sample Size
- Power Analysis Summary
- Summaries
- Visualization
- 95% Confidence Intervals
- Lift Analysis
- Final Conclusion
- Segmented Lift
- Guardrail Metrics
- Correcting for Multiple Comparisons
- Novelty Effects
- Primacy Effect
- Rollout Simulation
- A/B Test Holdouts
- AB Limits
Data Setup
Sample Data
# Display Settings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML
# Set Seed
my_seed=1995
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy import stats
from scipy.stats import (
ttest_ind,
ttest_rel,
chi2_contingency,
mannwhitneyu,
levene,
shapiro
)
import statsmodels.api as sm
from statsmodels.stats.power import (
TTestIndPower,
TTestPower,
FTestAnovaPower,
NormalIndPower
)
from sklearn.model_selection import train_test_split
observations_count = 1000
np.random.seed(my_seed) # For reproducibility
users = pd.DataFrame({
# identifiers
'user_id': range(1, observations_count+1),
# segmentation features
'platform': np.random.choice(['iOS', 'Android'], size=observations_count, p=[0.6, 0.4]), # 60% iOS, 40% Android
'device_type': np.random.choice(['mobile', 'desktop'], size=observations_count, p=[0.7, 0.3]),
'user_tier': np.random.choice(['new', 'returning'], size=observations_count, p=[0.4, 0.6]),
'region': np.random.choice(['North', 'South', 'East', 'West'], size=observations_count, p=[0.25, 0.25, 0.25, 0.25]),
'plan_type': np.random.choice(['basic', 'premium', 'pro'], size=observations_count, p=[0.6, 0.3, 0.1]), # 60% basic, 30% premium, 10% pro
'city': np.random.choice(['ny', 'sf', 'chicago', 'austin'], size=observations_count),
# outcome metrics
'engagement_score': np.random.normal(50, 15, observations_count), # Simulated user engagement scores
'converted': np.random.binomial(n=1, p=0.1, size=observations_count), # Simulated binary conversion: ~10% baseline
'past_purchase_count': np.random.normal(loc=50, scale=10, size=observations_count), # pre_experiment_metric for CUPED randomization
'bounce_rate': np.nan # will be simulated later
})
# Simulate a guardrail metric (bounce_rate)
np.random.seed(my_seed)
users['bounce_rate'] = np.where(
users['converted'] == 1,
np.random.normal(loc=0.2, scale=0.05, size=observations_count),
np.random.normal(loc=0.6, scale=0.10, size=observations_count)
)
users['bounce_rate'] = users['bounce_rate'].clip(0, 1) # Bound bounce_rate between 0 and 1
users
 | user_id | platform | device_type | user_tier | region | plan_type | city | engagement_score | converted | past_purchase_count | bounce_rate |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | iOS | mobile | new | North | premium | austin | 53.437537 | 0 | 50.653869 | 0.660153 |
1 | 2 | iOS | mobile | returning | North | basic | ny | 48.924673 | 1 | 26.451597 | 0.126471 |
2 | 3 | iOS | desktop | returning | South | premium | austin | 80.179294 | 0 | 43.112520 | 0.552955 |
3 | 4 | iOS | mobile | new | North | premium | austin | 40.441478 | 0 | 48.339368 | 0.665883 |
4 | 5 | Android | mobile | returning | West | basic | chicago | 54.171571 | 0 | 56.578205 | 0.503212 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 996 | iOS | desktop | returning | North | basic | sf | 35.814776 | 0 | 55.156336 | 0.722611 |
996 | 997 | iOS | mobile | new | South | pro | ny | 35.693639 | 0 | 48.434500 | 0.522644 |
997 | 998 | Android | desktop | returning | South | premium | chicago | 33.913119 | 0 | 30.591967 | 0.644316 |
998 | 999 | iOS | mobile | new | West | premium | sf | 40.789684 | 0 | 44.760253 | 0.540618 |
999 | 1000 | iOS | mobile | returning | South | basic | austin | 71.341543 | 0 | 30.985932 | 0.542153 |
1000 rows × 11 columns
Experiment Setup
# 1. Main outcome variable you're testing
outcome_metric_col = 'engagement_score'
# 2. Metric type: 'binary', 'continuous', or 'categorical'
outcome_metric_datatype = 'continuous'
# 3. Group assignment (to be generated)
group_labels = ('control', 'treatment')
# 4. Experimental design variant: independent or paired
variant = 'independent' # Options: 'independent' (supported), 'paired' (not supported yet)
# 5. Optional: Unique identifier for each observation (can be user_id, session_id, etc.)
observation_id_col = 'user_id'
# 6. Optional: Pre-experiment metric for CUPED, if used
pre_experiment_metric = 'past_purchase_count' # Can be None
Other Experiment Parameters
# Number of groups in the experiment (e.g., 2 for A/B test, 3 for A/B/C test)
group_count = len(group_labels)
# Column name used to store assigned group after randomization
group_col = 'group'
# Randomization method to assign users to groups
# Options: 'simple', 'stratified', 'block', 'matched_pair', 'cluster', 'cuped'
randomization_method = "simple"
Central Control Panel
test_config = {
# Core experiment setup
'outcome_metric_col' : outcome_metric_col, # Main metric to analyze (e.g., 'engagement_score')
'outcome_metric_datatype': outcome_metric_datatype, # One of: 'binary', 'continuous', 'categorical'
'group_labels' : group_labels, # Tuple of (control, treatment) group names
'group_count' : group_count, # Number of groups (usually 2 for A/B tests)
'variant' : variant, # 'independent' or 'paired'
'observation_id_col' : observation_id_col, # Unique identifier for each observation
'pre_experiment_metric' : pre_experiment_metric, # Used for CUPED adjustment (if any)
# Diagnostic results — filled after EDA/assumptions check
'normality' : None, # Will be set based on Shapiro-Wilk or visual tests
'equal_variance' : None, # Will be set using Levene's/Bartlett's test
'family' : None # Test family — 'z_test', 't_test', 'anova', 'chi_square', etc.
}
from IPython.display import HTML
display(HTML(f"<pre style='color:teal; font-size:14px;'>{json.dumps(test_config, indent=4)}</pre>"))
{ "outcome_metric_col": "engagement_score", "outcome_metric_datatype": "continuous", "group_labels": [ "control", "treatment" ], "group_count": 2, "variant": "independent", "observation_id_col": "user_id", "pre_experiment_metric": "past_purchase_count", "normality": null, "equal_variance": null, "family": null }
Randomization Methods
Click to Expand
Randomization is used to ensure that observed differences in outcome metrics are due to the experiment, not pre-existing differences.
- Prevents selection bias (e.g., users self-selecting into groups)
- Balances confounding factors like platform, region, or past behavior
- Enables valid inference through statistical testing
Simple Randomization
Click to Expand
Each user is assigned to control or treatment with equal probability, independent of any characteristics.
When to Use:
- Sample size is large enough to ensure natural balance
- No strong concern about confounding variables
- Need a quick, default assignment strategy
How It Works:
- Assign each user randomly (e.g., 50/50 split)
- No grouping, segmentation, or blocking involved
- Groups are expected to balance out on average
def apply_simple_randomization(df, group_labels=group_labels, group_col=group_col, seed=my_seed):
"""
Randomly assigns each row to one of the specified groups.
Parameters:
- df: pandas DataFrame containing observations
- group_labels: tuple of group names (default = ('control', 'treatment'))
- group_col: name of the column to store group assignments
- seed: random seed for reproducibility
Returns:
- DataFrame with an added group assignment column
"""
np.random.seed(seed)
df[group_col] = np.random.choice(group_labels, size=len(df), replace=True)
return df
Stratified Sampling
Click to Expand
Ensures that key segments (e.g., platform, region) are evenly represented across control and treatment.
When to Use
- User base is naturally skewed (e.g., 70% mobile, 30% desktop)
- Important to control for known confounders like geography or device
- You want balance within subgroups, not just overall
How It Works
- Pick a stratification variable (e.g., platform)
- Split population into strata (groups)
- Randomly assign users within each stratum
def apply_stratified_randomization(df, stratify_col, group_labels=group_labels, group_col=group_col, seed=my_seed):
"""
Performs stratified randomization to assign rows into multiple groups while maintaining balance across strata.
Parameters:
- df: pandas DataFrame to assign groups to
- stratify_col: column to balance across (e.g., platform, region)
- group_labels: list or tuple of group names
- group_col: name of the column to store group assignments
- seed: random seed for reproducibility
Returns:
- DataFrame with a new group assignment column
"""
np.random.seed(seed)
df[group_col] = None
n_groups = len(group_labels)
# Stratify and assign
for stratum_value, stratum_df in df.groupby(stratify_col):
shuffled = stratum_df.sample(frac=1, random_state=seed)
group_assignments = np.tile(group_labels, int(np.ceil(len(shuffled) / n_groups)))[:len(shuffled)]
df.loc[shuffled.index, group_col] = group_assignments
return df
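A quick, illustrative sanity check (not part of the main pipeline, using the platform column from the sample data above) shows what per-stratum balance should look like after stratified assignment:
# Illustrative check: stratify a copy of `users` by platform and confirm each
# stratum splits roughly evenly across the groups.
check_df = apply_stratified_randomization(users.copy(), stratify_col='platform')
print(pd.crosstab(check_df['platform'], check_df[group_col], normalize='index').round(3))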
Block Randomization
Click to Expand
Groups users into fixed-size blocks and randomly assigns groups within each block.
When to Use
- Users arrive in time-based batches (e.g., daily cohorts)
- Sample size is small and needs enforced balance
- You want to minimize temporal or ordering effects
How It Works
- Create blocks based on order or ID (e.g., every 10 users)
- Randomize assignments within each block
- Ensures near-equal split in every batch
def apply_block_randomization(df, observation_id_col, group_col=group_col, block_size=10, group_labels=group_labels, seed=my_seed):
"""
Assigns group labels using block randomization to ensure balance within fixed-size blocks.
Parameters:
- df: DataFrame to assign groups
- observation_id_col: Unique ID to sort and block on (e.g., user_id)
- group_col: Name of column to store assigned group labels
- block_size: Number of observations in each block
- group_labels: Tuple or list of group names (e.g., ('control', 'treatment', 'variant_B'))
- seed: Random seed for reproducibility
Returns:
- DataFrame with a new column [group_col] indicating assigned group
"""
np.random.seed(seed)
df = df.sort_values(observation_id_col).reset_index(drop=True).copy()
n_groups = len(group_labels)
# Create block ID per row
df['_block'] = df.index // block_size
# Assign groups within each block
group_assignments = []
for _, block_df in df.groupby('_block'):
block_n = len(block_df)
reps = int(np.ceil(block_n / n_groups))
candidates = np.tile(group_labels, reps)[:block_n]
np.random.shuffle(candidates)
group_assignments.extend(candidates)
df[group_col] = group_assignments
df = df.drop(columns=['_block'])
return df
Match Pair Randomization
Click to Expand
Participants are paired based on similar characteristics before random group assignment. This reduces variance and improves statistical power by ensuring balance on key covariates.
When to Use
- Small sample size with high risk of confounding
- Outcomes influenced by user traits (e.g., age, income, tenure)
- Need to minimize variance across groups
How It Works
- Identify important covariates (e.g., age, purchase history)
- Sort users by those variables
- Create matched pairs (or small groups)
- Randomly assign one to control, the other to treatment
def apply_matched_pair_randomization(df, sort_col, group_col=group_col, group_labels=group_labels):
"""
Assigns groups using matched-pair randomization based on a sorting variable.
Parameters:
- df: pandas DataFrame to assign groups to
- sort_col: column used to sort users before pairing (e.g., engagement score)
- group_col: name of the column to store group assignments
- group_labels: tuple of group names (e.g., ('control', 'treatment'))
Returns:
- DataFrame with alternating group assignments within sorted pairs
"""
# Sort by matching variable so similar users are adjacent
df = df.sort_values(by=sort_col).reset_index(drop=True)
# Cycle through group labels for each row
df[group_col] = [group_labels[i % len(group_labels)] for i in range(len(df))]
return df
๐ Cluster Randomization
๐ Click to Expand
Entire groups or clusters (e.g., cities, stores, schools) are assigned to control or treatment. Used when it's impractical or risky to randomize individuals within a cluster.
When to Use
- Users naturally exist in groups (e.g., teams, locations, devices)
- There's a risk of interference between users (e.g., word-of-mouth)
- Operational or tech constraints prevent individual-level randomization
How It Works
- Define the cluster unit (e.g., store, city)
- Randomly assign each cluster to control or treatment
- All users within the cluster inherit the group assignment
def apply_cluster_randomization(df, cluster_col, group_col=group_col, group_labels=group_labels, seed=my_seed):
"""
Assigns groups using cluster-level randomization — all observations in a cluster
receive the same group assignment.
Parameters:
- df: pandas DataFrame to assign groups to
- cluster_col: column representing the cluster unit (e.g., city, store)
- group_col: name of the column where group labels will be stored
- group_labels: tuple of group names to randomly assign (e.g., ('control', 'treatment'))
- seed: random seed for reproducibility
Returns:
- DataFrame with assigned groups at the cluster level
"""
np.random.seed(seed)
# Unique clusters (e.g., unique city/store values)
unique_clusters = df[cluster_col].unique()
# Randomly assign each cluster to a group
cluster_assignments = dict(
zip(unique_clusters, np.random.choice(group_labels, size=len(unique_clusters)))
)
# Map group assignments to full DataFrame
df[group_col] = df[cluster_col].map(cluster_assignments)
return df
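An illustrative check (again on a copy of the sample data, clustering on the city column): every observation within a cluster should carry the same assignment.
# Illustrative check: cluster-randomize a copy on 'city' and verify that each
# cluster maps to exactly one group (nunique should be 1 per city).
clustered = apply_cluster_randomization(users.copy(), cluster_col='city')
print(clustered.groupby('city')[group_col].agg(['nunique', 'first']))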
CUPED
Click to Expand
CUPED (Controlled-experiment Using Pre-Experiment Data): a statistical adjustment that uses pre-experiment behavior to reduce variance and improve power. It helps detect smaller effects without increasing sample size.
When to Use
- You have reliable pre-experiment metrics (e.g., past spend, engagement)
- You want to reduce variance and improve test sensitivity
- You're dealing with small lifts or costly sample sizes
How It Works
- Identify a pre-period metric correlated with your outcome
- Use regression to compute an adjustment (theta)
- Subtract the correlated component from your outcome metric
- Analyze the adjusted metric instead of the raw one
def apply_cuped(
df,
pre_metric,
outcome_metric_col, # observed outcome column (e.g., engagement_score)
outcome_col=None,
group_col=group_col,
group_labels=group_labels,
seed=my_seed
):
"""
Applies CUPED (Controlled Pre-Experiment Data) adjustment to reduce variance
in the outcome metric using a pre-experiment covariate.
CUPED is a post-randomization technique that reduces variance by adjusting the
observed outcome using a baseline (pre-metric) variable that is correlated
with the outcome.
Parameters:
----------
df : pandas.DataFrame
Input DataFrame containing experiment data.
pre_metric : str
Column name of the pre-experiment covariate (e.g., 'past_purchase_count').
This is the variable used to compute the adjustment factor (theta).
outcome_metric_col : str
Column name of the original observed outcome (e.g., 'engagement_score')
that you are comparing across groups.
outcome_col : str, default=None
Name of the new column where the adjusted outcome will be stored.
group_col : str
Column indicating the experiment group assignment (e.g., 'control' vs 'treatment').
group_labels : tuple
Tuple containing the names of the experiment groups.
seed : int
Random seed for reproducibility (used only if randomness is introduced later).
Returns:
-------
df : pandas.DataFrame
DataFrame with an additional column [outcome_col] containing the CUPED-adjusted outcome.
"""
np.random.seed(seed)
# Step 1: Use actual observed experiment outcome
y = df[outcome_metric_col].values
# Step 2: Regress outcome on pre-metric to estimate correction factor (theta)
X = sm.add_constant(df[[pre_metric]])
theta = sm.OLS(y, X).fit().params[pre_metric]
# Step 3: Apply CUPED adjustment and save in new column
if outcome_col is None:
outcome_col = f'{outcome_metric_col}_cuped_adjusted'
df[outcome_col] = y - theta * df[pre_metric]
return df
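As an illustrative check of what CUPED buys you, compare the variance of the raw and adjusted outcome on a copy of the sample data. Note that in this synthetic dataset past_purchase_count is generated independently of engagement_score, so little to no variance reduction is expected here; with a genuinely correlated pre-metric the adjusted variance would be noticeably smaller.
# Illustrative check: outcome variance before vs. after CUPED adjustment.
# (With this synthetic data the pre-metric is uncorrelated with the outcome,
# so expect roughly equal variances; real pre-experiment data should shrink it.)
cuped_df = apply_cuped(users.copy(), pre_metric='past_purchase_count',
                       outcome_metric_col='engagement_score')
print(f"Raw outcome variance   : {cuped_df['engagement_score'].var():.2f}")
print(f"CUPED-adjusted variance: {cuped_df['engagement_score_cuped_adjusted'].var():.2f}")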
# Apply randomization method
if randomization_method == "simple":
users = apply_simple_randomization(users, group_col=group_col, seed=my_seed)
elif randomization_method == "stratified":
users = apply_stratified_randomization(users, stratify_col='platform', group_col=group_col, seed=my_seed)
elif randomization_method == "block":
users = apply_block_randomization(users, observation_id_col='user_id', group_col=group_col, block_size=10, seed=my_seed)
elif randomization_method == "matched_pair":
users = apply_matched_pair_randomization(users, sort_col=outcome_metric_col, group_col=group_col)  # no seed: assignment is deterministic given the sort order
elif randomization_method == "cluster":
users = apply_cluster_randomization(users, cluster_col='city', group_col=group_col, seed=my_seed)
elif randomization_method == "cuped":
users = apply_cuped(users, pre_metric='past_purchase_count', outcome_metric_col=outcome_metric_col, group_col=group_col, group_labels=group_labels, seed=my_seed)
# Update global outcome to CUPED-adjusted version
outcome_metric_col = f"{outcome_metric_col}_cuped_adjusted"
else:
raise ValueError(f"Unsupported randomization method: {randomization_method}")
users
 | user_id | platform | device_type | user_tier | region | plan_type | city | engagement_score | converted | past_purchase_count | bounce_rate | group |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | iOS | mobile | new | North | premium | austin | 53.437537 | 0 | 50.653869 | 0.660153 | control |
1 | 2 | iOS | mobile | returning | North | basic | ny | 48.924673 | 1 | 26.451597 | 0.126471 | control |
2 | 3 | iOS | desktop | returning | South | premium | austin | 80.179294 | 0 | 43.112520 | 0.552955 | control |
3 | 4 | iOS | mobile | new | North | premium | austin | 40.441478 | 0 | 48.339368 | 0.665883 | treatment |
4 | 5 | Android | mobile | returning | West | basic | chicago | 54.171571 | 0 | 56.578205 | 0.503212 | treatment |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
995 | 996 | iOS | desktop | returning | North | basic | sf | 35.814776 | 0 | 55.156336 | 0.722611 | treatment |
996 | 997 | iOS | mobile | new | South | pro | ny | 35.693639 | 0 | 48.434500 | 0.522644 | control |
997 | 998 | Android | desktop | returning | South | premium | chicago | 33.913119 | 0 | 30.591967 | 0.644316 | control |
998 | 999 | iOS | mobile | new | West | premium | sf | 40.789684 | 0 | 44.760253 | 0.540618 | treatment |
999 | 1000 | iOS | mobile | returning | South | basic | austin | 71.341543 | 0 | 30.985932 | 0.542153 | control |
1000 rows × 12 columns
When Randomization Assumptions Break (Click to Expand)
Most A/B tests assume the Stable Unit Treatment Value Assumption (SUTVA) — meaning:
- A user's outcome depends only on their own treatment assignment.
- One unit's treatment does not influence another unit's outcome.
Why It Matters
If users in different groups interact:
- Control group behavior may be influenced by treatment group exposure.
- This biases your estimates and dilutes treatment effect.
- Standard tests may incorrectly fail to reject the null hypothesis due to spillover.
This assumption breaks down in experiments involving social behavior, multi-user platforms, or ecosystem effects.
Common Violation Scenarios
- Marketplace platforms (e.g., sellers and buyers interact)
- Social features (e.g., follows, likes, comments, feeds)
- Referrals / network effects (e.g., invites, rewards)
- Chat and collaboration tools (e.g., Slack, Teams)
Solutions (If You Suspect Interference)
Strategy | Description |
---|---|
Cluster Randomization | Randomize at group level (e.g., friend group, region, org ID) |
Isolation Experiments | Only roll out to fully disconnected segments (e.g., one region only) |
Network-Based Metrics | Include network centrality / exposure as covariates |
Post-Experiment Checks | Monitor if control group was exposed indirectly (e.g., referrals, shared UIs) |
Simulation-Based Designs | Use agent-based or graph simulations to estimate contamination risk |
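As a lightweight post-experiment check in the spirit of the table above, the sketch below (an assumption made purely for illustration: treating the city column as a proxy for a cluster within which users can interact) summarizes how exposed each cluster is to the treatment. Under individual-level randomization every cluster is mixed, so a high treated share flags clusters where control users are most likely to have been indirectly exposed.
# Sketch of a post-experiment interference check. Assumption for illustration:
# 'city' proxies a cluster within which users can influence each other.
def summarize_cluster_exposure(df, cluster_col, group_col='group', treatment_label='treatment'):
    """Share of treated users per cluster — a rough proxy for control-group exposure risk."""
    treated_share = (
        df.assign(_treated=(df[group_col] == treatment_label))
          .groupby(cluster_col)['_treated']
          .mean()
          .rename('treated_share')
    )
    return treated_share.sort_values(ascending=False)

print(summarize_cluster_exposure(users, cluster_col='city'))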
Normality
Click to Expand
Checks whether your outcome metric follows a normal distribution, which is a key assumption for parametric tests like t-test or ANOVA.
- Use Shapiro-Wilk test or visual tools (histograms, Q-Q plots).
- Helps determine whether to use parametric or non-parametric tests.
- If data is non-normal, switch to Mann-Whitney U or Wilcoxon.
def test_normality(df, outcome_metric_col, group_col, group_labels):
results = {}
for group in group_labels:
group_data = df[df[group_col] == group][outcome_metric_col]
stat, p = shapiro(group_data)
results[group] = {'statistic': stat, 'p_value': p, 'normal': p > 0.05}
return results
normality_results = test_normality(users, outcome_metric_col=outcome_metric_col, group_col='group', group_labels=group_labels)
print("Normality test (Shapiro-Wilk) results:")
for group, result in normality_results.items():
print(f"{group}: p = {result['p_value']:.4f} โ {'Normal' if result['normal'] else 'Non-normal'}")
Normality test (Shapiro-Wilk) results: control: p = 0.2230 → Normal treatment: p = 0.6053 → Normal
# Assume both groups must be normal to proceed with parametric tests
test_config['normality'] = all(result['normal'] for result in normality_results.values())
test_config
{'outcome_metric_col': 'engagement_score', 'outcome_metric_datatype': 'continuous', 'group_labels': ('control', 'treatment'), 'group_count': 2, 'variant': 'independent', 'observation_id_col': 'user_id', 'pre_experiment_metric': 'past_purchase_count', 'normality': True, 'equal_variance': None, 'family': None}
Variance Homogeneity Check
Click to Expand
Tests whether the variances between groups are equal, which affects the validity of t-tests and ANOVA.
- Performed using Levene's test or Bartlett's test.
- If variances are unequal, use Welch's t-test instead.
- Unequal variances do not invalidate analysis but change the test used.
def test_equal_variance(df, outcome_metric_col, group_col, group_labels):
group_data = [df[df[group_col] == label][outcome_metric_col] for label in group_labels]
stat, p = levene(*group_data)
return {'statistic': stat, 'p_value': p, 'equal_variance': p > 0.05}
variance_result = test_equal_variance(users, outcome_metric_col=outcome_metric_col, group_col='group', group_labels=group_labels)
variance_result
{'statistic': 0.08918799756611763, 'p_value': 0.7652741675085144, 'equal_variance': True}
print(f"Leveneโs test: p = {variance_result['p_value']:.4f} โ {'Equal variances' if variance_result['equal_variance'] else 'Unequal variances'}")
test_config['equal_variance'] = variance_result['equal_variance']
test_config
Levene's test: p = 0.7653 → Equal variances
{'outcome_metric_col': 'engagement_score', 'outcome_metric_datatype': 'continuous', 'group_labels': ('control', 'treatment'), 'group_count': 2, 'variant': 'independent', 'observation_id_col': 'user_id', 'pre_experiment_metric': 'past_purchase_count', 'normality': True, 'equal_variance': True, 'family': None}
Test Family
Click to Expand
Selects the appropriate statistical test based on:
- Outcome data type (binary, continuous, categorical)
- Distributional assumptions (normality, variance)
- Number of groups and experiment structure (independent vs paired)
This step automatically maps to the correct test (e.g., t-test, z-test, chi-square, ANOVA).
Experiment Type → Test Family Mapping
Outcome Metric | Normality | Group Count | Selected Test Family |
---|---|---|---|
binary | — | 2 | z_test |
binary | — | 3+ | chi_square |
continuous | Yes | 2 | t_test |
continuous | Yes | 3+ | anova |
continuous | No | 2 | non_parametric (Mann-Whitney U) |
continuous | No | 3+ | non_parametric (Kruskal-Wallis) |
categorical | — | 2 | chi_square |
categorical | — | 3+ | chi_square |
def determine_test_family(test_config):
"""
Decide which family of statistical test to use based on:
- outcome data type: binary / continuous / categorical
- group count: 2 or 3+
- variant: independent or paired (optional for family level)
- normality assumption: passed or not
"""
data_type = test_config['outcome_metric_datatype']
group_count = test_config['group_count']
variant = test_config['variant']
normality = test_config['normality']
# Binary outcome → Z-test for 2 groups, Chi-square for 3+ groups
if data_type == 'binary':
if group_count == 2:
return 'z_test' # Compare proportions across 2 groups
else:
return 'chi_square' # 2x3+ contingency test
# Continuous outcome → check for normality and group count
elif data_type == 'continuous':
if not normality:
return 'non_parametric' # Mann-Whitney U or Kruskal-Wallis
if group_count == 2:
return 't_test' # Independent or paired t-test
else:
return 'anova' # One-way ANOVA
# Categorical outcome → Chi-square always
elif data_type == 'categorical':
return 'chi_square'
else:
raise ValueError(f"Unsupported outcome_metric_datatype: {data_type}")
test_config['family'] = determine_test_family(test_config)
test_config
print(f"โ
Selected test family: {test_config['family']}")
{'outcome_metric_col': 'engagement_score', 'outcome_metric_datatype': 'continuous', 'group_labels': ('control', 'treatment'), 'group_count': 2, 'variant': 'independent', 'observation_id_col': 'user_id', 'pre_experiment_metric': 'past_purchase_count', 'normality': True, 'equal_variance': True, 'family': 't_test'}
Selected test family: t_test
AA Testing
Click to Expand
A/A testing is a preliminary experiment where both groups (e.g., "control" and "treatment") receive the exact same experience. It's used to validate the experimental setup before running an actual A/B test.
What Are We Checking?
- Are users being assigned fairly and randomly?
- Are key outcome metrics statistically similar across groups?
- Can we trust the experimental framework?
Why A/A Testing Matters
- Validates Randomization → Confirms the groups are balanced at baseline (no bias or leakage)
- Detects SRM (Sample Ratio Mismatch) → Ensures the actual split (e.g., 50/50) matches what was intended
- Estimates Variability → Helps calibrate variance for accurate power calculations later
- Trust Check → Catches bugs in assignment logic, event tracking, or instrumentation
A/A Test Process
- Randomly assign users into two equal groups → Just like you would for an A/B test (e.g., control vs treatment)
- Measure key outcome → This depends on your experiment type:
  - binary → conversion rate
  - continuous → avg. revenue, time spent
  - categorical → feature adoption, plan selected
- Run statistical test:
  - binary → Z-test or Chi-square
  - continuous → t-test
  - categorical → Chi-square test
- Check SRM → Use a chi-square goodness-of-fit test to detect assignment imbalances
Possible Outcomes
Result | Interpretation |
---|---|
No significant difference | Randomization looks good. Test setup is sound. |
Statistically significant difference | Something's off — check assignment logic, instrumentation, or sample leakage |
Run A/A tests whenever you launch a new experiment framework, roll out a new randomizer, or need to build stakeholder trust.
Outcome Similarity Test
Click to Expand
Compares the outcome metric across groups to ensure no significant differences exist when there shouldn't be any — usually used during A/A testing or pre-experiment validation.
- Helps detect setup issues like biased group assignment or data leakage.
- Null Hypothesis: No difference in outcomes between control and treatment.
- Uses the same statistical test as the main A/B test (e.g., t-test, z-test, chi-square).
def run_outcome_similarity_test(
df,
group_col,
metric_col,
test_family,
variant=None,
group_labels=('control', 'treatment'),
alpha=0.05,
verbose=True
):
"""
Runs a similarity test between two groups based on test_family and variant.
Parameters:
- df: pandas DataFrame
- group_col: column with group assignment
- metric_col: outcome metric
- test_family: one of ['z_test', 't_test', 'chi_square', 'anova', 'non_parametric']
- variant: 'independent' or 'paired' (required for t-test)
- group_labels: tuple of (control, treatment)
- alpha: significance threshold
- verbose: print detailed interpretation
"""
if verbose:
print("๐ Outcome Similarity Check\n")
group1 = df[df[group_col] == group_labels[0]][metric_col]
group2 = df[df[group_col] == group_labels[1]][metric_col]
# --- Run appropriate test ---
if test_family == 'z_test':
conv1, conv2 = group1.mean(), group2.mean()
n1, n2 = len(group1), len(group2)
pooled_prob = (group1.sum() + group2.sum()) / (n1 + n2)
se = np.sqrt(pooled_prob * (1 - pooled_prob) * (1/n1 + 1/n2))
z_score = (conv2 - conv1) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_score)))
test_name = "z-test for proportions"
elif test_family == 't_test':
if variant == 'independent':
t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
test_name = "independent t-test"
elif variant == 'paired':
if len(group1) != len(group2):
print("โ Paired t-test requires equal-length samples.")
return None
t_stat, p_value = stats.ttest_rel(group1, group2)
test_name = "paired t-test"
else:
raise ValueError("Missing or invalid variant for t-test.")
elif test_family == 'chi_square':
contingency = pd.crosstab(df[group_col], df[metric_col])
chi2_stat, p_value, _, _ = stats.chi2_contingency(contingency)
test_name = "chi-square test"
elif test_family == 'anova':
f_stat, p_value = stats.f_oneway(group1, group2)
test_name = "one-way ANOVA"
elif test_family == 'non_parametric':
u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
test_name = "Mann-Whitney U test"
else:
raise ValueError(f"Unsupported test family: {test_family}")
# --- Detailed Interpretation ---
if verbose:
print("\n๐ง Interpretation:")
if test_family == 'z_test':
print(f"Used a {test_name} to compare conversion rates between groups.")
print("Null Hypothesis: Conversion rates are equal across groups.")
elif test_family == 't_test':
if variant == 'independent':
print(f"Used an {test_name} to compare means of '{metric_col}' across independent groups.")
print("Null Hypothesis: Group means are equal.")
elif variant == 'paired':
print(f"Used a {test_name} to compare within-user differences in '{metric_col}'.")
print("Null Hypothesis: Mean difference between pairs is zero.")
elif test_family == 'chi_square':
print(f"Used a {test_name} to test whether '{metric_col}' distribution depends on group.")
print("Null Hypothesis: No association between group and category.")
elif test_family == 'anova':
print(f"Used a {test_name} to compare group means of '{metric_col}' across 3+ groups.")
print("Null Hypothesis: All group means are equal.")
elif test_family == 'non_parametric':
print(f"Used a {test_name} to compare medians of '{metric_col}' across groups (non-parametric).")
print("Null Hypothesis: Distributions are identical across groups.")
print(f"\nWe use ฮฑ = {alpha:.2f}")
if p_value < alpha:
print(f"โก๏ธ p = {p_value:.4f} < ฮฑ โ Reject null hypothesis. Statistically significant difference.")
else:
print(f"โก๏ธ p = {p_value:.4f} โฅ ฮฑ โ Fail to reject null. No statistically significant difference.")
return p_value
Sample Ratio Mismatch
Click to Expand
Is group assignment balanced?
- SRM (Sample Ratio Mismatch) checks whether the observed group sizes match the expected ratio.
- In a perfect world, random assignment to 'A1' and 'A2' should give ~50/50 split.
- SRM helps catch bugs in randomization, data logging, or user eligibility filtering.
Real-World Experiment Split Ratios
Scenario | Split | Why |
---|---|---|
Default A/B | 50 / 50 | Maximizes power and ensures fairness |
Risky feature | 10 / 90 or 20 / 80 | Limits user exposure to minimize risk |
Ramp-up | Step-wise (1-5-25-50…) | Gradual rollout to catch issues early |
A/B/C Test | 33 / 33 / 33 or weighted | Compare multiple variants fairly or with bias |
High control confidence needed | 70 / 30 or 60 / 40 | More stability in baseline comparisons |
def run_aa_testing_generalized(
df,
group_col,
metric_col,
group_labels,
test_family,
variant=None,
alpha=0.05,
visualize=True
):
"""
Runs A/A test: SRM check + similarity test + optional visualization.
All logic routed by test_family + variant (no experiment_type).
"""
print(f"\n๐ A/A Test Summary for metric: '{metric_col}' [{test_family}, {variant}]\n")
check_sample_ratio_mismatch(df, group_col, group_labels, alpha=alpha, expected_ratios=[0.5, 0.5])
group1 = df[df[group_col] == group_labels[0]][metric_col]
group2 = df[df[group_col] == group_labels[1]][metric_col]
p_value = run_outcome_similarity_test(
df=df,
group_col=group_col,
metric_col=metric_col,
test_family=test_family,
variant=variant,
group_labels=group_labels,
alpha=alpha
)
if visualize and p_value is not None:
visualize_aa_distribution(
df, group1, group2,
group_col=group_col,
metric_col=metric_col,
test_family=test_family,
variant=variant,
group_labels=group_labels
)
def check_sample_ratio_mismatch(df, group_col, group_labels, expected_ratios=None, alpha=0.05):
"""
Checks for Sample Ratio Mismatch (SRM) using a Chi-Square test.
Parameters:
- df: DataFrame with group assignments
- group_col: Column containing group assignment
- group_labels: List or tuple of group names (e.g., ['control', 'treatment'])
- expected_ratios: Expected proportions per group (e.g., [0.5, 0.5])
- alpha: Significance level
Prints observed vs expected distribution and test results.
"""
print("๐ Sample Ratio Mismatch (SRM) Check")
observed_counts = df[group_col].value_counts().reindex(group_labels, fill_value=0)
if expected_ratios is None:
expected_ratios = [1 / len(group_labels)] * len(group_labels)
else:
total = sum(expected_ratios)
expected_ratios = [r / total for r in expected_ratios] # normalize to sum to 1
expected_counts = [len(df) * ratio for ratio in expected_ratios]
# Print group-wise summary
for grp, expected in zip(group_labels, expected_counts):
observed = observed_counts.get(grp, 0)
pct = observed / len(df) * 100
print(f"Group {grp}: {observed} users ({pct:.2f}%) โ Expected: {expected:.1f}")
# Run Chi-square test
chi2_stat, chi2_p = stats.chisquare(f_obs=observed_counts, f_exp=expected_counts)
print(f"\nChi2 Statistic: {chi2_stat:.4f}")
print(f"P-value : {chi2_p:.4f}")
if chi2_p < alpha:
print("โ ๏ธ SRM Detected โ group assignment might be biased.\n")
else:
print("โ
No SRM โ group sizes look balanced.\n")
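The same helper also covers intentionally unequal designs via expected_ratios. A hedged illustration with simulated counts for a hypothetical 20/80 ramp-up (not drawn from this notebook's experiment):
# Hypothetical 20/80 ramp-up example — assignments below are simulated solely
# to show how to pass custom expected ratios to the SRM check.
np.random.seed(my_seed)
ramp_df = pd.DataFrame({
    'group': np.random.choice(['control', 'treatment'], size=5000, p=[0.2, 0.8])
})
check_sample_ratio_mismatch(
    ramp_df,
    group_col='group',
    group_labels=['control', 'treatment'],
    expected_ratios=[0.2, 0.8]
)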
AA Test Visualization
def visualize_aa_distribution(df, group1, group2, group_col, metric_col, test_family, variant=None, group_labels=('control', 'treatment')):
if test_family in ['t_test', 'anova', 'non_parametric']:
plt.hist(group1, bins=30, alpha=0.5, label=group_labels[0])
plt.hist(group2, bins=30, alpha=0.5, label=group_labels[1])
plt.title(f"A/A Test: {metric_col} Distribution")
plt.xlabel(metric_col)
plt.ylabel("Frequency")
plt.legend()
plt.show()
elif test_family == 'z_test':
rates = [group1.mean(), group2.mean()]
plt.bar(group_labels, rates)
for i, rate in enumerate(rates):
plt.text(i, rate + 0.01, f"{rate:.2%}", ha='center')
plt.title("A/A Test: Conversion Rate by Group")
plt.ylabel("Conversion Rate")
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
elif test_family == 'chi_square':
contingency = pd.crosstab(df[group_col], df[metric_col], normalize='index')
contingency.plot(kind='bar', stacked=True)
plt.title(f"A/A Test: {metric_col} Distribution by Group")
plt.ylabel("Proportion")
plt.xlabel(group_col)
plt.legend(title=metric_col)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
run_aa_testing_generalized(
df=users,
group_col='group',
metric_col=test_config['outcome_metric_col'],
group_labels=test_config['group_labels'],
test_family=test_config['family'],
variant=test_config.get('variant'),
alpha=0.05
)
A/A Test Summary for metric: 'engagement_score' [t_test, independent] Sample Ratio Mismatch (SRM) Check Group control: 510 users (51.00%) → Expected: 500.0 Group treatment: 490 users (49.00%) → Expected: 500.0 Chi2 Statistic: 0.4000 P-value : 0.5271 No SRM — group sizes look balanced. Outcome Similarity Check Interpretation: Used an independent t-test to compare means of 'engagement_score' across independent groups. Null Hypothesis: Group means are equal. We use α = 0.05 p = 0.4657 ≥ α → Fail to reject null. No statistically significant difference.
Type I Error Simulation
Click to Expand
Repeated A/A Tests
While a single A/A test helps detect obvious flaws in group assignment (like SRM or data leakage), it's still a one-off check. To gain confidence in your randomization method, we simulate multiple A/A tests using the same logic:
- Each run reassigns users randomly into control and treatment (with no actual change)
- We then run the statistical test between groups for each simulation
- We track how often the test reports a false positive (p < α), which estimates the Type I error rate
In theory, if your setup is unbiased and α = 0.05, you'd expect about 5% of simulations to return a significant result — this validates your A/B framework isn't "trigger-happy."
What this tells you:
- Too many significant p-values → your framework is too noisy (bad randomization, poor test choice)
- Near 5% = healthy noise level, expected by design
This step is optional but highly recommended when you're:
- Trying out a new randomization strategy
- Validating an internal experimentation framework
- Stress-testing your end-to-end pipeline
def simulate_aa_type1_error_rate(
df,
metric_col,
group_labels,
test_family,
variant=None,
runs=100,
alpha=0.05,
seed=42,
verbose=False
):
"""
Simulates repeated A/A tests to estimate empirical Type I error rate.
Returns:
- p_values: list of p-values from each simulation
"""
np.random.seed(seed)
p_values = []
for i in range(runs):
shuffled_df = df.copy()
shuffled_df['group'] = np.random.choice(group_labels, size=len(df), replace=True)
p = run_outcome_similarity_test(
df=shuffled_df,
group_col='group',
metric_col=metric_col,
test_family=test_family,
variant=variant,
group_labels=group_labels,
alpha=alpha,
verbose=False
)
if p is not None:
p_values.append(p)
if verbose:
print(f"Run {i+1}: p = {p:.4f}")
significant = sum(p < alpha for p in p_values)
error_rate = significant / runs
print(f"\n๐ Type I Error Rate Estimate: {significant}/{runs} = {error_rate:.2%}")
# Interpretation Block
print(f"""
๐ง Summary Interpretation:
We simulated {runs} A/A experiments using random group assignment (no actual treatment).
Test: {test_family.upper()}{' (' + variant + ')' if variant else ''}
Metric: {metric_col}
Alpha: {alpha}
False positives (p < ฮฑ): {significant} / {runs}
โ Estimated Type I Error Rate: {error_rate:.2%}
This is within expected range for ฮฑ = {alpha}.
โ โ
Test framework is behaving correctly โ no bias or sensitivity inflation.
""")
plot_p_value_distribution(p_values, alpha=alpha)
return p_values
def plot_p_value_distribution(p_values, alpha=0.05):
plt.figure(figsize=(8, 4))
plt.hist(p_values, bins=20, edgecolor='black', alpha=0.7)
plt.axvline(x=alpha, color='red', linestyle='--', label=f"α = {alpha}")
plt.title("P-value Distribution Across A/A Tests")
plt.xlabel("P-value")
plt.ylabel("Frequency")
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()
_ = simulate_aa_type1_error_rate(
df=users,
metric_col=test_config['outcome_metric_col'],
group_labels=test_config['group_labels'],
test_family=test_config['family'],
variant=test_config.get('variant'),
runs=100,
alpha=0.05
)
Type I Error Rate Estimate: 4/100 = 4.00% Summary Interpretation: We simulated 100 A/A experiments using random group assignment (no actual treatment). Test: T_TEST (independent) Metric: engagement_score Alpha: 0.05 False positives (p < α): 4 / 100 → Estimated Type I Error Rate: 4.00% This is within expected range for α = 0.05. → Test framework is behaving correctly — no bias or sensitivity inflation.
Power Analysis
Click to Expand
Power analysis helps determine the minimum sample size required to detect a true effect with statistical confidence.
Why It Matters:
- Avoids underpowered tests (risk of missing real effects)
- Balances tradeoffs between Sample size, Minimum Detectable Effect (MDE), Significance level (α), Statistical power (1 - β)
Key Inputs:
Parameter | Meaning |
---|---|
alpha (α) | Significance level (probability of false positive), e.g. 0.05 |
Power (1 - β) | Probability of detecting a true effect, e.g. 0.80 or 0.90 |
Baseline | Current outcome (e.g., 10% conversion, $50 revenue) |
MDE | Minimum detectable effect — the smallest meaningful lift (e.g., +2% or +$5) |
Std Dev | Standard deviation of the metric (for continuous outcomes) |
Effect Size | Optional: Cohen's d (for t-tests) or f (for ANOVA) |
Groups | Number of groups (relevant for ANOVA) |
This notebook automatically selects the correct formula based on the test_family and variant values stored in test_config.
Setup Inputs + Config Values
Click to Expand
These are the core experiment design parameters required for power analysis and statistical testing.
- alpha: Significance level — the tolerance for false positives (commonly set at 0.05).
- power: Probability of detecting a true effect — typically 0.80 or 0.90.
- group_labels: The names of the experimental groups (e.g., 'control', 'treatment').
- metric_col: Outcome metric column you're analyzing.
- test_family: Chosen statistical test (e.g., 't_test', 'z_test', 'chi_square') based on assumptions.
- variant: Experimental design structure — 'independent' or 'paired'.
These inputs drive sample size estimation, test choice, and downstream analysis logic.
# Define Core Inputs
# Use values from your config or plug in manually
alpha = 0.05 # False positive tolerance (Type I error)
power = 0.80 # Statistical power (1 - Type II error)
group_labels = test_config['group_labels']
metric_col = test_config['outcome_metric_col']
test_family = test_config['family']
variant = test_config.get('variant')
Baseline Estimation from Data
Click to Expand
Before we calculate required sample size, we need a baseline value from historical or current data.
- For binary metrics (e.g., conversion), the baseline is the current conversion rate.
- For continuous metrics (e.g., revenue, engagement), we estimate the mean and standard deviation from the control group.
- These values help translate the Minimum Detectable Effect (MDE) into a usable effect size.
Be cautious with outliers or extreme skew when computing baselines — they directly influence sample size estimates.
# Data-Driven Baseline Metric
if test_family == 'z_test':
# For binary outcome (e.g., conversion): baseline = conversion rate in data
baseline_rate = users[metric_col].mean()
print(f"๐ Baseline conversion rate: {baseline_rate:.2%}")
elif test_family in ['t_test', 'anova', 'non_parametric']:
# For continuous metrics (e.g., revenue, engagement)
control_data = users[users['group'] == group_labels[0]][metric_col]
baseline_mean = control_data.mean()
std_dev = control_data.std()
print(f"๐ Control group mean: {baseline_mean:.2f}")
print(f"๐ Control group std dev: {std_dev:.2f}")
else:
baseline_rate = None
std_dev = None
Control group mean: 51.03 Control group std dev: 15.61
Minimum Detectable Effect
Click to Expand
Minimum Detectable Effect (MDE) is the smallest business-relevant difference you want your test to catch.
- It reflects what matters — not what the data happens to show
- Drives required sample size:
  - Smaller MDE → larger sample
  - Larger MDE → smaller sample
Choose an MDE based on:
- What level of uplift would justify launching the feature?
- What's a meaningful change in your metric — not just statistical noise?
# Minimum Detectable Effect (MDE)
# This is NOT data-driven — it reflects the minimum improvement you care about detecting.
# It should be small enough to catch valuable changes, but large enough to avoid inflating sample size.
# Examples by Metric Type:
# - Binary : 0.02 → detect a 2% lift in conversion rate (e.g., from 10% to 12%)
# - Categorical : 0.05 → detect a 5% shift in plan preference (e.g., more users choosing 'premium' over 'basic')
# - Continuous : 3.0 → detect a 3-point gain in engagement score (e.g., from 50 to 53 avg. score)
mde = 5 # Change this based on business relevance
Required Sample Size
def calculate_power_sample_size(
test_family,
variant=None,
alpha=0.05,
power=0.80,
baseline_rate=None, # required for z-test
mde=None,
std_dev=None,
effect_size=None,
num_groups=2 # placeholder for future ANOVA support
):
"""
Calculate required sample size per group based on test type and assumptions.
Supported families:
- 'z_test' : Binary outcomes (proportions)
- 't_test' : Continuous outcomes (independent or paired)
- 'non_parametric' : Mann-Whitney (approximated as t-test)
- 'anova' : Not implemented (default to t-test)
- 'chi_square' : Categorical outcomes (not used in this version)
"""
# -- Z-Test for Binary Proportions --
if test_family == 'z_test':
if baseline_rate is None or mde is None:
raise ValueError("baseline_rate and mde are required for z-test (binary outcome).")
z_alpha = stats.norm.ppf(1 - alpha / 2)
z_beta = stats.norm.ppf(power)
p1 = baseline_rate
p2 = p1 + mde
pooled_std = np.sqrt(2 * p1 * (1 - p1))
n = ((z_alpha + z_beta) ** 2 * pooled_std ** 2) / (mde ** 2)
return int(np.ceil(n))
# -- T-Test for Continuous (Independent or Paired) --
elif test_family in ['t_test', 'non_parametric', 'anova']:
if effect_size is None:
if std_dev is None or mde is None:
raise ValueError("For continuous outcomes, provide either effect_size or both std_dev and mde.")
effect_size = mde / std_dev # Cohen's d
if variant == 'independent':
analysis = TTestIndPower()
elif variant == 'paired':
analysis = TTestPower()
else:
raise ValueError("variant must be 'independent' or 'paired' for t-test.")
n = analysis.solve_power(effect_size=effect_size, power=power, alpha=alpha)
return int(np.ceil(n))
else:
raise ValueError(f"Unsupported test family: {test_family}")
required_sample_size = calculate_power_sample_size(
test_family=test_family,
variant=variant,
alpha=alpha,
power=power,
baseline_rate=baseline_rate if test_family == 'z_test' else None,
mde=mde,
std_dev=std_dev if test_family in ['t_test', 'anova', 'non_parametric'] else None,
effect_size=None, # Let it compute internally via mde/std
num_groups=2
)
test_config['required_sample_size'] = required_sample_size
print(f"โ
Required sample size per group: {required_sample_size}")
print(f"๐ฅ Total sample size: {required_sample_size * 2}")
Required sample size per group: 154 Total sample size: 308
Power Analysis Summary
def print_power_summary(
test_family,
variant,
alpha,
power,
baseline_rate=None,
mde=None,
std_dev=None,
required_sample_size=None
):
print("๐ Power Analysis Summary")
print(f"- Test: {test_family.upper()}{' (' + variant + ')' if variant else ''}")
print(f"- Significance level (ฮฑ): {alpha}")
print(f"- Statistical power (1 - ฮฒ): {power}")
if test_family == 'z_test':
print(f"- Baseline conversion rate: {baseline_rate:.2%}")
print(f"- MDE: {mde:.2%}")
print(f"\nโ
To detect a lift from {baseline_rate:.2%} to {(baseline_rate + mde):.2%},")
print(f"you need {required_sample_size} users per group โ total {required_sample_size * 2} users.")
elif test_family == 't_test':
print(f"- Std Dev (control group): {std_dev:.2f}")
print(f"- MDE (mean difference): {mde}")
print(f"- Cohen's d: {mde / std_dev:.2f}")
print(f"\nโ
To detect a {mde}-unit lift in mean outcome,")
print(f"you need {required_sample_size} users per group โ total {required_sample_size * 2} users.")
else:
print("โ ๏ธ Unsupported family for summary.")
print_power_summary(
test_family=test_family,
variant=variant,
alpha=alpha,
power=power,
baseline_rate=baseline_rate if test_family == 'z_test' else None,
mde=mde,
std_dev=std_dev if test_family == 't_test' else None,
required_sample_size=required_sample_size
)
Power Analysis Summary - Test: T_TEST (independent) - Significance level (α): 0.05 - Statistical power (1 - β): 0.8 - Std Dev (control group): 15.61 - MDE (mean difference): 5 - Cohen's d: 0.32 To detect a 5-unit lift in mean outcome, you need 154 users per group → total 308 users.
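To visualize the tradeoff the summary above describes (smaller detectable effects require more users), statsmodels can plot power curves directly. A sketch with illustrative effect sizes bracketing the Cohen's d of roughly 0.32 computed above:
# Power curves for an independent two-sample t-test: statistical power vs.
# sample size per group, for a few illustrative effect sizes around d ≈ 0.32.
TTestIndPower().plot_power(
    dep_var='nobs',
    nobs=np.arange(20, 501, 10),
    effect_size=np.array([0.2, 0.32, 0.5]),
    alpha=0.05
)
plt.axhline(0.80, color='red', linestyle='--', label='target power = 0.80')
plt.legend()
plt.show()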
For test selection (e.g., Z-test, t-test), refer to the Hypothesis Testing Notebook
Click to Expand
A/B Testing - Outcome Comparison
This section compares the outcome metric between control and treatment groups using the appropriate statistical test based on the experiment type.
Metric Tracked:
- Primary metric: Depends on use case:
- Binary: Conversion rate (clicked or not)
- Continuous: Average engagement, revenue, time spent
- Categorical: Plan type, user tier, etc.
- Unit of analysis: Unique user or unique observation
Outcome Analysis Steps:
- Choose the right statistical test based on the test family and variant:
  - binary → Z-test for proportions
  - continuous (independent) → Two-sample t-test
  - continuous (paired) → Paired t-test
  - categorical → Chi-square test of independence
- Calculate test statistics, p-values, and confidence intervals
- Visualize the comparison to aid interpretation
def run_ab_test(
df,
group_col,
metric_col,
group_labels,
test_family,
variant=None,
alpha=0.05
):
"""
Runs the correct statistical test based on test_family + variant combo.
Returns:
- result dict with summary stats, test used, p-value, and test-specific values
"""
group1, group2 = group_labels
data1 = df[df[group_col] == group1][metric_col]
data2 = df[df[group_col] == group2][metric_col]
result = {
'test_family': test_family,
'variant': variant,
'group_labels': group_labels,
'alpha': alpha,
'summary': {}
}
# --- Summary Stats ---
result['summary'][group1] = {
'n': len(data1),
'mean': data1.mean(),
'std': data1.std() if test_family in ['t_test', 'non_parametric'] else None,
'sum': data1.sum() if test_family == 'z_test' else None
}
result['summary'][group2] = {
'n': len(data2),
'mean': data2.mean(),
'std': data2.std() if test_family in ['t_test', 'non_parametric'] else None,
'sum': data2.sum() if test_family == 'z_test' else None
}
# --- Binary Proportions (Z-Test) ---
if test_family == 'z_test':
x1, n1 = data1.sum(), len(data1)
x2, n2 = data2.sum(), len(data2)
p_pooled = (x1 + x2) / (n1 + n2)
se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
z_stat = (x2/n2 - x1/n1) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
result.update({'test': 'z-test for proportions', 'z_stat': z_stat, 'p_value': p_value})
# --- Continuous (T-Test) ---
elif test_family == 't_test':
if variant == 'independent':
t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False)
result.update({'test': 'independent t-test', 't_stat': t_stat, 'p_value': p_value})
elif variant == 'paired':
if len(data1) != len(data2):
raise ValueError("Paired test requires equal-length matching samples.")
t_stat, p_value = stats.ttest_rel(data1, data2)
result.update({'test': 'paired t-test', 't_stat': t_stat, 'p_value': p_value})
else:
raise ValueError("Missing or invalid variant for t-test.")
# --- Continuous (Non-parametric) ---
elif test_family == 'non_parametric':
u_stat, p_value = stats.mannwhitneyu(data1, data2, alternative='two-sided')
result.update({'test': 'Mann-Whitney U Test', 'u_stat': u_stat, 'p_value': p_value})
# --- Categorical (Chi-square) ---
elif test_family == 'chi_square':
contingency = pd.crosstab(df[group_col], df[metric_col])
chi2, p_value, _, _ = stats.chi2_contingency(contingency)
result.update({'test': 'chi-square test', 'chi2_stat': chi2, 'p_value': p_value})
else:
raise ValueError(f"Unsupported test_family: {test_family}")
return result
result = run_ab_test(
df=users,
group_col='group',
metric_col=test_config['outcome_metric_col'],
group_labels=test_config['group_labels'],
test_family=test_config['family'],
variant=test_config.get('variant'),
alpha=0.05
)
result
{'test_family': 't_test', 'variant': 'independent', 'group_labels': ('control', 'treatment'), 'alpha': 0.05, 'summary': {'control': {'n': 510, 'mean': 51.02515924585304, 'std': 15.60949221088591, 'sum': None}, 'treatment': {'n': 490, 'mean': 50.31059463655223, 'std': 15.354069556161138, 'sum': None}}, 'test': 'independent t-test', 't_stat': 0.7297273039756266, 'p_value': 0.4657282187278262}
Results
Summaries
def summarize_ab_test_result(result):
"""
Prints A/B test results summary with statistical test outputs and lift analysis.
"""
test_family = result['test_family']
variant = result.get('variant')
group1, group2 = result['group_labels']
p_value = result.get('p_value')
alpha = result.get('alpha', 0.05)
print("\n" + "="*45)
print(f"๐งช A/B Test Result Summary [{test_family.upper()}]")
print("="*45)
# ---- Hypothesis Test Output ----
print("\n๐ Hypothesis Test Result")
print(f"Test used: {result.get('test', 'N/A')}")
if 'z_stat' in result:
print(f"Z-statistic: {result['z_stat']:.4f}")
elif 't_stat' in result:
print(f"T-statistic: {result['t_stat']:.4f}")
elif 'chi2_stat' in result:
print(f"Chi2-statistic: {result['chi2_stat']:.4f}")
elif 'u_stat' in result:
print(f"U-statistic: {result['u_stat']:.4f}")
if p_value is not None:
print(f"P-value : {p_value:.4f}")
print("โ
Statistically significant difference detected." if p_value < alpha else "๐ซ No significant difference detected.")
else:
print("โ ๏ธ P-value not found.")
# ---- Summary Table ----
print("\n๐ Group Summary:\n")
display(pd.DataFrame(result['summary']).T)
# ---- Lift Analysis (for Z-test or T-test (independent)) ----
if test_family in ['z_test', 't_test'] and (variant == 'independent' or test_family == 'z_test'):
group1_mean = result['summary'][group1]['mean']
group2_mean = result['summary'][group2]['mean']
lift = group2_mean - group1_mean
pct_lift = lift / group1_mean if group1_mean else np.nan
print("\n๐ Lift Analysis")
print(f"- Absolute Lift : {lift:.4f}")
print(f"- Percentage Lift : {pct_lift:.2%}")
try:
n1 = result['summary'][group1]['n']
n2 = result['summary'][group2]['n']
if test_family == 'z_test':
se = np.sqrt(group1_mean * (1 - group1_mean) / n1 + group2_mean * (1 - group2_mean) / n2)
else:
sd1 = result['summary'][group1].get('std')
sd2 = result['summary'][group2].get('std')
se = np.sqrt((sd1 ** 2) / n1 + (sd2 ** 2) / n2)
z = 1.96
ci_low = lift - z * se
ci_high = lift + z * se
print(f"- 95% CI for Lift : [{ci_low:.4f}, {ci_high:.4f}]")
except Exception as e:
print(f"โ ๏ธ Could not compute confidence interval: {e}")
print("="*45 + "\n")
summarize_ab_test_result(result)
============================================= A/B Test Result Summary [T_TEST] ============================================= Hypothesis Test Result Test used: independent t-test T-statistic: 0.7297 P-value : 0.4657 No significant difference detected. Group Summary:
 | n | mean | std | sum |
---|---|---|---|---|
control | 510.0 | 51.025159 | 15.609492 | NaN |
treatment | 490.0 | 50.310595 | 15.354070 | NaN |
Lift Analysis - Absolute Lift : -0.7146 - Percentage Lift : -1.40% - 95% CI for Lift : [-2.6338, 1.2047] =============================================
Visualization
def plot_ab_test_results(result):
"""
Plots A/B test results by group mean or distribution depending on test family.
"""
test_family = result['test_family']
variant = result.get('variant')
group1, group2 = result['group_labels']
print("\n๐ Visualization:")
if test_family in ['z_test', 't_test', 'non_parametric']:
labels = [group1, group2]
values = [result['summary'][group1]['mean'], result['summary'][group2]['mean']]
plt.bar(labels, values, color=['gray', 'skyblue'])
for i, val in enumerate(values):
label = f"{val:.2%}" if test_family == 'z_test' else f"{val:.2f}"
plt.text(i, val + 0.01, label, ha='center')
ylabel = "Conversion Rate" if test_family == 'z_test' else "Average Value"
plt.ylabel(ylabel)
plt.title(f"{ylabel} by Group")
plt.ylim(0, max(values) * 1.2)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()
elif test_family == 'chi_square':
dist = pd.DataFrame(result['summary'])
dist.T.plot(kind='bar', stacked=True)
plt.title(f"Categorical Distribution by Group")
plt.ylabel("Proportion")
plt.xlabel("Group")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()
plot_ab_test_results(result)
Visualization:
95% Confidence Intervals
for the outcome in each group
Click to Expand
- The 95% confidence interval gives a range in which we expect the true conversion rate (or mean outcome) to fall for each group.
- If the confidence intervals do not overlap, it's strong evidence that the difference is statistically significant.
- If they do overlap, it doesn't guarantee insignificance — you still need the p-value to decide — but it suggests caution when interpreting lift.
def plot_confidence_intervals(result, z=1.96):
"""
Plot 95% confidence intervals for group means (conversion rate or continuous).
"""
test_family = result['test_family']
variant = result.get('variant')
group1, group2 = result['group_labels']
summary = result['summary']
if test_family not in ['z_test', 't_test']:
print(f"โ ๏ธ CI plotting not supported for test family: {test_family}")
return
if test_family == 't_test' and variant != 'independent':
print(f"โ ๏ธ CI plotting only supported for independent t-tests.")
return
p1, p2 = summary[group1]['mean'], summary[group2]['mean']
n1, n2 = summary[group1]['n'], summary[group2]['n']
if test_family == 'z_test':
se1 = np.sqrt(p1 * (1 - p1) / n1)
se2 = np.sqrt(p2 * (1 - p2) / n2)
ylabel = "Conversion Rate"
else:
sd1 = summary[group1]['std']
sd2 = summary[group2]['std']
se1 = sd1 / np.sqrt(n1)
se2 = sd2 / np.sqrt(n2)
ylabel = "Mean Outcome"
ci1 = (p1 - z * se1, p1 + z * se1)
ci2 = (p2 - z * se2, p2 + z * se2)
plt.errorbar([group1, group2],
[p1, p2],
yerr=[[p1 - ci1[0], p2 - ci2[0]], [ci1[1] - p1, ci2[1] - p2]],
fmt='o', capsize=10, color='black')
plt.ylabel(ylabel)
plt.title(f"{ylabel} with 95% Confidence Intervals")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plot_confidence_intervals(result)
๐ Lift Analysis
AKA the 95% confidence interval for the difference in outcomes
๐ Click to Expand
This confidence interval helps quantify uncertainty around the observed lift between treatment and control groups. It answers:
- How large is the difference between groups?
- How confident are we in this lift estimate?
We compute a 95% CI for the difference in means (or proportions), not just for each group. If this interval does not include 0, we can reasonably trust there is a true difference. If it does include 0, the observed difference might be due to random chance.
This complements the p-value: the p-value tells us whether the difference is significant, while the CI tells us how big the effect is and how uncertain we are about it.
def compute_lift_confidence_interval(result):
"""
Compute CI for lift in binary or continuous-independent tests.
"""
test_family = result['test_family']
variant = result.get('variant')
group1, group2 = result['group_labels']
alpha = result.get('alpha', 0.05)
z = 1.96
print("\n" + "="*45)
print(f"๐ 95% CI for Difference in Outcome [{test_family}]")
print("="*45)
if test_family == 'z_test' or (test_family == 't_test' and variant == 'independent'):
m1 = result['summary'][group1]['mean']
m2 = result['summary'][group2]['mean']
lift = m2 - m1
n1 = result['summary'][group1]['n']
n2 = result['summary'][group2]['n']
if test_family == 'z_test':
se = np.sqrt(m1 * (1 - m1) / n1 + m2 * (1 - m2) / n2)
else:
sd1 = result['summary'][group1]['std']
sd2 = result['summary'][group2]['std']
se = np.sqrt((sd1 ** 2) / n1 + (sd2 ** 2) / n2)
ci_low = lift - z * se
ci_high = lift + z * se
print(f"- Absolute Lift : {lift:.4f}")
print(f"- 95% Confidence Interval: [{ci_low:.4f}, {ci_high:.4f}]")
if ci_low > 0:
print("โ
Likely positive impact (CI > 0)")
elif ci_high < 0:
print("๐ซ Likely negative impact (CI < 0)")
else:
print("๐คท CI includes 0 โ not statistically significant.")
elif test_family == 't_test' and variant == 'paired':
print("- Paired test: CI already accounted for in test logic.")
elif test_family == 'chi_square':
print("- Categorical test: per-category lift analysis required (not implemented).")
print("="*45 + "\n")
compute_lift_confidence_interval(result)
=============================================
๐ 95% CI for Difference in Outcome [t_test]
=============================================
- Absolute Lift : -0.7146
- 95% Confidence Interval: [-2.6338, 1.2047]
๐คท CI includes 0 โ not statistically significant.
=============================================
โ Final Conclusionยถ
def print_final_ab_test_summary(result):
"""
Final wrap-up of results with summary stats and verdict.
"""
test_family = result['test_family']
variant = result.get('variant')
group1, group2 = result['group_labels']
p_value = result.get('p_value')
alpha = result.get('alpha', 0.05)
print("="*40)
print(" ๐ FINAL A/B TEST SUMMARY")
print("="*40)
if test_family == 'z_test' or (test_family == 't_test' and variant == 'independent'):
mean1 = result['summary'][group1]['mean']
mean2 = result['summary'][group2]['mean']
lift = mean2 - mean1
pct_lift = lift / mean1 if mean1 else np.nan
label = "Conversion rate" if test_family == 'z_test' else "Avg outcome"
test_name = result.get("test", "A/B test")
print(f"๐ฅ {group1.capitalize()} {label:<20}: {mean1:.4f}")
print(f"๐งช {group2.capitalize()} {label:<20}: {mean2:.4f}")
print(f"๐ Absolute lift : {lift:.4f}")
print(f"๐ Percentage lift : {pct_lift:.2%}")
print(f"๐งช P-value (from {test_name}) : {p_value:.4f}")
elif test_family == 't_test' and variant == 'paired':
print("๐งช Paired T-Test was used to compare within-user outcomes.")
print(f"๐งช P-value: {p_value:.4f}")
elif test_family == 'chi_square':
print("๐งช Chi-square test was used to compare categorical distributions.")
print(f"๐งช P-value: {p_value:.4f}")
else:
print("โ ๏ธ Unsupported test type.")
print("-" * 40)
if p_value is not None:
if p_value < alpha:
print("โ
RESULT: Statistically significant difference detected.")
else:
print("โ RESULT: No statistically significant difference detected.")
else:
print("โ ๏ธ No p-value available.")
print("="*40 + "\n")
print_final_ab_test_summary(result)
========================================
๐ FINAL A/B TEST SUMMARY
========================================
๐ฅ Control Avg outcome : 51.0252
๐งช Treatment Avg outcome : 50.3106
๐ Absolute lift : -0.7146
๐ Percentage lift : -1.40%
๐งช P-value (from independent t-test) : 0.4657
----------------------------------------
โ RESULT: No statistically significant difference detected.
========================================
โฑ๏ธ How Long to Run the Test?
๐ Click to Expand
The duration of an A/B test depends on how quickly you reach the required sample size per group, as estimated during your power analysis.
โ Key Inputs
- Daily volume of eligible observations (users, sessions, or orders โ depends on your unit of analysis)
- Required sample size per group (from power analysis)
- Traffic split ratio (e.g., 50/50, 10/90, 33/33/33)
๐งฎ Formula
Test Duration (in days) = Required Sample Size per Group ÷ (Daily Eligible Observations × Group Split Proportion)
This ensures the experiment runs long enough to detect the expected effect with the desired confidence and power.
๐ก Planning Tips
- Estimate required sample size using power analysis (based on effect size, baseline, alpha, and power)
- Understand your traffic:
- Whatโs your average daily eligible traffic?
- What unit of analysis is used (user, session, impression)?
- Apply group split:
- e.g., for a 50/50 A/B test, each group gets 50% of traffic
- Estimate days using the formula above.
๐ง Real-World Considerations
- ✅ Ramp-Up Period
Gradually increase traffic exposure: 5% → 25% → 50% → full traffic.
Helps catch bugs, stability issues, and confounding edge cases early.
- ✅ Cool-Down Buffer
Avoid ending tests on weekends, holidays, or during unusual traffic spikes.
Add buffer days so your conclusions aren't skewed by anomalies.
- ✅ Trust Checks Before Analysis
- A/A testing to verify setup
- SRM checks to confirm user distribution
- Monitor guardrail metrics (e.g., bounce rate, latency, load time)
๐ฃ๏ธ Common Practitioner Advice
"We calculate sample size using power analysis, then divide by daily traffic per group. But we always factor in buffer days for ramp-up, trust checks, and stability. Better safe than sorry."
"Power analysis is the starting point. But we don't blindly stop when we hit N. We monitor confidence intervals, metric stability, and coverage to make sure we're making decisions the business can trust."
def estimate_test_duration(
required_sample_size_per_group,
daily_eligible_users,
allocation_ratios=(0.5, 0.5),
buffer_days=2,
test_family=None # renamed from experiment_type
):
"""
Estimate test duration based on sample size, traffic, and allocation.
Parameters:
- required_sample_size_per_group: int
- daily_eligible_users: int โ total incoming traffic per day
- allocation_ratios: tuple โ traffic share per group (e.g., 50/50)
- buffer_days: int โ extra time for ramp-up or anomalies
- test_family: str โ optional metadata for clarity
Returns:
- dict with group durations and total estimated runtime
"""
group_durations = []
for alloc in allocation_ratios:
users_per_day = daily_eligible_users * alloc
days = required_sample_size_per_group / users_per_day if users_per_day else float('inf')
group_durations.append(np.ceil(days))
longest_group_runtime = int(max(group_durations))
total_with_buffer = longest_group_runtime + buffer_days
print("\n๐งฎ Estimated Test Duration")
if test_family:
print(f"- Test family : {test_family}")
print(f"- Required sample per group : {required_sample_size_per_group}")
print(f"- Daily eligible traffic : {daily_eligible_users}")
print(f"- Allocation ratio : {allocation_ratios}")
print(f"- Longest group runtime : {longest_group_runtime} days")
print(f"- Buffer days : {buffer_days}")
print(f"โ
Total estimated duration : {total_with_buffer} days\n")
return {
'test_family': test_family,
'per_group_days': group_durations,
'longest_group_runtime': longest_group_runtime,
'recommended_total_duration': total_with_buffer
}
daily_eligible_users = 1000
allocation_ratios = (0.5, 0.5)
buffer_days = 2
test_duration_result = estimate_test_duration(
required_sample_size_per_group=test_config['required_sample_size'],
daily_eligible_users=daily_eligible_users,
allocation_ratios=allocation_ratios,
buffer_days=buffer_days,
test_family=test_config['family']
)
test_duration_result
๐งฎ Estimated Test Duration
- Test family : t_test
- Required sample per group : 154
- Daily eligible traffic : 1000
- Allocation ratio : (0.5, 0.5)
- Longest group runtime : 1 days
- Buffer days : 2
โ Total estimated duration : 3 days
{'test_family': 't_test', 'per_group_days': [1.0, 1.0], 'longest_group_runtime': 1, 'recommended_total_duration': 3}
๐งญ Monitoring Dashboard Components
๐ Click to Expand
- Overall Test Health
- Start/end date, traffic ramp-up %, time remaining
- SRM (Sample Ratio Mismatch) indicator
- P-value and effect size summary (updated daily)
- Primary Metric Tracking
- Daily trends for primary outcome (conversion, revenue, etc.)
- Cumulative lift + confidence intervals
- Statistical significance tracker (p-value, test stat)
- Guardrail Metrics
- Bounce rate, load time, checkout errors, etc.
- Alert thresholds (e.g., +10% increase in latency)
- Trend vs baseline and prior experiments
- Segment Drilldowns
- Platform (iOS vs Android), geography, user tier
- Detect heterogeneous treatment effects
- Option to toggle test results per segment
- Cohort Coverage
- Total users assigned vs eligible
- Daily inclusion and exclusion trends
- Debugging filters (e.g., why user X didnโt get assigned)
- Variance & Stability Checks
- Volatility of key metrics
- Pre vs post baseline comparisons
- Funnel conversion variance analysis
- Notes & Annotations
- Manual tagging of major incidents (e.g., bug fix deployed, pricing change)
- Timeline of changes affecting experiment interpretation
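The sketch below ties a few of these dashboard components together in code. It is a minimal, hypothetical example rather than part of the pipeline above: it reuses `users`, `test_config`, and `my_seed` from earlier cells, simulates an `assigned_date` column (the sample data has none), and reports cumulative group counts, a daily SRM chi-square check, and cumulative lift per day.
import numpy as np
import pandas as pd
from scipy import stats

# Simulate an assignment date purely for illustration (hypothetical column).
np.random.seed(my_seed)
monitor_df = users.copy()
monitor_df['assigned_date'] = pd.Timestamp('2024-01-01') + pd.to_timedelta(
    np.random.randint(0, 14, size=len(monitor_df)), unit='D'
)

def daily_monitoring_snapshot(df, test_config, metric_col=None, expected_split=(0.5, 0.5), srm_alpha=0.001):
    """
    One row per day: cumulative counts per group, SRM chi-square p-value,
    and cumulative lift (treatment mean - control mean) on the outcome metric.
    """
    group1, group2 = test_config['group_labels']
    metric_col = metric_col or test_config['outcome_metric_col']
    rows = []
    for day in sorted(df['assigned_date'].unique()):
        cum = df[df['assigned_date'] <= day]
        n1 = (cum['group'] == group1).sum()
        n2 = (cum['group'] == group2).sum()
        expected = [(n1 + n2) * expected_split[0], (n1 + n2) * expected_split[1]]
        _, srm_p = stats.chisquare([n1, n2], f_exp=expected)   # sample ratio mismatch check
        lift = (cum.loc[cum['group'] == group2, metric_col].mean()
                - cum.loc[cum['group'] == group1, metric_col].mean())
        rows.append({'date': day, f'n_{group1}': n1, f'n_{group2}': n2,
                     'srm_p_value': srm_p, 'srm_flag': srm_p < srm_alpha,
                     'cumulative_lift': lift})
    return pd.DataFrame(rows)

daily_monitoring_snapshot(monitor_df, test_config).tail()
In practice these numbers would come from the experiment's logging tables, and a table like this would feed the health, SRM, and primary-metric panels listed above.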
๐ Post Hoc Analysis
๐ Click to Expand
After statistical significance, post-hoc analysis helps connect results to business confidence.
It's not just whether it worked, but how, for whom, and at what cost or benefit.
๐ง Why Post Hoc Analysis Matters
- Segments may respond differently โ average lift may hide underperformance in subgroups
- Guardrails may show collateral damage (e.g., slower load time, higher churn)
- Stakeholders need impact translation โ what does this mean in revenue, retention, or strategy?
๐ Typical Post Hoc Questions
- Segment Lift
- Did certain platforms, geos, cohorts, or user types benefit more?
- Any negative lift in high-value user segments?
- Guardrail Checks
- Did the treatment impact non-primary metrics (e.g., latency, engagement, bounce rate)?
- Were alert thresholds breached?
- Business Impact Simulation
- How does the observed lift scale to 100% of eligible users?
- Whatโs the projected change in conversions, revenue, or user satisfaction?
- Edge Case Discovery
- Any bugs, instrumentation gaps, or unexpected usage patterns?
- Did any user types get excluded disproportionately?
๐ What to Report
Area | What to Show |
---|---|
Segment Analysis | Table or chart showing lift per segment, sorted by effect size or risk |
Guardrail Metrics | Summary table of guardrails vs baseline, with thresholds or annotations |
Revenue Simulation | Projected uplift × traffic volume × conversion = business impact |
Confidence Range | 95% CI for key metrics per segment (wherever possible) |
Rollout Readiness | Any blockers, mitigations, or next steps if full rollout is considered |
๐ก Pro Tip
Even if your p-value says "yes," business rollout is a risk-based decision.
Post-hoc analysis is where statistical rigor meets product judgment.
๐งฉ Segmented Lift
๐ Click to Expand
Segmented lift tells us how different user segments responded to the treatment.
Why It Matters:
- Uncovers hidden heterogeneity โ The overall average might mask variation across platforms, geographies, or user tiers.
- Identifies high-risk or high-reward cohorts โ Some segments might benefit more, while others could be negatively impacted.
- Guides rollout and targeting decisions โ Helps decide where to prioritize feature exposure, or where to mitigate risk.
Typical Segments:
- Device type (e.g., mobile vs desktop)
- Region (e.g., North vs South)
- User lifecycle (e.g., new vs returning)
- Platform (e.g., iOS vs Android)
"Segmentation answers who is benefiting (or suffering) โ not just whether it worked on average."
def visualize_segment_lift(df_segment, segment_col):
"""
Plots horizontal bar chart of mean lift per segment (Treatment - Control).
"""
df_viz = df_segment.dropna(subset=['lift']).sort_values(by='lift', ascending=False)
if df_viz.empty:
print(f"โ ๏ธ No lift data to visualize for '{segment_col}'\n")
return
plt.figure(figsize=(8, 0.4 * len(df_viz) + 2))
bars = plt.barh(df_viz[segment_col], df_viz['lift'], color='skyblue')
for bar, val in zip(bars, df_viz['lift']):
plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f"{val:.2f}", va='center', ha='left', fontsize=9)
plt.axvline(0, color='gray', linestyle='--')
plt.title(f"Lift from Control to Treatment by {segment_col}")
plt.xlabel("Mean Difference (Treatment โ Control)")
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
def analyze_segment_lift(
df,
test_config,
segment_cols=['platform', 'device_type', 'user_tier', 'region'],
min_count_per_group=30,
visualize=True
):
"""
Post-hoc lift analysis per segment (e.g., by platform or region).
"""
group_col = 'group'
group1, group2 = test_config['group_labels']
metric_col = test_config['outcome_metric_col']
outcome_type = test_config['outcome_metric_datatype']
variant = test_config['variant']
test_family = test_config['family']
for segment in segment_cols:
print(f"\n๐ Segmenting by: {segment}")
seg_data = []
for val in df[segment].dropna().unique():
subset = df[df[segment] == val]
g1 = subset[subset[group_col] == group1][metric_col]
g2 = subset[subset[group_col] == group2][metric_col]
if len(g1) < min_count_per_group or len(g2) < min_count_per_group:
print(f"โ ๏ธ Skipping '{val}' under '{segment}' โ too few users.")
continue
lift = g2.mean() - g1.mean()
p_value = None
if test_family == 'z_test':
# Binary: z-test on proportions
p1, n1 = g1.mean(), len(g1)
p2, n2 = g2.mean(), len(g2)
pooled_p = (g1.sum() + g2.sum()) / (n1 + n2)
se = np.sqrt(pooled_p * (1 - pooled_p) * (1/n1 + 1/n2))
p_value = 2 * (1 - stats.norm.cdf(abs((p2 - p1) / se)))
elif test_family == 't_test':
if variant == 'independent':
_, p_value = stats.ttest_ind(g1, g2)
elif variant == 'paired':
print(f"โ ๏ธ Paired test not supported in segmented lift โ skipped '{val}' under '{segment}'.")
lift, p_value = np.nan, None
elif test_family == 'chi_square':
print(f"โ ๏ธ Categorical data โ lift not defined for '{val}' in '{segment}'.")
lift, p_value = np.nan, None
seg_data.append({
segment: val,
'count_control': len(g1),
'count_treatment': len(g2),
'mean_control': g1.mean(),
'mean_treatment': g2.mean(),
'std_control': g1.std(),
'std_treatment': g2.std(),
'lift': lift,
'p_value_lift': p_value
})
df_segment = pd.DataFrame(seg_data)
display(df_segment)
if visualize:
visualize_segment_lift(df_segment, segment)
analyze_segment_lift(
df=users,
test_config=test_config,
segment_cols=['platform', 'device_type', 'user_tier', 'region'],
min_count_per_group=30,
visualize=True
)
๐ Segmenting by: platform
platform | count_control | count_treatment | mean_control | mean_treatment | std_control | std_treatment | lift | p_value_lift | |
---|---|---|---|---|---|---|---|---|---|
0 | iOS | 306 | 283 | 51.473511 | 51.038554 | 15.720604 | 15.505342 | -0.434958 | 0.735711 |
1 | Android | 204 | 207 | 50.352631 | 49.315366 | 15.455377 | 15.125414 | -1.037265 | 0.492071 |
๐ Segmenting by: device_type
device_type | count_control | count_treatment | mean_control | mean_treatment | std_control | std_treatment | lift | p_value_lift | |
---|---|---|---|---|---|---|---|---|---|
0 | mobile | 343 | 346 | 50.996739 | 50.737705 | 15.483445 | 15.166347 | -0.259034 | 0.824508 |
1 | desktop | 167 | 144 | 51.083532 | 49.284344 | 15.912052 | 15.802427 | -1.799188 | 0.319324 |
๐ Segmenting by: user_tier
user_tier | count_control | count_treatment | mean_control | mean_treatment | std_control | std_treatment | lift | p_value_lift | |
---|---|---|---|---|---|---|---|---|---|
0 | new | 188 | 192 | 50.270166 | 52.095652 | 15.670024 | 15.297974 | 1.825486 | 0.251242 |
1 | returning | 322 | 298 | 51.465963 | 49.160490 | 15.581513 | 15.305550 | -2.305472 | 0.063864 |
๐ Segmenting by: region
region | count_control | count_treatment | mean_control | mean_treatment | std_control | std_treatment | lift | p_value_lift | |
---|---|---|---|---|---|---|---|---|---|
0 | North | 135 | 120 | 51.205860 | 51.554078 | 15.118047 | 15.206938 | 0.348218 | 0.854882 |
1 | South | 147 | 113 | 50.736052 | 50.185396 | 16.563319 | 15.340048 | -0.550655 | 0.784045 |
2 | West | 106 | 126 | 50.974336 | 50.275715 | 13.928319 | 15.292805 | -0.698620 | 0.718471 |
3 | East | 122 | 131 | 51.217713 | 49.313070 | 16.501732 | 15.653598 | -1.904643 | 0.347038 |
๐ฆ Guardrail Metrics
๐ Click to Expand
Guardrail metrics are non-primary metrics tracked during an experiment to ensure the feature doesn't create unintended negative consequences.
We monitor them alongside the main success metric to:
- ๐ Catch regressions in user behavior or system performance
- ๐ Detect trade-offs (e.g., conversion improves but bounce rate worsens too)
- ๐ Block rollouts if a feature does more harm than good
๐งช How We Check
- Run statistical tests on each guardrail metric just like we do for the primary metric
- Use the same experiment type (binary, continuous, etc.) for evaluation
- Report p-values and lift to assess significance and direction
- Focus more on risk detection than optimization
๐ Common Guardrail Metrics
Type | Examples |
---|---|
UX Health | Bounce Rate, Session Length, Engagement |
Performance | Page Load Time, API Latency, CPU Usage |
Reliability | Error Rate, Crash Rate, Timeout Errors |
Behavioral | Scroll Depth, Page Views per Session |
โ When to Act
- If the treatment significantly worsens a guardrail metric โ investigate
- If the primary metric improves but guardrails suffer, assess trade-offs
- Use p-values, lift, and domain context to guide decision-making
๐ง Why Guardrails Matter
"We don't just care if a metric moves, we care what else it moved. Guardrails give us confidence that improvements aren't hiding regressions elsewhere."
# Quick average check by group
guardrail_avg = users.groupby('group')['bounce_rate'].mean()
print("๐ฆ Average Bounce Rate by Group:")
for grp, val in guardrail_avg.items():
print(f"- {grp}: {val:.4f}")
๐ฆ Average Bounce Rate by Group:
- control: 0.5464
- treatment: 0.5679
def evaluate_guardrail_metric(
df,
test_config,
guardrail_metric_col='bounce_rate',
alpha=0.05
):
"""
Checks for statistically significant changes in guardrail metric (e.g., bounce rate).
Parameters:
- df : pd.DataFrame โ experiment dataset
- test_config : dict โ contains group info, variant, etc.
- guardrail_metric_col : str โ column name of guardrail metric
- alpha : float โ significance level (default 0.05)
Returns:
- None (prints result)
"""
group_col = 'group'
control, treatment = test_config['group_labels']
control_vals = df[df[group_col] == control][guardrail_metric_col]
treatment_vals = df[df[group_col] == treatment][guardrail_metric_col]
mean_control = control_vals.mean()
mean_treatment = treatment_vals.mean()
diff = mean_treatment - mean_control
t_stat, p_val = ttest_ind(treatment_vals, control_vals)
print(f"\n๐ฆ Guardrail Metric Check โ '{guardrail_metric_col}'\n")
print(f"- {control:10}: {mean_control:.4f}")
print(f"- {treatment:10}: {mean_treatment:.4f}")
print(f"- Difference : {diff:+.4f}")
print(f"- P-value (t-test): {p_val:.4f}")
if p_val < alpha:
if diff > 0:
print("โ Significant *increase* โ potential negative impact on guardrail.")
else:
print("โ
Significant *decrease* โ potential positive impact.")
else:
print("๐ก No statistically significant change โ guardrail looks stable.")
evaluate_guardrail_metric(
df=users,
test_config=test_config,
guardrail_metric_col='bounce_rate',
alpha=0.05
)
๐ฆ Guardrail Metric Check โ 'bounce_rate'
- control : 0.5464
- treatment : 0.5679
- Difference : +0.0215
- P-value (t-test): 0.0325
โ Significant *increase* โ potential negative impact on guardrail.
๐ง Correcting for Multiple Comparisonsยถ
๐ Why p-values can't always be trusted
When we test multiple segments, multiple metrics, or multiple variants, we increase the risk of false positives (Type I errors). This is known as the Multiple Comparisons Problem, and it's dangerous in data-driven decision-making.
๐ Example Scenario:ยถ
We run A/B tests on:
- Overall population โ
- By platform โ
- By user tier โ
- By region โ
If we test 10 hypotheses at the 0.05 significance level, the chance of at least one false positive is roughly 40% (1 - 0.95^10 ≈ 0.40).
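A quick back-of-the-envelope check of that ~40% figure (added here for illustration; this is not a cell from the original notebook):
# Probability of at least one false positive across m independent tests at level alpha
alpha, m = 0.05, 10
fwer = 1 - (1 - alpha) ** m
print(f"Family-wise error rate with {m} tests at alpha={alpha}: {fwer:.1%}")  # ~40.1%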
โ Correction Methodsยถ
Method | Use Case | Risk |
---|---|---|
Bonferroni | Very strict, controls Family-Wise Error Rate (FWER) | โ๏ธ Conservative |
Benjamini-Hochberg | Controls False Discovery Rate (FDR) | ๐ฅ Balanced |
๐ง In Practice:ยถ
We calculate raw p-values for each segment, and then apply corrections to get adjusted p-values.
If the adjusted p-values are still significant, the result is robust.
โ๏ธ Bonferroni Correctionยถ
๐ FWER Control (Click to Expand)
Bonferroni is the most conservative correction method. It adjusts the p-value threshold by dividing it by the number of comparisons.
- Formula: adjusted_alpha = alpha / num_tests
- Equivalently: adjusted_p = p * num_tests (capped at 1)
- If even one adjusted p-value < 0.05, it's very likely a real effect
๐ Best for: High-risk decisions (e.g., medical trials, irreversible launches)
โ ๏ธ Drawback: May miss true positives (higher Type II error)
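As a sanity check of the formula above, here is a tiny hand-rolled Bonferroni sketch. It uses the same four illustrative p-values as the `multipletests` example further below; it is an illustration of the formula, not the notebook's correction code.
# Manual Bonferroni on example p-values (illustrative only)
raw_p = [0.03, 0.06, 0.02, 0.10]                # same example p-values used below
m = len(raw_p)
adjusted_alpha = 0.05 / m                       # stricter per-test threshold
adjusted_p = [min(p * m, 1.0) for p in raw_p]   # equivalent adjusted p-values, capped at 1
for p, p_adj in zip(raw_p, adjusted_p):
    verdict = "significant" if p_adj < 0.05 else "not significant"
    print(f"raw p={p:.2f} -> adjusted p={p_adj:.2f} ({verdict}); adjusted alpha = {adjusted_alpha:.4f}")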
๐ฌ Benjamini-Hochberg (BH) Procedureยถ
๐ FDR Control (Click to Expand)
BH controls the expected proportion of false discoveries (i.e., false positives among all positives). It:
- Ranks p-values from smallest to largest
- Compares each to (i/m) * alpha, where i = rank and m = total number of tests
๐ง Important: After adjustment, BH enforces monotonicity by capping earlier (smaller) ranks so they never exceed later ones.
In simple terms: adjusted p-values never decrease as rank increases.
The largest raw p-value that satisfies the (i/m) * alpha inequality becomes the threshold; it and all smaller p-values are considered significant.
๐ Best for: Exploratory research, product experiments with many segments
๐ก Advantage: More power than Bonferroni, still controls errors
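To make the ranking and monotonicity steps explicit, here is a small hand-rolled BH sketch (illustrative only). With the same four example p-values it should reproduce the `fdr_bh` column computed by `multipletests` in the next cell.
import numpy as np

# Manual Benjamini-Hochberg adjustment (step-up procedure)
raw_p = np.array([0.03, 0.06, 0.02, 0.10])      # same example p-values used below
m = len(raw_p)
order = np.argsort(raw_p)                       # rank p-values from smallest to largest
ranked_p = raw_p[order]
bh = ranked_p * m / np.arange(1, m + 1)         # p_(i) * m / i
bh = np.minimum.accumulate(bh[::-1])[::-1]      # enforce monotonicity: adjusted p never decreases with rank
bh = np.minimum(bh, 1.0)
for rank, (p, p_adj) in enumerate(zip(ranked_p, bh), start=1):
    print(f"rank {rank}: raw p={p:.2f} -> BH-adjusted p={p_adj:.2f}")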
import pandas as pd
from statsmodels.stats.multitest import multipletests
# Original inputs
segment_names = ['North', 'South', 'East', 'West']
p_vals = [0.03, 0.06, 0.02, 0.10]
# Create DataFrame and sort by raw p-values BEFORE correction
df = pd.DataFrame({
'Segment': segment_names,
'Raw_pValue': p_vals
}).sort_values('Raw_pValue').reset_index(drop=True)
# Apply corrections to the sorted p-values
_, bonf, _, _ = multipletests(df['Raw_pValue'], alpha=0.05, method='bonferroni')
_, bh, _, _ = multipletests(df['Raw_pValue'], alpha=0.05, method='fdr_bh')
# Add to DataFrame
df['Bonferroni_Adj_pValue'] = bonf
df['BH_Adj_pValue'] = bh
df
Segment | Raw_pValue | Bonferroni_Adj_pValue | BH_Adj_pValue | |
---|---|---|---|---|
0 | East | 0.02 | 0.08 | 0.06 |
1 | North | 0.03 | 0.12 | 0.06 |
2 | South | 0.06 | 0.24 | 0.08 |
3 | West | 0.10 | 0.40 | 0.10 |
# Plot p values - raw and adjusted
plt.figure(figsize=(8, 5))
# Plot lines
plt.plot(df.index + 1, df['Raw_pValue'], marker='o', label='Raw p-value')
plt.plot(df.index + 1, df['Bonferroni_Adj_pValue'], marker='^', label='Bonferroni Adj p-value')
plt.plot(df.index + 1, df['BH_Adj_pValue'], marker='s', label='BH Adj p-value')
# Add value labels next to each point
for i in range(len(df)):
x = i + 1
plt.text(x + 0.05, df['Raw_pValue'][i], f"{df['Raw_pValue'][i]:.2f}", va='center')
plt.text(x + 0.05, df['Bonferroni_Adj_pValue'][i], f"{df['Bonferroni_Adj_pValue'][i]:.2f}", va='center')
plt.text(x + 0.05, df['BH_Adj_pValue'][i], f"{df['BH_Adj_pValue'][i]:.2f}", va='center')
# Axis & labels
plt.xticks(df.index + 1, df['Segment']);
plt.axhline(0.05, color='gray', linestyle='--', label='ฮฑ = 0.05');
plt.xlabel("Segment (Ranked by Significance)");
plt.ylabel("p-value");
plt.title("p-value Correction: Bonferroni vs Benjamini Hochberg (FDR)");
plt.legend();
plt.tight_layout();
plt.show();
๐ช Novelty Effects & Behavioral Decay
๐ Why First Impressions Might Lie (Click to Expand)
๐ช Novelty Effects & Behavioral Decayยถ
Even if an A/B test shows a statistically significant lift, that improvement may not last.
This often happens due to novelty effects, i.e. short-term spikes in engagement driven by:
- Curiosity ("What's this new feature?")
- Surprise ("This looks different!")
- Visual attention (e.g., placement or color changes)
๐ Common Signs of Novelty Effectsยถ
- Strong lift in week 1 → drops by week 3.
- High initial usage → no long-term retention.
- Positive metrics in one segment only (e.g., "new users").
๐งญ What We Do About Itยถ
To address this risk during rollouts:
- โ Monitor metrics over time post-launch (e.g., 7, 14, 28-day retention)
- โ Compare results across early adopters vs late adopters
- โ Run holdout experiments during phased rollout to detect fading impact
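A minimal sketch of the "monitor metrics over time" check above. Everything here is simulated for illustration (the `week_since_launch` column and the fading lift are assumptions, not data from this notebook): the idea is simply to re-estimate the lift week by week after launch and watch whether it shrinks.
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

rng = np.random.default_rng(0)
records = []
for week in [1, 2, 3, 4]:
    decay = max(0.0, 1 - 0.4 * (week - 1))           # simulated novelty decay factor
    records.append(pd.DataFrame({
        'week_since_launch': week,
        'group': ['control'] * 300 + ['treatment'] * 300,
        'engagement_score': np.concatenate([
            rng.normal(50, 15, 300),                  # control stays flat
            rng.normal(50 + 3 * decay, 15, 300),      # treatment lift fades over time
        ]),
    }))
post_launch = pd.concat(records, ignore_index=True)

for week, wk_df in post_launch.groupby('week_since_launch'):
    treat = wk_df.loc[wk_df['group'] == 'treatment', 'engagement_score']
    ctrl = wk_df.loc[wk_df['group'] == 'control', 'engagement_score']
    _, p = ttest_ind(treat, ctrl)
    print(f"Week {week}: lift = {treat.mean() - ctrl.mean():+.2f}, p = {p:.3f}")
A lift that is strong in week 1 but drifts toward 0 (and loses significance) in later weeks is the classic novelty-effect signature.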
๐ฏ Primacy Effect & Order Bias
๐ When First = Best (Click to Expand)
Sometimes, the position of a variant or option can distort results โ especially if it's shown first. This is called the primacy effect, a type of cognitive bias.
It often shows up in:
- Feed ranking or content ordering experiments
- Option selection (e.g., first dropdown item)
- Surveys or in-app prompts
๐ฉ Common Indicatorsยถ
- Variant A always performs better regardless of content
- Metrics drop when position is swapped
- Discrepancy between test and real-world usage
๐งญ What We Do About Itยถ
To minimize primacy bias:
- โ Randomize order of options or content
- โ Use position-aware metrics (e.g., click-through by slot)
- โ Validate with follow-up tests using rotated or reversed orders
๐ฒ Rollout Simulation
๐ Click to Expand
Once statistical significance is established, it's useful to simulate business impact from full rollout.
Assume full exposure to eligible daily traffic, and estimate incremental impact from the observed lift.
This helps stakeholders understand the real-world benefit of implementing the change.
We typically estimate:
- ๐ Daily lift (e.g., additional conversions, dollars, sessions)
- ๐ Monthly extrapolation (daily lift ร 30)
def simulate_rollout_impact(
experiment_result,
daily_eligible_observations,
metric_unit='conversions'
):
"""
Estimate potential impact of rolling out the treatment to all eligible traffic.
Parameters:
- experiment_result: dict
Output of `run_ab_test()` โ must contain summary + group_labels
- daily_eligible_observations: int
Number of eligible units per day (users, sessions, transactions, etc.)
- metric_unit: str
What the metric represents (e.g., 'conversions', 'revenue', 'clicks')
Prints daily and monthly lift estimates.
"""
group1, group2 = experiment_result['group_labels']
summary = experiment_result['summary']
# Extract means
mean_control = summary[group1]['mean']
mean_treatment = summary[group2]['mean']
observed_lift = mean_treatment - mean_control
# Impact calculation
daily_impact = observed_lift * daily_eligible_observations
monthly_impact = daily_impact * 30
# Output
print("\n๐ฆ Rollout Simulation")
print(f"- Outcome Metric : {metric_unit}")
print(f"- Observed Lift : {observed_lift:.4f} per unit")
print(f"- Daily Eligible Units: {daily_eligible_observations}")
print(f"- Estimated Daily Impact : {daily_impact:,.0f} {metric_unit}/day")
print(f"- Estimated Monthly Impact : {monthly_impact:,.0f} {metric_unit}/month\n")
# Derive daily volume from actual data
daily_traffic_estimate = users.shape[0] # Assuming full traffic per day
simulate_rollout_impact(
experiment_result=result, # Output from run_ab_test()
daily_eligible_observations=daily_traffic_estimate,
metric_unit=test_config['outcome_metric_col'] # Dynamic label like 'engagement_score' or 'revenue'
)
๐ฆ Rollout Simulation
- Outcome Metric : engagement_score
- Observed Lift : -0.7146 per unit
- Daily Eligible Units: 1000
- Estimated Daily Impact : -715 engagement_score/day
- Estimated Monthly Impact : -21,437 engagement_score/month
๐งช A/B Test Holdouts
๐ Why We Sometimes Don't Ship to 100% (Click to Expand)
๐งช A/B Test Holdoutsยถ
Even after a successful A/B test, we often maintain a small holdout group during rollout.
This helps us:
- Track long-term impact beyond the experiment window.
- Detect novelty fade or unexpected side effects.
- Maintain a clean "control" for system-wide benchmarking.
๐ข Industry Practiceยถ
- Common at large orgs like Facebook, where teams share a holdout pool for all feature launches.
- Holdouts help leadership evaluate true impact during performance reviews and roadmap planning.
โ ๏ธ When We Skip Holdoutsยถ
- Bug fixes or critical updates (e.g., spam, abuse, policy violations).
- Sensitive changes like content filtering (e.g., child safety flags).
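A minimal sketch of how a rollout holdout could be carved out (the hashing scheme, salt, and 5% share are assumptions for illustration, not the notebook's method). Deterministic hashing keeps each user's assignment stable across sessions and deploys.
import hashlib

def rollout_assignment(user_id, salt="feature_x_rollout", holdout_pct=0.05):
    """Assign ~holdout_pct of users to a persistent 'holdout'; everyone else gets the rollout."""
    bucket = int(hashlib.md5(f"{salt}:{user_id}".encode()).hexdigest(), 16) % 10_000
    return 'holdout' if bucket < holdout_pct * 10_000 else 'rollout'

rollout_df = users[['user_id']].copy()
rollout_df['rollout_group'] = rollout_df['user_id'].apply(rollout_assignment)
print(rollout_df['rollout_group'].value_counts(normalize=True).round(3))
The holdout slice keeps the old experience, so long-term impact can later be re-measured with the same test machinery used earlier in this notebook.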
๐ซ Limits & Alternatives
๐ When Not to A/B Test & What to Do Instead (Click to Expand)
๐ โโ๏ธ When Not to A/B Testยถ
- Lack of infrastructure → No tracking, engineering, or experiment setup.
- Lack of impact → Not worth the effort if the feature has minimal upside; shipping features has downstream implications (support, bugs, operations).
- Lack of traffic → Can't reach statistical significance in a reasonable time.
- Lack of conviction → No strong hypothesis; testing dozens of variants blindly.
- Lack of isolation → Hard to contain exposure (e.g., testing a new logo everyone sees).
๐งช Alternatives & Edge Casesยถ
- Use user interviews or logs to gather directional signals.
- Leverage retrospective data for pre/post comparisons.
- Consider sequential testing or soft rollouts for low-risk changes.
- Use design experiments (e.g., multivariate, observational) when randomization isn't feasible.
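For the pre/post alternative mentioned above, a minimal sketch is shown below. The data, column names, and launch date are all made up for illustration; a comparison like this is weaker evidence than a randomized test because seasonality and other confounders are not controlled.
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

rng = np.random.default_rng(7)
daily = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=56, freq='D'),
    'conversion_rate': np.concatenate([
        rng.normal(0.100, 0.008, 28),    # 4 weeks before the (hypothetical) launch
        rng.normal(0.106, 0.008, 28),    # 4 weeks after
    ]),
})
launch_date = pd.Timestamp('2024-01-29')
pre = daily.loc[daily['date'] < launch_date, 'conversion_rate']
post = daily.loc[daily['date'] >= launch_date, 'conversion_rate']
t_stat, p_val = ttest_ind(post, pre)
print(f"Pre-launch mean : {pre.mean():.4f}")
print(f"Post-launch mean: {post.mean():.4f}")
print(f"Difference      : {post.mean() - pre.mean():+.4f} (p = {p_val:.4f})")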