import numpy as np
import matplotlib.pyplot as plt

# Law of Large Numbers demo: the running mean of simulated die rolls is
# plotted against the theoretical expected value of 3.5.

# Number of trials
num_trials = 10000

# Simulate dice rolls (randint excludes the upper bound, so faces are 1..6)
dice_rolls = np.random.randint(1, 7, size=num_trials)

# 1-based count of rolls made so far. Used as the divisor of the running mean
# AND as the x-axis, so the average after n rolls is drawn at x = n rather
# than at the 0-based index n - 1.
roll_counts = np.arange(1, num_trials + 1)

# Compute cumulative averages
cumulative_averages = np.cumsum(dice_rolls) / roll_counts

# Plot the convergence
plt.figure(figsize=(10, 6))
plt.plot(roll_counts, cumulative_averages, label="Cumulative Average")
plt.axhline(y=3.5, color='r', linestyle='--', label="Expected Value (3.5)")
plt.title("Law of Large Numbers - Dice Rolls")
plt.xlabel("Number of Rolls")
plt.ylabel("Cumulative Average")
plt.legend()
plt.grid()
plt.show()

# Build a large synthetic population drawn from a normal distribution
population_mean = 50
population_std = 10
population = np.random.normal(loc=population_mean, scale=population_std, size=100000)

# For each sample size from 1 to 10000, draw a fresh random sample of that
# size from the population and record its mean
sample_sizes = np.arange(1, 10001)
sample_means = []
for n in sample_sizes:
    drawn = np.random.choice(population, size=n)
    sample_means.append(drawn.mean())

# Show how the sample mean behaves as the sample size grows
plt.figure(figsize=(10, 6))
plt.plot(sample_sizes, sample_means, label="Sample Mean")
plt.axhline(y=population_mean, color='r', linestyle='--', label="Population Mean (50)")
plt.xlabel("Sample Size")
plt.ylabel("Sample Mean")
plt.title("Law of Large Numbers - Population Sampling")
plt.grid()
plt.legend()
plt.show()

# Law of Large Numbers demo (single die): running mean of the rolls compared
# with the theoretical expected value of 3.5.

# Number of rolls
num_rolls = 10000

# Simulate die rolls (randint excludes the upper bound, so faces are 1..6)
rolls = np.random.randint(1, 7, size=num_rolls)

# 1-based count of rolls made so far. It is both the divisor of the running
# mean and the x-axis, so the average after n rolls is drawn at x = n instead
# of at the 0-based index n - 1.
roll_counts = np.arange(1, num_rolls + 1)

# Calculate cumulative averages
cumulative_averages = np.cumsum(rolls) / roll_counts

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(roll_counts, cumulative_averages, label="Cumulative Average", color="blue")
plt.axhline(y=3.5, color='red', linestyle='--', label="Expected Value (3.5)")
plt.title("Law of Large Numbers - Rolling a Die")
plt.xlabel("Number of Rolls")
plt.ylabel("Cumulative Average")
plt.legend()
plt.grid()
plt.show()

# Central Limit Theorem demo: for several differently shaped populations,
# compare the histogram of the population with the histogram of the means of
# many random samples drawn from it.

# Parameters
# NOTE: population_mean / population_std are inputs to the generators below,
# not guaranteed moments of every population. For example the shifted
# exponential has mean population_mean + population_std, and the Chi-Square
# and Beta populations do not use them at all.
population_mean = 35  # Location value passed to the generators that accept one.
population_std = 15  # Scale/spread value passed to the generators that accept one.
population_size = 100000  # Number of values generated for each simulated population.
sample_size = 50  # The number of data points in each random sample drawn from the population.
num_samples = 10000  # The total number of samples to draw from each population.
num_bins = 30  # The number of bins (intervals) used for every histogram.

# Distributions to plot. Each entry is a zero-argument factory so a fresh
# population is generated only when the loop below reaches it.
distributions = {
    "Exponential": lambda: np.random.exponential(scale=population_std, size=population_size) + population_mean,
    "Uniform": lambda: np.random.uniform(low=population_mean - 3 * population_std,
                                         high=population_mean + 3 * population_std,
                                         size=population_size),
    "Normal": lambda: np.random.normal(loc=population_mean, scale=population_std, size=population_size),
    "Lognormal": lambda: np.random.lognormal(mean=np.log(population_mean), sigma=0.5, size=population_size),
    "Poisson": lambda: np.random.poisson(lam=population_mean, size=population_size),
    "Chi-Square": lambda: np.random.chisquare(df=10, size=population_size),
    "Beta": lambda: np.random.beta(a=2, b=5, size=population_size) * 70,  # Scale beta for a better range
}

# Initialize the figure: one row per distribution, population on the left and
# sample means on the right. squeeze=False keeps `axes` two-dimensional even
# if `distributions` is reduced to a single entry.
fig, axes = plt.subplots(
    len(distributions), 2,
    figsize=(14, 4 * len(distributions)),
    squeeze=False,
)

# Iterate through distributions
for (dist_name, dist_func), (population_ax, means_ax) in zip(distributions.items(), axes):
    # Generate population data
    population = dist_func()

    # Collect sample means: draw all num_samples samples (with replacement)
    # in one vectorized call, one sample per row, then average each row.
    sample_means = np.random.choice(population, size=(num_samples, sample_size)).mean(axis=1)

    # Plot the population distribution
    population_ax.hist(population, bins=num_bins, color='skyblue', edgecolor='black')
    population_ax.set_title(f'{dist_name} Distribution (Population)')
    population_ax.set_xlabel('Value')
    population_ax.set_ylabel('Frequency')

    # Plot the sample means distribution
    means_ax.hist(sample_means, bins=num_bins, color='lightgreen', edgecolor='black')
    means_ax.set_title(f'{dist_name} Distribution (Sample Means)')
    means_ax.set_xlabel('Sample Mean')
    means_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Fix NumPy's global seed so the figures printed below are reproducible
np.random.seed(0)

# Draw an exponential sample to stand in for non-normal data
data = np.random.exponential(scale=1.0, size=10_000)

# Summary statistics of the sample (std uses NumPy's default ddof=0)
mu = data.mean()
sigma = data.std()

# Report the sample and its summary statistics, one item per line
print(
    "Simulated a non-normal dataset (exponential).",
    f"Mean (μ): {mu:.2f}",
    f"Standard Deviation (σ): {sigma:.2f}",
    sep="\n",
)

Simulated a non-normal dataset (exponential).
Mean (μ): 0.99
Standard Deviation (σ): 1.00

# Multiples of the standard deviation to examine (k = 1..5)
k_values = np.arange(1, 6)

# For each k, record
#   - the observed share of points strictly within k standard deviations of the mean
#   - Chebyshev's guaranteed minimum share, 1 - 1/k^2
empirical_props = []
chebyshev_bounds = []
for k in k_values:
    within = np.abs(data - mu) < k * sigma
    empirical_props.append(within.mean())
    chebyshev_bounds.append(1 - 1 / (k ** 2))

# Side-by-side table of observed share vs. guaranteed minimum
print("Proportion of data within ±k standard deviations vs Chebyshev bound:")
print(f"{'k':<5} {'Empirical Proportion':<25} {'Chebyshev Bound':<20}")
for row in zip(k_values, empirical_props, chebyshev_bounds):
    k, emp, bound = row
    print(f"{k:<5} {emp:<25.3f} {bound:<20.3f}")

Proportion of data within ±k standard deviations vs Chebyshev bound:
k     Empirical Proportion      Chebyshev Bound     
1     0.864                     0.000               
2     0.950                     0.750               
3     0.983                     0.889               
4     0.994                     0.938               
5     0.998                     0.960

# Plot the observed proportion and Chebyshev's lower bound against k
plt.figure(figsize=(8, 5))
plt.plot(k_values, empirical_props, marker='o', label='Empirical Proportion')
plt.plot(k_values, chebyshev_bounds, marker='x', linestyle='--', label="Chebyshev's Bound")

# Annotate every point: observed value just above its marker, bound just below
for k, emp, bound in zip(k_values, empirical_props, chebyshev_bounds):
    plt.text(k, emp + 0.02, f"{emp:.2f}", ha='center')
    plt.text(k, bound - 0.08, f"{bound:.2f}", ha='center', color='orange')

plt.ylim(0, 1.05)
plt.xlabel("k (standard deviations)")
plt.ylabel("Proportion within ±kσ")
plt.legend()
plt.grid(True)
plt.show()

# Worked example: screening for a rare disease.
#
# Setup:
#   - Base rate (prior): 1% of people have the disease
#   - Sensitivity (true positive rate): 90%
#   - Specificity (true negative rate): 95%
# Question: given a positive result, how likely is it that the person
# really has the disease?

prior = 0.01
sensitivity = 0.90  # P(Positive | Disease)
specificity = 0.95  # P(Negative | No Disease)
false_positive = 1 - specificity  # P(Positive | No Disease)

# Total probability of a positive result, P(Positive):
# true positives among the sick plus false positives among the healthy
evidence = sensitivity * prior + false_positive * (1 - prior)

# Bayes' theorem: P(Disease | Positive)
posterior = sensitivity * prior / evidence

# Business-friendly output
print(
    "🔍 Medical Testing Example",
    f"Chance of disease before test: {prior:.2%}",
    f"Chance of actually having disease after positive test: {posterior:.2%}",
    sep="\n",
)

🔍 Medical Testing Example
Chance of disease before test: 1.00%
Chance of actually having disease after positive test: 15.38%

def bayes_update(prior, sensitivity, specificity):
    """
    Compute posterior probability using Bayes' Theorem.

    P(Disease | Positive) = P(Positive | Disease) * P(Disease) / P(Positive),
    where P(Positive) sums the true-positive and false-positive paths.

    Parameters:
    - prior: P(Disease), base rate
    - sensitivity: P(Positive | Disease)
    - specificity: P(Negative | No Disease)

    Each argument must lie in [0, 1]. Scalars are the expected input; NumPy
    arrays are also accepted and are processed element-wise.

    Returns:
    - posterior: P(Disease | Positive)

    Raises:
    - ValueError: if any argument is outside [0, 1].
    - ZeroDivisionError: for scalar inputs, if P(Positive) is zero (a positive
      test can never occur, so the posterior is undefined).
    """
    for name, value in (
        ("prior", prior),
        ("sensitivity", sensitivity),
        ("specificity", specificity),
    ):
        as_array = np.asarray(value, dtype=float)
        if np.any((as_array < 0) | (as_array > 1)):
            raise ValueError(f"{name} must be a probability in [0, 1], got {value!r}")

    # P(Positive and Disease) and P(Positive and No Disease)
    true_positive = sensitivity * prior
    false_positive = (1 - specificity) * (1 - prior)

    # P(Positive)
    evidence = true_positive + false_positive

    # Python floats would raise here anyway, but NumPy scalars would silently
    # yield NaN; raise the same, descriptive error for both.
    if np.ndim(evidence) == 0 and evidence == 0:
        raise ZeroDivisionError(
            "P(Positive) is zero for these inputs; the posterior is undefined"
        )

    return true_positive / evidence

# Example run. The inputs are named once so the printed summary always shows
# exactly the values that were passed to bayes_update.
prior = 0.01
sensitivity = 0.90
specificity = 0.95
posterior = bayes_update(prior=prior, sensitivity=sensitivity, specificity=specificity)

# Interpretation print block
print("📊 Bayesian Update")
print(f"🧠 Prior belief (base rate): {prior:.2%}")
print(f"✅ Sensitivity (true positive rate): {sensitivity:.2%}")
print(f"❌ False positive rate: {1 - specificity:.2%}")
print(f"\n📈 Updated belief after a positive test: {posterior:.2%}")
print("\n🧾 Interpretation:")
print(f"Out of 100 people who test positive, about {posterior * 100:.1f} are actually positive.")

📊 Bayesian Update
🧠 Prior belief (base rate): 1.00%
✅ Sensitivity (true positive rate): 90.00%
❌ False positive rate: 5.00%

📈 Updated belief after a positive test: 15.38%

🧾 Interpretation:
Out of 100 people who test positive, about 15.4 are actually positive.

# Sweep the base rate while holding the test's accuracy fixed, to show how
# much the posterior depends on the prior
priors = [0.001, 0.01, 0.05, 0.1, 0.25]
sensitivity = 0.90
specificity = 0.95

print("🎯 How prior belief affects the updated probability after a positive test:")
print(f"{'Prior':<10} {'Posterior':<15} {'Interpretation'}")
for base_rate in priors:
    updated = bayes_update(base_rate, sensitivity, specificity)
    row = f"{base_rate:<10.3f} {updated:<15.3f} About {updated * 100:.1f} out of 100 positives are truly positive"
    print(row)

🎯 How prior belief affects the updated probability after a positive test:
Prior      Posterior       Interpretation
0.001      0.018           About 1.8 out of 100 positives are truly positive
0.010      0.154           About 15.4 out of 100 positives are truly positive
0.050      0.486           About 48.6 out of 100 positives are truly positive
0.100      0.667           About 66.7 out of 100 positives are truly positive
0.250      0.857           About 85.7 out of 100 positives are truly positive

import matplotlib.pyplot as plt

# Evaluate the posterior on 100 evenly spaced priors between 0.001 and 0.5
priors = np.linspace(0.001, 0.5, 100)
posteriors = []
for p in priors:
    posteriors.append(bayes_update(p, sensitivity, specificity))

# Draw the posterior as a function of the prior
plt.figure(figsize=(8, 5))
plt.plot(priors, posteriors, label="Posterior", color='blue')
plt.title("How Prior Belief Affects Updated Belief")
plt.xlabel("Prior Probability (P(Disease))")
plt.ylabel("Posterior Probability (P(Disease | Positive Test))")
plt.legend()
plt.grid(True)
plt.show()

k (σ)	Minimum % of data within ±kσ
1	0%
2	75%
3	88.9%
4	93.75%
5	96%

Term	Meaning
Prior	Your initial belief before seeing any new evidence
Likelihood	How probable the new evidence is if that belief is true
Evidence	The overall probability of seeing the evidence
Posterior	Your updated belief after incorporating the evidence

📖 Table of Contents¶

📊 Law of Large Numbers¶

📌 Key Points

🎲 Examples

`Observations`¶

📈 Central Limit Theorem¶

📌 Key Components

❓ Why is the CLT Important?

✅ Conditions for the CLT

📐 Formula for Standard Error of the Mean

🎯 Chebyshev’s Inequality¶

🧠 Theorem¶

💡 Why It Matters in Data Science¶

`Key Thresholds`¶

🧠 `Observations`¶

🔁 Bayes' Theorem¶

🧠 Theorem¶

💡 Why It Matters¶

🧩 `Bayes' Theorem Components`¶

🏢 `Business Relevance`¶

🧠 `Observations & Takeaways`¶

📖 Table of Contents¶

📊 Law of Large Numbers¶

📌 Key Points

🎲 Examples

Observations¶

📈 Central Limit Theorem¶

📌 Key Components

❓ Why is the CLT Important?

✅ Conditions for the CLT

📐 Formula for Standard Error of the Mean

🎯 Chebyshev’s Inequality¶

🧠 Theorem¶

💡 Why It Matters in Data Science¶

Key Thresholds¶

🧠 Observations¶

🔁 Bayes' Theorem¶

🧠 Theorem¶

💡 Why It Matters¶

🧩 Bayes' Theorem Components¶

🏢 Business Relevance¶

🧠 Observations & Takeaways¶

`Observations`¶

`Key Thresholds`¶

🧠 `Observations`¶

🧩 `Bayes' Theorem Components`¶

🏢 `Business Relevance`¶

🧠 `Observations & Takeaways`¶