
📖 Dimension Reduction¶

📉 Dimensionality Reduction Overview

  • 🚧 Key Challenges in High Dimensions

🗂️ Data Setup

  • 🧾 Sample Data

📊 Principal Component Analysis (PCA)

  • 🔄 Standardizing the Features
  • 📈 Scree Plot: Cumulative Explained Variance
  • 📉 Apply PCA: Reducing to n Principal Components
  • 📊 PCA Loadings: Feature Contributions
  • 📊 Visualizing Feature Contributions to PCAs
  • 🧭 Visualizing PCA Results in 2D
  • 📈 Variance Explained by Each Component
  • 🧾 Visual Comparison: Original vs PCA-Transformed Data

🌌 t-SNE (t-Distributed Stochastic Neighbor Embedding)

  • 🔍 Intuition Behind t-SNE
  • 🎯 When Should You Use t-SNE?
  • ⚙️ How t-SNE Works
  • 🧪 Key Parameters and Tuning Tips
  • 📊 Visualizing t-SNE Output
  • 🚫 Limitations and Cautions

🌐 UMAP (Uniform Manifold Approximation & Projection)

  • 🔬 UMAP vs t-SNE
  • ⚙️ How UMAP Works
  • 🛠️ Implementation
  • 📊 Visualizing UMAP Output
  • 🔁 Reusing UMAP for Feature Reduction

📐 Linear Discriminant Analysis (LDA)

  • 🔢 How LDA Works
  • 🧮 Step-by-Step Breakdown
  • 📉 Dimensionality Constraint
  • 🛠️ Implementation
  • 📊 Visualizing LDA Output

🔗 Canonical Correlation Analysis (CCA)

  • 🧬 When It’s Useful
  • 🔄 Intuition: PCA for Two Views
  • 📉 Dimensionality Reduction via Correlation
  • 🛠️ Implementation
  • 📊 Visualizing CCA Output

Back to the top


📉 Dimensionality Reduction Overview¶

📖 Click to Expand

Dimensionality reduction refers to techniques that transform high-dimensional data into a lower-dimensional space while preserving as much useful structure or signal as possible.

These methods are valuable across both modeling and business contexts:

  • 🔄 Simplifying data for faster computation and easier storage
  • 📉 Reducing overfitting by eliminating noise or redundant features
  • 👀 Visualizing hidden structure in 2D or 3D
  • 📊 Improving model interpretability by focusing on key components

This notebook covers several popular approaches (PCA, t-SNE, UMAP, LDA, and CCA), each with distinct goals, assumptions, and business use cases.


🚧 Key Challenges in High Dimensions¶

📖 Click to Expand

High-dimensional datasets (e.g., 100+ features) often seem rich, but they pose several practical issues:
❌ Curse of Dimensionality¶
  • Distance metrics (like Euclidean) lose meaning
  • Feature space becomes sparse — hard to model effectively
  • Models require exponentially more data to generalize
💻 Computational Overhead¶
  • More dimensions = higher training time
  • Resource-intensive for models like clustering or k-NN
👁️ Visualization Limitations¶
  • Human intuition maxes out at 3D — we need projection techniques to reveal structure

Dimensionality reduction helps address these problems by transforming the data into lower-dimensional, information-rich representations, enabling both insight and performance.
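To make the distance issue concrete, here is a minimal sketch (synthetic random data, not the notebook's dataset) comparing the nearest and farthest pairwise distances as the number of dimensions grows; a ratio near 1 means nearest and farthest neighbors are almost equally far away.

import numpy as np
from scipy.spatial.distance import pdist

rng = np.random.default_rng(42)

# Ratio of the smallest to the largest pairwise distance among random points.
# As dimensionality grows the ratio approaches 1: distances stop discriminating.
for d in [2, 10, 100, 1000]:
    points = rng.uniform(size=(500, d))
    dists = pdist(points)  # all pairwise Euclidean distances
    print(f"d={d:>4}: min/max distance ratio = {dists.min() / dists.max():.3f}")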


Back to the top


🗂️ Data Setup¶

🧾 Sample Data¶

📖 Click to Expand

We'll frame a retail-style scenario where each row represents a customer and each column captures a behavioral signal, such as purchase frequency, monetary value, or product interaction features. For a compact, reproducible example, the code below uses the Iris dataset as a stand-in, with species playing the role of the customer segment.

The goal is to apply dimensionality reduction techniques to:

  • Identify underlying customer personas
  • Compress features for modeling efficiency
  • Enable 2D/3D visualization of customer segments or churn patterns
In [4]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn modules
from sklearn.datasets import load_digits, load_iris
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
In [6]:
# Load iris
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')

# Optionally attach for LDA later
df = X.copy()
df['segment'] = y.replace({0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'})

# Preview
print(df.shape)
df.head()
(150, 5)
Out[6]:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm) segment
0                5.1               3.5                1.4               0.2  Setosa
1                4.9               3.0                1.4               0.2  Setosa
2                4.7               3.2                1.3               0.2  Setosa
3                4.6               3.1                1.5               0.2  Setosa
4                5.0               3.6                1.4               0.2  Setosa

Back to the top


📊 Principal Component Analysis (PCA)¶

🔄 Standardizing the Features¶

📖 Click to Expand

Before applying PCA, it's important to standardize the dataset.

PCA is sensitive to the scale of features — variables with larger numeric ranges (e.g., monetary) would otherwise dominate the components just because of their magnitude.

We use StandardScaler to transform all features to have:

  • Mean = 0
  • Standard Deviation = 1

This ensures that each feature contributes equally when computing variance and covariance.

We exclude the segment label from this transformation since it's categorical and not part of the PCA input space.

In [7]:
# Automatically select only numeric columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.select_dtypes(include=[np.number]))
scaled_data[0:5]
Out[7]:
array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

📈 Scree Plot: Cumulative Explained Variance¶

📖 Click to Expand

Once PCA is fitted on the scaled data, we calculate the cumulative explained variance — which tells us how much total information is retained by the top N principal components.

This plot helps answer a key question:

"How many components do we actually need to keep most of the signal?"

The elbow or saturation point in the curve is often used as a natural cutoff. Annotating the cumulative variance percentages also makes it easier for business stakeholders to interpret the tradeoff between dimensionality and information retention.

In [8]:
# Fit PCA to get all components
pca_scree_plot = PCA()
pca_scree_plot.fit(scaled_data)

# Calculate cumulative explained variance
explained_variance = pca_scree_plot.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Plot
plt.figure(figsize=(10, 5))
plt.plot(
    range(1, len(cumulative_variance) + 1),
    cumulative_variance,
    marker='o',
    label='Cumulative Variance',
    linewidth=2,
    color='#007acc'
)

# Annotate percentage values
for i, var in enumerate(cumulative_variance):
    plt.text(i + 1, var, f'{var * 100:.1f}%', ha='center', va='bottom', fontsize=9)

# Bold red 90% threshold line
threshold = 0.90
plt.axhline(y=threshold, color='red', linestyle='--', linewidth=2)
plt.text(1, threshold + 0.015, f'{int(threshold * 100)}% Threshold', color='red', fontsize=10)

# Styling
plt.xticks(range(1, len(cumulative_variance) + 1))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot: Variance Retained by Principal Components', fontsize=13)
plt.legend(loc='lower right', fontsize=9)
plt.tight_layout()
plt.show()

📉 Apply PCA: Reducing to n Principal Components¶

📖 Click to Expand

Here, we reduce the dataset to its top n principal components using PCA. With n = 2, this projects the high-dimensional data onto a plane, letting us see how well separated the classes (species) are.

Each row in the output represents a transformed version of the original flower measurements, now compressed into PC1 and PC2 — the directions of highest variance in the data.


In [9]:
# Fit full PCA
# pca_scree_plot = PCA()
# pca_scree_plot.fit(scaled_data)

# Get explained variance ratio and cumulative variance
explained_variance = pca_scree_plot.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Print marginal gain and cumulative contribution per PC
print("Principal Component Breakdown:\n")
print(f"{'Component':<10}{'Explains':>15}{'Cumulative':>20}{'Δ Cumulative':>20}")
print("-" * 70)

for i, (individual, cumulative) in enumerate(zip(explained_variance, cumulative_variance)):
    delta_cum = cumulative if i == 0 else cumulative - cumulative_variance[i - 1]
    print(f"PC{i+1:<8}{individual * 100:>13.2f}%{cumulative * 100:>18.2f}%{delta_cum * 100:>18.2f}%")

# Business-friendly summary + automated component selector
low_gain_threshold = 0.05  # Define marginal gain threshold
selected_n_components = 1  # Always keep at least 1
print("\nBusiness-Friendly Summary:")
for i in range(len(cumulative_variance)):
    if i == 0:
        print(f"- PC1 explains {explained_variance[i]*100:.2f}% of total variance.")
    else:
        gain = cumulative_variance[i] - cumulative_variance[i - 1]
        print(f"- Adding PC{i+1} increases cumulative variance to {cumulative_variance[i]*100:.2f}% (+{gain*100:.2f}%).")        
        if gain < low_gain_threshold:
            print(f"  ⚠️  Marginal gain is low — you may consider stopping before PC{i+1}.")
            break
        else:
            selected_n_components = i + 1

print(f"\n✅ Auto-selected number of components to retain: {selected_n_components}")
Principal Component Breakdown:

Component        Explains          Cumulative        Δ Cumulative
----------------------------------------------------------------------
PC1               72.96%             72.96%             72.96%
PC2               22.85%             95.81%             22.85%
PC3                3.67%             99.48%              3.67%
PC4                0.52%            100.00%              0.52%

Business-Friendly Summary:
- PC1 explains 72.96% of total variance.
- Adding PC2 increases cumulative variance to 95.81% (+22.85%).
- Adding PC3 increases cumulative variance to 99.48% (+3.67%).
  ⚠️  Marginal gain is low — you may consider stopping before PC3.

✅ Auto-selected number of components to retain: 2
In [10]:
# Apply PCA
pca = PCA(n_components=selected_n_components)  # Reduce to n components for visualization
pca_result = pca.fit_transform(scaled_data)

# Convert results to DataFrame
pc_labels = [f'PC{i+1}' for i in range(selected_n_components)]
pca_df = pd.DataFrame(data=pca_result, columns=pc_labels)
pca_df

# Add target labels for visualization (optional)
# pca_df['Target'] = iris.target
Out[10]:
           PC1       PC2
0    -2.264703  0.480027
1    -2.080961 -0.674134
2    -2.364229 -0.341908
3    -2.299384 -0.597395
4    -2.389842  0.646835
..         ...       ...
145   1.870503  0.386966
146   1.564580 -0.896687
147   1.521170  0.269069
148   1.372788  1.011254
149   0.960656 -0.024332

150 rows × 2 columns

📊 PCA Loadings: Feature Contributions¶

📖 Click to Expand

Loadings tell us how much each original feature contributes to each principal component. Think of it as a weighted recipe:

PC1 ≈ 0.52 × sepal length − 0.27 × sepal width + 0.58 × petal length + 0.56 × petal width

These values help interpret what each component actually means.

Note: The sum of squared loadings per PC is always ~1, due to PCA’s mathematical properties (unit vectors).

In [11]:
loadings = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=df.select_dtypes(include=[np.number]).columns)

print("PCA Loadings:")
print(loadings)

# Sanity check: sum of squared loadings for each PC ≈ 1
print("\nSum of Squared Loadings per PC:")
print((loadings**2).sum(axis=0))
PCA Loadings:
                        PC1       PC2
sepal length (cm)  0.521066  0.377418
sepal width (cm)  -0.269347  0.923296
petal length (cm)  0.580413  0.024492
petal width (cm)   0.564857  0.066942

Sum of Squared Loadings per PC:
PC1    1.0
PC2    1.0
dtype: float64
In [12]:
print("📐 PCA Component Equations:\n")

for pc in loadings.columns:
    coeffs = loadings[pc]
    terms = []
    
    for i, (feature, coeff) in enumerate(coeffs.items()):
        sign = "−" if coeff < 0 else "+"  # Note: en-dash for clean minus
        formatted = f"{abs(coeff):.2f} × {feature}"
        if i == 0:
            terms.append(f"{formatted}" if coeff >= 0 else f"− {formatted}")
        else:
            terms.append(f"{sign} {formatted}")
    
    equation = " ".join(terms)
    print(f"{pc} = {equation}\n")
📐 PCA Component Equations:

PC1 = 0.52 × sepal length (cm) − 0.27 × sepal width (cm) + 0.58 × petal length (cm) + 0.56 × petal width (cm)

PC2 = 0.38 × sepal length (cm) + 0.92 × sepal width (cm) + 0.02 × petal length (cm) + 0.07 × petal width (cm)

📊 Visualizing Feature Contributions to PCAs¶

📖 Click to Expand

These plots help us interpret PCA results more intuitively:
  • The first plot shows raw loadings — how much each original feature contributes to each PC.
  • The second plot shows how each original feature's variance is split across PCs (squared loadings).
  • The third plot flips the view: it shows how each principal component is built from the original features.

Together, they help us answer:

"What are these principal components really made of?"

Raw Loadings Plot (Feature contribution + direction)

In [13]:
import matplotlib.pyplot as plt

# Generate color palette based on number of PCs (plt.get_cmap avoids the deprecated matplotlib.cm.get_cmap)
cmap = plt.get_cmap('tab10')  # Use 'Set1', 'tab20', etc. for variety
num_pcs = len(loadings.columns)
colors = [cmap(i) for i in range(num_pcs)]

# Create the grouped bar chart
fig, ax = plt.subplots(figsize=(10, 6))
bars = loadings.plot(kind='bar', ax=ax, width=0.7, color=colors)

# Plot settings
ax.set_title('Raw Loadings: Feature Influence on Principal Components', fontsize=14)
ax.set_ylabel('Loading Value')
ax.set_xlabel('Original Features')
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
ax.legend(title="Principal Components")
plt.xticks(rotation=45)

# Add numeric annotations on bars
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=9)

# Add explanation text under the plot
plt.figtext(
    0.5, -0.08,
    "Each bar shows how much a feature contributes to a principal component.\n"
    "Positive vs negative direction reflects correlation with that PC axis.",
    wrap=True,
    horizontalalignment='center',
    fontsize=10
)

plt.tight_layout()
plt.show()

Feature-wise Stacked Plot (How each feature is explained across PCs)

In [14]:
import matplotlib.pyplot as plt

# Compute squared loadings
squared_loadings = loadings ** 2

# Generate dynamic color palette
num_pcs = squared_loadings.shape[1]
cmap = plt.get_cmap('tab10')
colors = [cmap(i) for i in range(num_pcs)]

# Create stacked bar chart (feature-wise variance contribution)
fig, ax = plt.subplots(figsize=(10, 6))
bars = squared_loadings.plot(kind='bar', stacked=True, ax=ax, width=0.7, color=colors)

# Formatting
ax.set_title('Variance Distribution Across PCs for Each Feature', fontsize=14)
ax.set_ylabel('Squared Loading Value')
ax.set_xlabel('Original Features')
ax.legend(title="Principal Components")
plt.xticks(rotation=45)

# Annotate center of each segment
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', label_type='center', fontsize=8)

# Generic explanation
plt.figtext(
    0.5, -0.08,
    "Each bar shows how a feature’s variance is distributed across principal components.\n"
    "A more concentrated bar means a feature aligns strongly with fewer PCs.",
    wrap=True,
    horizontalalignment='center',
    fontsize=10
)

plt.tight_layout()
plt.show()

PC-wise Stacked Plot (What each PC is made of)

In [15]:
import matplotlib.pyplot as plt

# Transpose squared loadings: PCs on x-axis, features as stack
squared_T = squared_loadings.T

# Dynamically generate colors based on feature count
num_features = squared_T.shape[1]
cmap = plt.get_cmap('tab20')  # Wider palette for more features
colors = [cmap(i) for i in range(num_features)]

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
bars = squared_T.plot(kind='bar', stacked=True, ax=ax, width=0.7, color=colors)

# Styling
ax.set_title('Feature Composition of Each Principal Component', fontsize=14)
ax.set_ylabel('Squared Loading Value')
ax.set_xlabel('Principal Components')
ax.legend(title="Original Features", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)

# Annotate each bar segment
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', label_type='center', fontsize=8)

# Explanation
plt.figtext(
    0.5, -0.08,
    "Each bar shows how much each original feature contributes to that principal component.\n"
    "Wider segments = stronger influence of that feature on the PC.",
    wrap=True,
    horizontalalignment='center',
    fontsize=10
)

plt.tight_layout()
plt.show()

🧭 Visualizing PCA Results in 2D¶

📖 Click to Expand

This scatter plot projects the Iris dataset onto the first two principal components (PC1 and PC2).

  • Each point is a flower
  • Colors represent the original species labels
  • The separation between clusters tells us how well PCA captures the differences

Notice how Setosa is linearly separable, while Versicolor and Virginica overlap slightly — just like the original feature space.

This view is helpful for:

  • Visual intuition
  • Early clustering
  • Pre-model dimensionality checks
In [16]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

def plot_pca_projection(pca_df, target_col=None, dim=2, fallback_to_2d=True):
    """
    Visualizes PCA projection in 2D or 3D space.
    
    Parameters:
    - pca_df: DataFrame with PCA result columns (PC1, PC2, [PC3])
    - target_col: str or None — optional column for coloring/grouping
    - dim: int — 2 or 3
    - fallback_to_2d: if True and 3D not possible, auto-downgrade to 2D
    
    Raises:
    - ValueError if required PC columns are missing and fallback is False
    """
    assert dim in [2, 3], "Only 2D or 3D plots are supported."
    
    pcs = [f'PC{i+1}' for i in range(dim)]
    missing_pcs = [pc for pc in pcs if pc not in pca_df.columns]

    if missing_pcs:
        if fallback_to_2d and dim == 3:
            print(f"⚠️ PC3 not found. Falling back to 2D projection.")
            return plot_pca_projection(pca_df, target_col=target_col, dim=2, fallback_to_2d=False)
        else:
            raise ValueError(f"PCA DataFrame must contain columns: {pcs}")
    
    # Setup plot
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d' if dim == 3 else None)

    # Determine coloring logic
    if target_col and target_col in pca_df.columns:
        groups = pca_df[target_col].unique()
        cmap = plt.get_cmap('Set1', len(groups))
        colors = [cmap(i) for i in range(len(groups))]

        for group, color in zip(groups, colors):
            subset = pca_df[pca_df[target_col] == group]
            coords = [subset[pc] for pc in pcs]
            ax.scatter(*coords, label=group, color=color, alpha=0.7, edgecolor='k', s=60)
    else:
        coords = [pca_df[pc] for pc in pcs]
        ax.scatter(*coords, color='steelblue', alpha=0.7, edgecolor='k', s=60)

    # Titles and labels
    ax.set_title(f'PCA Projection ({dim}D)', fontsize=14)
    ax.set_xlabel(pcs[0], fontsize=12)
    ax.set_ylabel(pcs[1], fontsize=12)
    if dim == 3:
        ax.set_zlabel(pcs[2], fontsize=12)
    else:
        ax.axhline(0, color='grey', linewidth=0.5, linestyle='--')
        ax.axvline(0, color='grey', linewidth=0.5, linestyle='--')

    if target_col and target_col in pca_df.columns:
        ax.legend(title=target_col, fontsize=9)

    plt.tight_layout()
    plt.show()
In [17]:
# 🟦 2D PCA without any target grouping (all points same color)
plot_pca_projection(pca_df, dim=2)

# 🟨 2D PCA with target-based color grouping (e.g. segment, species, etc.)
# Make sure to attach the original target/segment column from the full dataset
# pca_df['segment'] = df['segment'].values
# plot_pca_projection(pca_df, target_col='segment', dim=2)

# 🟥 3D PCA plot (if at least 3 principal components exist)
# Will automatically downgrade to 2D if PC3 is missing
# plot_pca_projection(pca_df, target_col='segment', dim=3)

📈 Variance Explained by Each Component¶

📖 Click to Expand

This section quantifies how much information (variance) is retained by each principal component.

  • Explained Variance: The proportion of total variance captured by that component
  • Cumulative Variance: How much total variance is retained up to that point

This helps determine how many components are needed to preserve most of the original data’s structure.

In [18]:
# Explained variance by each component
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print("📊 Variance Explained by Principal Components:\n")

for i, (ev, cv) in enumerate(zip(explained_variance, cumulative_variance), start=1):
    if i == 1:
        print(f"• PC{i} alone explains {ev*100:.2f}% of the total variance.")
    else:
        delta = ev * 100
        print(f"• Adding PC{i} contributes an additional {delta:.2f}%, bringing cumulative variance to {cv*100:.2f}%.")
📊 Variance Explained by Principal Components:

• PC1 alone explains 72.96% of the total variance.
• Adding PC2 contributes an additional 22.85%, bringing cumulative variance to 95.81%.

🧾 Visual Comparison: Original vs PCA-Transformed Data¶

In [19]:
from IPython.display import display, HTML

# Sample 10 rows for visual comparison
original = df.select_dtypes(include=[np.number]).sample(10, random_state=42)
transformed = pca_df.loc[original.index]  # Keep rows aligned

# Convert to HTML with inline styling
html = f"""
<div style="display: flex; gap: 40px;">
  <div>
    <h4>📥 Original Data</h4>
    {original.to_html(index=False)}
  </div>
  <div>
    <h4>📤 PCA-Transformed Data</h4>
    {transformed.to_html(index=False)}
  </div>
</div>
"""
display(HTML(html))

📥 Original Data

sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
              6.1               2.8               4.7               1.2
              5.7               3.8               1.7               0.3
              7.7               2.6               6.9               2.3
              6.0               2.9               4.5               1.5
              6.8               2.8               4.8               1.4
              5.4               3.4               1.5               0.4
              5.6               2.9               3.6               1.3
              6.9               3.1               5.1               2.3
              6.2               2.2               4.5               1.5
              5.8               2.7               3.9               1.2

📤 PCA-Transformed Data

      PC1       PC2
 0.632858 -0.416388
-1.898572  1.405019
 3.310696  0.017781
 0.664800 -0.225928
 1.256509 -0.077256
-1.831595  0.423695
-0.033355 -0.439003
 1.901784  0.689575
 1.225094 -1.622244
 0.241538 -0.777256

Back to the top


🌌 t-SNE (t-Distributed Stochastic Neighbor Embedding)¶

🔍 Intuition Behind t-SNE¶

📖 Click to Expand

t-SNE (t-distributed Stochastic Neighbor Embedding) is a technique for reducing high-dimensional data into 2 or 3 dimensions — but unlike PCA, it's non-linear and focused entirely on preserving local structure.

At its core, t-SNE is not about simplifying the data for modeling — it's about revealing patterns you can see.

  • Instead of preserving distances or variance (like PCA), t-SNE preserves neighborhoods — points that are close in high dimensions stay close in the 2D/3D map.
  • It's most commonly used for visualizing latent patterns, especially in cases where traditional methods like PCA fail to reveal clusters.
  • It's powerful for exploring highly non-linear, non-separable data — such as text embeddings, image features, or user behavior vectors.

Think of it as:

“How can we map this complex space into something that’s visually meaningful — even if it’s mathematically warped?”

t-SNE doesn't give you a formula, axes you can interpret, or transformed features you can use in models. But it gives you insight — which is often the first thing you need.

🎯 When Should You Use t-SNE?¶

📖 Click to Expand

t-SNE shines in specific scenarios where interpretability and visualization of complex data are more important than feature transformation.

  • 🧠 Exploring High-Dimensional Patterns
    Use it when traditional plots or summary stats fail to reveal clusters, separation, or structure.

  • 👁️ Human-Centric Visualizations
    Great for dashboards, presentations, or storytelling where you want to show “how your data looks” after compression.

  • 🧪 Unsupervised Insight Discovery
    Use it during early EDA to detect latent groups in unlabeled data (e.g., customer personas, image clusters).

  • 🧭 Validating Cluster Separation
    Complement with KMeans or DBSCAN to visually confirm if your clusters are distinct in embedded space.

  • 📊 Working with Complex Data Types
    t-SNE works well on word embeddings, image embeddings, or any data where relationships aren’t linear.

🔗 Use t-SNE when you're looking for shape, not predictive power.


⚙️ How t-SNE Works¶

📖 Click to Expand

t-SNE works by modeling the pairwise relationships between data points in both the original high-dimensional space and the reduced 2D (or 3D) space.

The goal is simple:

If two points are close together in the original space, they should also be close in the visualization.

Here's a high-level breakdown:

  1. Measure Similarities in High Dimensions

    • It computes how similar each point is to others using a probability based on distance (think: "who are my neighbors?").
    • It uses a parameter called perplexity to control how many neighbors matter.
  2. Map into Lower Dimensions

    • It starts with a random 2D/3D layout of the points.
    • Then, it tries to make the new layout mimic the neighborhood probabilities from the original space.
  3. Optimize the Layout

    • It uses a process that pulls similar points together and pushes dissimilar ones apart.
    • The optimization minimizes a loss function (KL Divergence) that measures how different the low-dimensional relationships are from the high-dimensional ones.

The final result:

  • A 2D (or 3D) scatter plot where closeness means similarity.
  • It’s great for uncovering hidden structure, clusters, and anomalies — even when those relationships are non-linear or tangled.
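As a rough numeric sketch of step 1 (an illustration only: it uses one fixed Gaussian bandwidth instead of the per-point bandwidths t-SNE tunes to match the chosen perplexity, and the toy data is made up), pairwise distances can be converted into neighborhood probabilities like this:

import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(6, 4))  # 6 toy points with 4 features

sigma = 1.0  # fixed bandwidth; real t-SNE picks one per point to hit the chosen perplexity
sq_dists = squareform(pdist(X_toy, 'sqeuclidean'))

# p_{j|i}: probability that point i would pick point j as a neighbor
P = np.exp(-sq_dists / (2 * sigma ** 2))
np.fill_diagonal(P, 0)  # a point is not its own neighbor
P = P / P.sum(axis=1, keepdims=True)

print(np.round(P, 3))  # each row sums to 1

t-SNE then searches for 2D coordinates whose analogous (Student-t based) probabilities match these as closely as possible, with the mismatch measured by KL divergence.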

In [20]:
from sklearn.manifold import TSNE

# Automatically select only numeric columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.select_dtypes(include=[np.number]))
scaled_data[0:5]
Out[20]:
array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

🧪 Key Parameters and Tuning Tips¶

📖 Click to Expand

t-SNE is a sensitive algorithm — small changes in parameters can lead to dramatically different results. Here's how to tune it effectively without trial-and-error chaos:

1. perplexity

  • Think of it as the expected number of neighbors each point considers.
  • Typical range: 5 to 50.
  • Low values (e.g. 5–10): good for very tight clusters.
  • Higher values (e.g. 30–50): useful for smoother global structure.

🧠 Tip: Too low = fragmented plot. Too high = clusters blur into each other.

2. n_iter

  • Number of optimization iterations.
  • Default is 1000 — often enough, but use 2000+ for better stability.

🧠 Tip: You’ll often see t-SNE stabilize around 750–1000 iterations, but complex data needs more.

3. learning_rate

  • Controls how fast t-SNE updates positions.
  • Can be tricky — if set too high or too low, it can stall or explode.
  • Safe starting point: 200 to 500.

🧠 Tip: If your plot collapses into a ball or becomes linear, try tweaking this.

4. init

  • Controls the starting layout:
    • 'random': faster, riskier
    • 'pca': slower, often more stable

🧠 Tip: PCA initialization can make results more consistent across runs.

5. random_state

  • Set this to ensure reproducible results. t-SNE is stochastic.

🧠 Tip: Always fix this when sharing results or comparing runs.

🧩 Final Advice:

  • t-SNE doesn’t optimize interpretability — it optimizes relative closeness.
  • The result is a visual aid, not a numerical transformation.
  • Always validate your plot: Do the clusters make sense? Do they hold across reruns?
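Before committing to one configuration, it can help to run the same data through a few perplexity values and compare the layouts side by side. A minimal sketch (the perplexity values are illustrative; it reuses scaled_data from the cell above):

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Compare a few perplexity values on the same scaled data (illustrative settings)
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, perp in zip(axes, [5, 30, 50]):
    emb = TSNE(n_components=2, perplexity=perp, init='pca', random_state=42).fit_transform(scaled_data)
    ax.scatter(emb[:, 0], emb[:, 1], s=20, alpha=0.7)
    ax.set_title(f'perplexity = {perp}')
plt.tight_layout()
plt.show()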
In [21]:
# t-SNE is a projection for human eyes, not a feature reducer for modeling like PCA, so tsne_n_components > 3 is rarely useful
tsne_n_components = 2

tsne = TSNE(
    n_components=tsne_n_components,
    perplexity=30,
    n_iter=1000,
    learning_rate=200,
    init='pca',
    random_state=42
)
tsne_result = tsne.fit_transform(scaled_data)

# Auto-generate column names: t-SNE1, t-SNE2, ...
tsne_cols = [f"t-SNE{i+1}" for i in range(tsne_n_components)]
tsne_df = pd.DataFrame(tsne_result, columns=tsne_cols)
tsne_df
Out[21]:
        t-SNE1     t-SNE2
0    17.524487  10.526419
1    13.745171  10.214914
2    14.959588   9.684340
3    14.277590   9.399287
4    17.879566  11.128079
..         ...        ...
145 -13.833820  -5.847079
146  -6.999432  -7.042307
147 -12.770849  -5.746600
148 -14.140002  -7.868678
149 -10.026910  -4.874825

150 rows × 2 columns

📊 Visualizing t-SNE Output¶

In [22]:
from mpl_toolkits.mplot3d import Axes3D  # needed for 3D projection
import matplotlib.pyplot as plt
import numpy as np

def plot_tsne_projection(tsne_df, target_col=None):
    """
    Plots t-SNE output in 2D or 3D depending on number of components available.
    Colors by target_col if provided and valid. Always prints how many components are being plotted.
    """
    tsne_cols = [col for col in tsne_df.columns if col.startswith('t-SNE')]
    n_components = len(tsne_cols)

    if n_components < 2:
        raise ValueError(f"❌ Found only {n_components} t-SNE component(s). Need at least 2 to plot.")

    if n_components >= 3:
        print(f"📢 Found {n_components} t-SNE components — plotting the first 3 in 3D.")
        x_col, y_col, z_col = tsne_cols[:3]
        
        fig = plt.figure(figsize=(10, 7))
        ax = fig.add_subplot(111, projection='3d')

        if target_col and target_col in tsne_df.columns:
            unique_vals = tsne_df[target_col].unique()
            colors = plt.get_cmap('Set1', len(unique_vals)).colors

            for val, color in zip(unique_vals, colors):
                subset = tsne_df[tsne_df[target_col] == val]
                ax.scatter(subset[x_col], subset[y_col], subset[z_col],
                           label=val, color=color, alpha=0.7, s=60, edgecolor='k')
            ax.legend(title=target_col)
        else:
            if target_col:
                print(f"⚠️ Column '{target_col}' not found. Falling back to plain scatter.")
            ax.scatter(tsne_df[x_col], tsne_df[y_col], tsne_df[z_col],
                       color='steelblue', alpha=0.7, s=60, edgecolor='k')

        ax.set_title('t-SNE Projection (3D)', fontsize=14)
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        ax.set_zlabel(z_col)
        plt.tight_layout()
        plt.show()

    else:
        print(f"📢 Found {n_components} t-SNE components — plotting in 2D.")
        x_col, y_col = tsne_cols[:2]
        plt.figure(figsize=(8, 6))

        if target_col and target_col in tsne_df.columns:
            unique_vals = tsne_df[target_col].unique()
            colors = plt.get_cmap('Set1', len(unique_vals)).colors

            for val, color in zip(unique_vals, colors):
                subset = tsne_df[tsne_df[target_col] == val]
                plt.scatter(subset[x_col], subset[y_col], label=val,
                            color=color, alpha=0.7, edgecolor='k', s=60)
            plt.legend(title=target_col, fontsize=10)
        else:
            if target_col:
                print(f"⚠️ Column '{target_col}' not found. Falling back to plain scatter.")
            plt.scatter(tsne_df[x_col], tsne_df[y_col], alpha=0.7,
                        color='steelblue', edgecolor='k', s=60)

        plt.title('t-SNE Projection (2D)', fontsize=14)
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.axhline(0, color='grey', linewidth=0.5, linestyle='--')
        plt.axvline(0, color='grey', linewidth=0.5, linestyle='--')
        plt.grid(True)
        plt.tight_layout()
        plt.show()
In [23]:
plot_tsne_projection(tsne_df)  # Plain scatter

# To actually color by group, attach the label first: tsne_df['segment'] = df['segment'].values
plot_tsne_projection(tsne_df, target_col='segment')  # Falls back to a plain scatter because 'segment' is not attached here
📢 Found 2 t-SNE components — plotting in 2D.
📢 Found 2 t-SNE components — plotting in 2D.
⚠️ Column 'segment' not found. Falling back to plain scatter.
🌐 KDE Density Plot (Optional Enhancement)¶
In [24]:
def plot_tsne_density(tsne_df):
    tsne_cols = [col for col in tsne_df.columns if col.startswith('t-SNE')]

    if len(tsne_cols) < 2:
        print(f"❌ Density plot skipped — requires at least 2 t-SNE components, found {len(tsne_cols)}.")
        return
    elif len(tsne_cols) > 2:
        print(f"⚠️ Found {len(tsne_cols)} t-SNE components — plotting only the first two: {tsne_cols[:2]}")

    x_col, y_col = tsne_cols[:2]

    plt.figure(figsize=(8, 6))
    sns.kdeplot(x=tsne_df[x_col], y=tsne_df[y_col], fill=True, cmap='Blues')
    plt.title('Density Plot of t-SNE Outputs')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid()
    plt.tight_layout()
    plt.show()
In [25]:
plot_tsne_density(tsne_df)
🧱 Clustering in t-SNE Space (e.g., KMeans)¶

Use clustering algorithms like KMeans on t-SNE output to assign structure to the 2D projection.

  • 📌 Purpose: Quantify and visualize the clusters that naturally emerge in the t-SNE space.
  • 🧠 Why it’s useful: t-SNE creates a visual layout — clustering converts that into actionable segments.
  • 🎯 When to use: Helpful when original labels are missing, or when validating that t-SNE separated meaningful groups.
In [26]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt

def cluster_tsne(tsne_df, n_clusters=None, cluster_col='Cluster', max_k=10):
    """
    Runs KMeans on first 2 t-SNE components and visualizes clusters.
    If n_clusters is None, auto-selects best k using silhouette score.
    """
    tsne_cols = [col for col in tsne_df.columns if col.startswith('t-SNE')]
    
    if len(tsne_cols) < 2:
        print(f"❌ Clustering skipped — need at least 2 t-SNE dimensions, found {len(tsne_cols)}.")
        return

    x_col, y_col = tsne_cols[:2]
    X_cluster = tsne_df[[x_col, y_col]]

    # Auto-determine optimal k
    if n_clusters is None:
        print("ℹ️ No cluster count provided — finding optimal k using silhouette score...")
        scores = {}
        for k in range(2, max_k + 1):
            model = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_cluster)  # explicit n_init avoids sklearn's FutureWarning
            score = silhouette_score(X_cluster, model.labels_)
            scores[k] = score
        n_clusters = max(scores, key=scores.get)
        print(f"✅ Best k based on silhouette score: {n_clusters}")

    # Handle existing column
    if cluster_col in tsne_df.columns:
        print(f"⚠️ Cluster column '{cluster_col}' already exists. Overwriting...")

    # KMeans clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    tsne_df[cluster_col] = kmeans.fit_predict(X_cluster)

    # Scatter plot
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=tsne_df, x=x_col, y=y_col, hue=cluster_col, palette='Set1', alpha=0.7, edgecolor='k')
    plt.title(f't-SNE with KMeans Clustering (k = {n_clusters})', fontsize=14)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend(title="Cluster", fontsize=10)
    plt.grid()
    plt.tight_layout()
    plt.show()
In [27]:
cluster_tsne(tsne_df)  # Auto-k based on silhouette
cluster_tsne(tsne_df, n_clusters=4)  # Manual override
ℹ️ No cluster count provided — finding optimal k using silhouette score...
✅ Best k based on silhouette score: 2
⚠️ Cluster column 'Cluster' already exists. Overwriting...
🎨 t-SNE Colored by a Feature (Optional Insight)¶

Overlay a specific feature's values (e.g., sepal length, spend, churn_score) onto the t-SNE projection using a color gradient.

  • 📌 Purpose: Reveal how a particular feature correlates with t-SNE layout. Useful for understanding which regions of the plot correspond to high or low values.
  • 🎯 When to use: After clustering or t-SNE visualization, to enrich the plot with business-relevant or domain-specific signals.
  • 🧠 Best with: Continuous features (e.g., spend, age, price, interaction time)
In [28]:
def plot_tsne_by_feature(tsne_df, df, feature_col=None):
    """
    Visualizes t-SNE projection colored by a specific feature (continuous or categorical).
    
    - tsne_df: DataFrame with t-SNE columns (e.g., t-SNE1, t-SNE2, ...)
    - df: Original DataFrame containing the feature
    - feature_col: Name of the column to visualize (optional). If None, uses first numeric column.
    """
    tsne_cols = [col for col in tsne_df.columns if col.startswith('t-SNE')]

    if len(tsne_cols) < 2:
        print("❌ Cannot plot — t-SNE must have at least 2 components.")
        return

    if feature_col is None:
        numeric_cols = df.select_dtypes(include='number').columns
        if len(numeric_cols) == 0:
            print("❌ No numeric feature found in DataFrame to color by.")
            return
        feature_col = numeric_cols[0]
        print(f"ℹ️ No feature provided — defaulting to first numeric column: '{feature_col}'")

    if feature_col not in df.columns:
        print(f"❌ Feature '{feature_col}' not found in DataFrame.")
        return

    feature = df[feature_col]
    x_col, y_col = tsne_cols[:2]

    plt.figure(figsize=(8, 6))

    if pd.api.types.is_numeric_dtype(feature):
        scatter = plt.scatter(
            tsne_df[x_col],
            tsne_df[y_col],
            c=feature,
            cmap='coolwarm',
            edgecolor='k',
            alpha=0.7,
            s=60
        )
        plt.colorbar(scatter, label=feature_col)
    else:
        unique_vals = feature.unique()
        colors = plt.get_cmap('Set1', len(unique_vals)).colors
        for val, color in zip(unique_vals, colors):
            subset = tsne_df[feature == val]
            plt.scatter(
                subset[x_col],
                subset[y_col],
                label=val,
                color=color,
                alpha=0.7,
                edgecolor='k',
                s=60
            )
        plt.legend(title=feature_col)

    plt.title(f't-SNE Colored by {feature_col}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.tight_layout()
    plt.show()
In [29]:
plot_tsne_by_feature(tsne_df, df, feature_col='sepal length (cm)')  # Specific continuous column
plot_tsne_by_feature(tsne_df, df, 'segment')             # Specific categorical 
plot_tsne_by_feature(tsne_df, df) # Defaults to first numeric
plot_tsne_by_feature(tsne_df, df, 'nonexistent_column')  # Clean error
ℹ️ No feature provided — defaulting to first numeric column: 'sepal length (cm)'
❌ Feature 'nonexistent_column' not found in DataFrame.

🚫 Limitations and Cautions¶

📖 Click to Expand

While t-SNE is powerful for visualizing high-dimensional data, it comes with several important limitations:

  • ⚠️ Not for Feature Reduction: t-SNE is designed for visualization, not downstream modeling. The transformed axes (t-SNE1, t-SNE2, etc.) do not retain global structure or interpretability.
  • 🧭 Distances Are Local: Only local neighborhoods are meaningful. A small distance between two points means they're similar — but large distances mean nothing.
  • 🎲 Randomness: t-SNE is stochastic. Results can vary across runs unless random_state is fixed.
  • 🐌 Scalability: It's computationally expensive and doesn’t scale well to very large datasets (>10,000 rows) without approximate variants.
  • 🎛️ Hyperparameter Sensitivity: Results change dramatically with perplexity, learning_rate, and n_iter. There’s no universal rule for setting these — experimentation is key.
  • 💥 Can't Generalize to New Data: There's no .transform() method. You can’t apply a fitted t-SNE to new/unseen data.

🔍 Use t-SNE primarily as a visual insight tool, not a feature engineering method.

Back to the top


🌐 UMAP (Uniform Manifold Approximation & Projection)¶

📖 Click to Expand

UMAP (Uniform Manifold Approximation and Projection) is a non-linear dimensionality reduction technique used for visualizing high-dimensional data.

  • It creates a graph of data points based on local relationships and then optimizes the layout in a lower-dimensional space (2D or 3D), preserving both local and global structure.

🌟 Key Benefits:

  • Speed: UMAP is faster than t-SNE, especially on large datasets.
  • Scalability: Efficient on large datasets, outperforming t-SNE and PCA as data grows.
  • Reusability: Can apply to new data via .transform() without retraining.
  • Local & Global Structure: Captures both local relationships (clusters) and global patterns (distances between clusters).
  • Flexibility: Reduces data to 2D, 3D, or higher dimensions based on analysis needs.
  • Performance: Handles large datasets and produces meaningful embeddings efficiently.

📍Key Use Cases:

  • Data Visualization: Reducing high-dimensional data to 2D or 3D for easy inspection.
  • Clustering: Visualizing data groupings and revealing hidden patterns.
  • ML Preprocessing: Using UMAP as a feature extraction tool before clustering, classification, or regression.
  • EDA: Identifying patterns, outliers, and relationships in the data.

🔬 UMAP vs t-SNE¶

📖 Click to Expand

🌐 What UMAP does:
UMAP is designed to preserve both local structure (relationships between nearby points) and global structure (relationships between distant points) in data. This makes it well-suited for uncovering both fine-grained local clusters and broader, global patterns.

🔍 UMAP vs t-SNE:

  • t-SNE excels at preserving local structure, making it effective for visualizing tight, well-separated clusters. However, it struggles with capturing global structure, often distorting the overall shape and relative positioning of clusters.
  • UMAP preserves both local and global relationships, offering a more comprehensive view of the data. This makes UMAP particularly useful when you need to capture larger-scale patterns in high-dimensional data.

🚀 When to use UMAP vs t-SNE:

  • t-SNE is ideal for situations where local details are more important, and you have a relatively small dataset.
  • UMAP is better suited for large datasets, as it balances local and global structure and offers better scalability without losing important details. Choose UMAP when you need both cluster separation and global context.

⚙️ How UMAP Works¶

📖 Click to Expand

UMAP (Uniform Manifold Approximation and Projection) is a non-linear dimensionality reduction technique designed to preserve the underlying structure of high-dimensional data when reducing it to lower dimensions. Here's how it works:

  1. Graph Construction 🧑‍🔬:
    UMAP starts by building a graph that captures the local relationships between data points. Points that are similar to each other are connected with strong edges, representing their proximity in the high-dimensional space. This graph captures the local structure of the data.

  2. Optimization in Lower Dimensions 🔄:
    Next, UMAP uses stochastic gradient descent (SGD) to optimize the layout of these points in a lower-dimensional space (e.g., 2D or 3D). The goal is to preserve the relationships (distances) between the points while minimizing a cost function. The optimization process adjusts the positions of points so that both local and some global structures are maintained.

  3. Preserving Local and Global Structures 🌍:
    Unlike t-SNE, which focuses primarily on preserving local structure, UMAP also retains some global structure. This means UMAP captures neighborhood relationships while also keeping broader patterns intact. This is especially useful when working with large datasets, as it provides both fine-grained details (local) and a big picture view (global).

  4. Final Embedding 📉:
    After the optimization, the resulting low-dimensional representation is a set of coordinates that reflects the original high-dimensional data in a more interpretable form. UMAP can also apply this transformation to new data using the .transform() method, making it a versatile tool for downstream tasks.

🛠️ Implementation¶

📖 Click to Expand

🎛️ Key Parameters and Tradeoffs¶
  • n_neighbors: Controls local vs global structure balance.
    • Small values: Focus on local details (tight clusters).
    • Large values: Emphasize global patterns.
  • min_dist: Determines how tightly points are packed.
    • Small values: Creates compact clusters.
    • Large values: More spread-out clusters.
  • n_components: Number of output dimensions (typically 2D for visualization, 3D for richer plots).
  • Other Parameters: metric (pick a distance measure that suits the data type, e.g. cosine for embedding vectors), random_state (fix it for reproducible layouts), and more.
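To see the n_neighbors / min_dist tradeoff directly, a small hedged sweep can be plotted as a grid (the specific values are illustrative; X is the Iris feature frame from the data setup):

import umap.umap_ as umap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

scaled_iris = StandardScaler().fit_transform(X)  # X = Iris features from the data setup

# Grid of (n_neighbors, min_dist) combinations: local vs global emphasis, tight vs spread-out packing
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
for ax, (nn, md) in zip(axes.ravel(), [(5, 0.1), (50, 0.1), (5, 0.8), (50, 0.8)]):
    emb = umap.UMAP(n_neighbors=nn, min_dist=md, random_state=42).fit_transform(scaled_iris)
    ax.scatter(emb[:, 0], emb[:, 1], s=15, alpha=0.7)
    ax.set_title(f'n_neighbors={nn}, min_dist={md}')
plt.tight_layout()
plt.show()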
In [41]:
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler

def run_umap(df, n_neighbors=15, min_dist=0.1, n_components=2, random_state=42):
    numeric_data = df.select_dtypes(include='number')
    scaled = StandardScaler().fit_transform(numeric_data)

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state
    )
    embedding = reducer.fit_transform(scaled)
    return pd.DataFrame(embedding, columns=[f"UMAP_{i+1}" for i in range(n_components)])

umap_2d = run_umap(X, n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_2d.head()
Out[41]:
      UMAP_1    UMAP_2
0  17.400507  6.469345
1  14.964482  7.453436
2  15.676697  7.867920
3  15.372849  7.996042
4  17.660715  6.402417

📊 Visualizing UMAP Output¶

🌍 2D Scatter Plot¶
  • Visualizing clusters in 2D: Plot UMAP results in two dimensions.
  • Optional: Color points based on class or any feature.
In [42]:
import seaborn as sns
import matplotlib.pyplot as plt

umap_2d["label"] = y

plt.figure(figsize=(8, 6))
sns.scatterplot(data=umap_2d, x="UMAP_1", y="UMAP_2", hue="label", palette="Set1", s=70, edgecolor="k")
plt.title("UMAP 2D Projection with Labels")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(title="Target")
plt.tight_layout()
plt.show()
📈 3D Scatter Plot (Optional)¶
  • Visualizing clusters in 3D: If n_components=3, plot the results in 3D.
In [43]:
from mpl_toolkits.mplot3d import Axes3D

umap_3d = run_umap(X, n_neighbors=15, min_dist=0.1, n_components=3, random_state=42)
umap_3d["label"] = y

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(umap_3d["UMAP_1"], umap_3d["UMAP_2"], umap_3d["UMAP_3"],
                     c=umap_3d["label"], cmap="Set1", s=50)
ax.set_title("UMAP 3D Projection")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
ax.set_zlabel("UMAP 3")
plt.legend(*scatter.legend_elements(), title="Target")
plt.tight_layout()
plt.show()
🌐 KDE Density Plot¶
  • Density visualization: Show the density of data points in the UMAP-transformed space using Kernel Density Estimation.
In [44]:
plt.figure(figsize=(8, 6))
sns.kdeplot(data=umap_2d, x="UMAP_1", y="UMAP_2", fill=True, cmap="viridis", thresh=0.05)
plt.title("UMAP 2D Density via KDE")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.tight_layout()
plt.show()
🎨 Feature Coloring (Optional)¶
  • Color points based on a feature: Color points by a continuous or categorical feature to observe its influence on the UMAP projection.
In [45]:
# Example: color by petal length
feature_col = X.iloc[:, 2]  # Iris petal length
umap_2d["feature"] = feature_col

plt.figure(figsize=(8, 6))
sns.scatterplot(data=umap_2d, x="UMAP_1", y="UMAP_2", hue="feature", palette="coolwarm", s=70, edgecolor="k")
plt.title("UMAP 2D Colored by Feature")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(title=iris.feature_names[2])
plt.tight_layout()
plt.show()
🧠 UMAP with Clustering (Optional)¶
  • Clustering with KMeans or DBSCAN: Use UMAP-reduced data for clustering tasks and overlay cluster results.
In [46]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
umap_2d["cluster"] = kmeans.fit_predict(umap_2d[["UMAP_1", "UMAP_2"]])

plt.figure(figsize=(8, 6))
sns.scatterplot(data=umap_2d, x="UMAP_1", y="UMAP_2", hue="cluster", palette="Set2", s=70, edgecolor="k")
plt.title("UMAP + KMeans Clustering")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()

🔁 Reusing UMAP for Feature Reduction¶

  • Transform new data: Once UMAP is trained, use .transform() to reduce new data points.
  • Downstream tasks: Use UMAP output for clustering or as features in machine learning models.
In [47]:
# Simulate new unseen data
X_train = X.sample(frac=0.8, random_state=42)
X_new = X.drop(X_train.index)

# Fit UMAP on training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_new_scaled = scaler.transform(X_new)

umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
X_train_embed = umap_model.fit_transform(X_train_scaled)
X_new_embed = umap_model.transform(X_new_scaled)  # <- Reuse on new data

# Combine for downstream ML or clustering
X_combined = np.vstack([X_train_embed, X_new_embed])
X_combined_df = pd.DataFrame(X_combined, columns=["UMAP_1", "UMAP_2"])
X_combined_df["source"] = ["train"] * len(X_train) + ["new"] * len(X_new)

# Visualize train vs new points
plt.figure(figsize=(8, 6))
sns.scatterplot(data=X_combined_df, x="UMAP_1", y="UMAP_2", hue="source", palette="Set1", s=70)
plt.title("UMAP Projection: Train vs New Data")
plt.tight_layout()
plt.show()

Back to the top


📐 Linear Discriminant Analysis (LDA)¶

🔢 How LDA Works¶

📖 Click to Expand
📐 What is LDA?¶

Linear Discriminant Analysis (LDA) is a supervised dimensionality reduction technique that finds linear combinations of features that best separate two or more classes.

⚙️ How It Works¶
  • Maximizes the distance between class means
  • Minimizes variance within each class
  • Projects the data into a lower-dimensional space that maximally preserves class separability
🎯 When to Use¶
  • You have labeled data and want dimensionality reduction aligned with class boundaries
  • Ideal as a preprocessing step for classification problems

🧮 Step-by-Step Breakdown¶

📖 Click to Expand
  1. Compute class-wise means: Calculate the mean vector for each class.

  2. Within-class scatter matrix $S_W$:
    Measures variance within each class — the "tightness" of clusters.

  3. Between-class scatter matrix $S_B$:
    Measures variance between the class means — the separation between class centers.

  4. Solve the generalized eigenvalue problem:
    Find the eigenvectors of $S_W^{-1} S_B$; these are the new discriminant axes.

  5. Sort by eigenvalues:
    Select the top eigenvectors (directions) based on their eigenvalues — these are the most class-separating dimensions.

  6. Project the data:
    Multiply the original dataset by the top eigenvectors to reduce dimensions while maintaining class separability. (A from-scratch sketch of these steps follows below.)
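To make these steps concrete, here is a hedged from-scratch sketch using the Iris arrays from the data setup (X, y). It follows the outline above rather than scikit-learn's internal solver, so the resulting axes may differ from LinearDiscriminantAnalysis by sign or scaling.

import numpy as np
from sklearn.preprocessing import StandardScaler

X_arr = StandardScaler().fit_transform(X)  # standardized features, as in the sklearn cell below
y_arr = y.values
classes = np.unique(y_arr)
overall_mean = X_arr.mean(axis=0)

n_features = X_arr.shape[1]
S_W = np.zeros((n_features, n_features))  # within-class scatter
S_B = np.zeros((n_features, n_features))  # between-class scatter

for c in classes:
    X_c = X_arr[y_arr == c]
    mean_c = X_c.mean(axis=0)
    S_W += (X_c - mean_c).T @ (X_c - mean_c)
    diff = (mean_c - overall_mean).reshape(-1, 1)
    S_B += len(X_c) * (diff @ diff.T)

# Eigen-decomposition of S_W^{-1} S_B, sorted by eigenvalue (largest = most class-separating)
eigvals, eigvecs = np.linalg.eig(np.linalg.inv(S_W) @ S_B)
order = np.argsort(eigvals.real)[::-1]

# Keep the top C - 1 = 2 directions and project the data
W = eigvecs[:, order[:2]].real
X_lda_manual = X_arr @ W
print(X_lda_manual[:5])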

🛠️ Implementation¶

📖 Click to Expand
🧱 Why LDA Can’t Go Beyond (C - 1) Dimensions¶

The number of meaningful LDA components is bounded by:

$$\min(n_{\text{features}},\ C - 1)$$

Where:

  • $C$ = number of unique classes
  • $n_{\text{features}}$ = number of original features

This is because the between-class scatter matrix $S_B$ has rank at most $C - 1$, limiting how many discriminant directions can exist.

🔍 Implication¶

Even if your original data has 100 features, if you have only 3 classes, LDA can produce at most 2 meaningful components.
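As a quick sanity check of this bound (a hedged illustration; it uses the Iris X and y loaded earlier, which have 3 classes), asking scikit-learn's LDA for more than C - 1 components fails at fit time:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

try:
    # 3 classes -> at most 2 discriminant components, so 3 is one too many
    LinearDiscriminantAnalysis(n_components=3).fit(X, y)
except ValueError as err:
    print("ValueError:", err)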

In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler

# Standardize features
X_scaled = StandardScaler().fit_transform(X)

# Fit LDA (for 3 classes, max 2 components)
lda_model = LDA(n_components=2)
X_lda = lda_model.fit_transform(X_scaled, y)

📊 Visualizing LDA Output¶

In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Combine with labels for plotting
lda_df = pd.DataFrame(X_lda, columns=["LDA_1", "LDA_2"])
lda_df["label"] = y

# 2D scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(data=lda_df, x="LDA_1", y="LDA_2", hue="label", palette="Set1", s=70)
plt.title("LDA Projection (2D)")
plt.xlabel("LDA 1")
plt.ylabel("LDA 2")
plt.legend(title="Target")
plt.tight_layout()
plt.show()

Back to the top


🔗 Canonical Correlation Analysis (CCA)¶

🧬 When It’s Useful¶

📖 Click to Expand
🔗 Why Use CCA?¶

Canonical Correlation Analysis (CCA) is used to find relationships between two sets of variables.
It’s especially useful when you have two different views of the same observations.

📌 Typical Use Cases¶
  • Multimodal data: Combining text and image features, or survey and behavioral signals.
  • Feature fusion: Aligning representations from two separate systems.
  • Dimensionality reduction: Extracting a shared latent space when PCA on each view independently isn’t enough.
🧠 Practical Example¶

You have user demographic data and product interaction data. CCA finds correlated dimensions between the two — useful for recommendation, clustering, or cross-domain modeling.

🔄 Intuition: PCA for Two Views¶

📖 Click to Expand
🧠 CCA vs PCA¶
  • PCA finds directions of maximum variance within a single dataset.
  • CCA finds directions in two datasets that are maximally correlated.
🔍 Core Idea¶

Given two views of the same data (e.g., $X$ and $Y$),
CCA finds linear combinations $a^T X$ and $b^T Y$ such that:

$$\text{corr}(a^T X,\ b^T Y)$$

is maximized.

This reveals latent structures shared between both views — not just individual variation.

🧭 Interpretation¶
  • Each canonical component pair shows how strongly aligned both views are in that direction.
  • If your first component has 0.9 correlation, you know the dominant pattern is shared and interpretable.

📉 Dimensionality Reduction via Correlation¶

📖 Click to Expand
📉 CCA as a Dimensionality Reduction Technique¶

CCA projects two datasets into a shared latent space where each pair of projected features is maximally correlated.
The resulting components can be used just like PCA components — for visualization, modeling, or further analysis.

📌 Key Characteristics¶
  • Supervised-like projection: Takes into account two views, not just internal structure.
  • Bottleneck: Extracts only the common signals — discards view-specific noise.
  • Output dimensionality: Limited to the number of canonical directions with non-zero correlation
    (i.e., up to $\min(n_{\text{features}}^{X},\ n_{\text{features}}^{Y})$).
✅ When to use¶
  • You want to compress two datasets into a shared space
  • You’re looking to fuse features across systems for clustering, prediction, or visualization

🛠️ Implementation¶

In [50]:
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Simulate two views from original data
np.random.seed(42)
n_samples = X.shape[0]

# View 1: Original iris features
X_view = X.values

# View 2: Linearly correlated view + noise
Y_view = X.values @ np.array([[0.5, 0.2, 0.1],
                              [0.3, 0.1, 0.4],
                              [0.2, 0.3, 0.1],
                              [0.1, 0.5, 0.3]]) + np.random.normal(0, 0.5, size=(n_samples, 3))

# Standardize both views
X_scaled = StandardScaler().fit_transform(X_view)
Y_scaled = StandardScaler().fit_transform(Y_view)

# Run CCA
cca = CCA(n_components=2)
X_c, Y_c = cca.fit_transform(X_scaled, Y_scaled)

# Store projections
cca_proj = pd.DataFrame(X_c, columns=["CCA_1", "CCA_2"])
cca_proj["label"] = y

📊 Visualizing CCA Output¶

In [51]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.scatterplot(data=cca_proj, x="CCA_1", y="CCA_2", hue="label", palette="Set1", s=70, edgecolor="k")
plt.title("CCA Projection (Shared Latent Space)")
plt.xlabel("CCA 1")
plt.ylabel("CCA 2")
plt.legend(title="Target")
plt.tight_layout()
plt.show()

Back to the top