🧭 Objective¶
🛒 What is Association Rule Mining?¶
📖 Click to Expand
Association Rule Mining helps you discover what items are often bought together.
It looks through customer transactions to find patterns like:
- "People who buy bread and butter also tend to buy jam"
- "If a user buys a phone, they often buy a screen protector too"
This technique is unsupervised — there's no target variable.
You're just uncovering co-occurrence patterns from raw data.
🧠 Breakdown¶
An association rule is written as:
A → B
...which means “if A was bought, then B is likely to be bought too.”
You can control how strong or interesting a rule is using:
- Support: How common is this combo?
- Confidence: If A happens, how often does B follow?
- Lift: Is this combo better than random chance?
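For a quick hypothetical example: suppose 100 orders, of which 40 contain Bread, 25 contain Jam, and 20 contain both. For the rule Bread → Jam, Support = 20/100 = 0.20, Confidence = 20/40 = 0.50, and Lift = 0.50 / 0.25 = 2.0, meaning Jam is twice as likely in Bread-containing orders as in orders overall.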
📌 Use Cases¶
📖 Click to Expand
📌 Real-World Use Cases of Association Rules¶
These are not just grocery store tricks. They show up everywhere:
- 🛒 Retail: Suggest product bundles ("Customers who bought this also bought...")
- 📩 Email Marketing: Find item combos for targeted promotions
- 💳 Banking: Detect if certain services (e.g., savings + insurance) are linked
- 🏥 Healthcare: Identify symptom-diagnosis patterns across patients
- 🌐 E-commerce: Power recommendation engines for cross-sells
The goal:
Find co-occurrence relationships that help you act smarter — whether you're selling, diagnosing, or personalizing.
📦 Data Setup¶
🧾 Load Dataset¶
# Create a more sophisticated, larger synthetic transaction dataset
import pandas as pd
import numpy as np

np.random.seed(42)

# Define a pool of items and simulate 50 orders
items = ['Milk', 'Bread', 'Butter', 'Eggs', 'Cheese', 'Apples', 'Bananas', 'Juice', 'Cereal', 'Yogurt']
n_orders = 50

# Generate synthetic transactions
transactions = []
for order_id in range(1001, 1001 + n_orders):
    basket_size = np.random.randint(2, 6)  # each order has 2–5 items
    selected_items = np.random.choice(items, size=basket_size, replace=False)
    for item in selected_items:
        transactions.append({'OrderID': order_id, 'Item': item})

df = pd.DataFrame(transactions)
df.head(10)
|   | OrderID | Item    |
|---|---------|---------|
| 0 | 1001    | Apples  |
| 1 | 1001    | Milk    |
| 2 | 1001    | Yogurt  |
| 3 | 1001    | Bananas |
| 4 | 1002    | Milk    |
| 5 | 1002    | Bananas |
| 6 | 1002    | Cereal  |
| 7 | 1002    | Bread   |
| 8 | 1002    | Yogurt  |
| 9 | 1003    | Bananas |
🧹 Preprocessing / Transaction Formatting¶
# Convert transaction data to a basket-format binary matrix (one row per order, one column per item)
basket = df.groupby(['OrderID', 'Item']).size().unstack(fill_value=0)

# Binarize: any count > 0 becomes 1 (transactional presence)
basket = (basket > 0).astype(int)
basket.head(10)
| OrderID | Apples | Bananas | Bread | Butter | Cereal | Cheese | Eggs | Juice | Milk | Yogurt |
|---------|--------|---------|-------|--------|--------|--------|------|-------|------|--------|
| 1001    | 1      | 1       | 0     | 0      | 0      | 0      | 0    | 0     | 1    | 1      |
| 1002    | 0      | 1       | 1     | 0      | 1      | 0      | 0    | 0     | 1    | 1      |
| 1003    | 0      | 1       | 0     | 1      | 1      | 0      | 0    | 0     | 0    | 0      |
| 1004    | 1      | 0       | 1     | 0      | 1      | 1      | 0    | 0     | 0    | 0      |
| 1005    | 1      | 0       | 0     | 1      | 0      | 0      | 0    | 1     | 0    | 1      |
| 1006    | 0      | 0       | 0     | 0      | 0      | 0      | 0    | 1     | 1    | 1      |
| 1007    | 1      | 0       | 0     | 0      | 1      | 1      | 0    | 0     | 0    | 1      |
| 1008    | 0      | 0       | 0     | 1      | 0      | 0      | 0    | 1     | 1    | 0      |
| 1009    | 0      | 0       | 0     | 1      | 1      | 1      | 0    | 0     | 0    | 0      |
| 1010    | 0      | 1       | 0     | 1      | 0      | 0      | 0    | 0     | 0    | 0      |
🧮 Frequency Encoding (Optional)¶
📖 Click to Expand
Sometimes it's helpful to analyze item frequency — how often each product is purchased — before mining rules.
This helps you:
- Spot dominant or underrepresented products
- Filter rare items (e.g., items bought < 2 times)
- Tune support thresholds more intelligently
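A minimal sketch of that pre-filtering step, assuming the basket matrix built above; the cutoff of 2 orders is an arbitrary illustrative choice, and the rest of the notebook keeps using the full basket.

# Drop items that appear in fewer than `min_item_count` orders before mining
min_item_count = 2                                   # hypothetical cutoff; tune per dataset
item_counts = basket.sum()                           # number of orders containing each item
frequent_items = item_counts[item_counts >= min_item_count].index
basket_trimmed = basket[frequent_items]              # keep only sufficiently common items
print(f"Kept {len(frequent_items)} of {basket.shape[1]} items")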
📊 Exploratory Data Analysis¶
📈 Item Frequency Plot¶
import matplotlib.pyplot as plt
# Frequency of each item across all orders
item_freq = basket.sum().sort_values(ascending=False)
plt.figure(figsize=(10, 4))
item_freq.plot(kind='bar', color='skyblue')
plt.title("📈 Item Frequency Across Transactions")
plt.ylabel("Count")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
📉 Itemset Statistics¶
📖 Click to Expand
This gives you a sense of basket richness — how many items customers typically buy per transaction.
Use this to evaluate if the data is too sparse or too dense before rule mining.
mean_basket = basket.sum(axis=1).mean()
min_basket = basket.sum(axis=1).min()
max_basket = basket.sum(axis=1).max()
print(f"🧺 Average basket size: {mean_basket:.2f} items")
print(f"📉 Smallest basket: {min_basket} items")
print(f"📈 Largest basket: {max_basket} items")
🧺 Average basket size: 3.48 items
📉 Smallest basket: 2 items
📈 Largest basket: 5 items
🧰 Apriori Algorithm¶
📖 Click to Expand
Apriori is a classic algorithm that helps you find product combinations that occur frequently in customer orders.
The core idea:
"If smaller combos are rare, bigger combos with those items will also be rare."
Apriori starts with individual items, then builds up to larger combinations only if smaller ones are frequent.
This keeps the search fast and focused.
You'll use Apriori to:
- Extract common itemsets
- Feed those into a rule builder
- Surface patterns like “If A is bought, B follows”
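To make that pruning principle concrete, here is a minimal sketch of one Apriori level in plain pandas: count single items, discard the infrequent ones, and only build pair candidates from the survivors. It assumes the basket matrix from above, uses an illustrative demo_min_support, and is not how mlxtend implements the algorithm internally.

from itertools import combinations

demo_min_support = 0.1                              # illustrative threshold

# Level 1: frequent single items (support = share of orders containing the item)
item_support = basket.mean()
frequent_1 = set(item_support[item_support >= demo_min_support].index)

# Level 2: build pair candidates ONLY from frequent single items (the pruning step)
frequent_2 = {}
for a, b in combinations(sorted(frequent_1), 2):
    pair_support = (basket[a] & basket[b]).mean()   # share of orders containing both
    if pair_support >= demo_min_support:
        frequent_2[(a, b)] = pair_support

print(f"{len(frequent_1)} frequent items → {len(frequent_2)} frequent pairs")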
⚙️ Setup Parameters¶
📖 Click to Expand
Apriori requires setting three thresholds to control what gets mined:
- min_support: How often a product combo occurs
  → "Only show me combos bought in at least 10% of orders"
- min_confidence: How reliable a rule is
  → "If people buy A, how often do they also buy B?"
- min_lift: Does the combo actually matter, or is it random?
  → Lift > 1 = better than chance
You’ll usually:
- Use support to filter volume
- Use confidence to filter risk
- Use lift to filter noise
# Set thresholds for rule mining
min_support = 0.1 # Appears in at least 10% of orders
min_confidence = 0.5 # At least 50% confidence in the rule
min_lift = 1.2 # Must be better than chance
📜 Generate Frequent Itemsets¶
📖 Click to Expand
Frequent itemsets are combinations of products that occur together more than the minimum support.
For example:
{Bread, Butter} → Support = 0.18 → means it appeared in 18% of all orders
These are the building blocks for generating association rules.
from mlxtend.frequent_patterns import apriori

# Generate frequent itemsets (mlxtend prefers a boolean matrix, so cast before mining)
frequent_itemsets = apriori(basket.astype(bool),
                            min_support=min_support,
                            use_colnames=True)

# Sort by support to show top combos
frequent_itemsets.sort_values(by='support', ascending=False).head()
|   | support | itemsets |
|---|---------|----------|
| 4 | 0.46    | (Cereal) |
| 2 | 0.44    | (Bread)  |
| 9 | 0.42    | (Yogurt) |
| 5 | 0.36    | (Cheese) |
| 0 | 0.34    | (Apples) |
🔗 Build Association Rules¶
📖 Click to Expand
Once you have frequent itemsets, you can build association rules like:
If A is bought → B is also likely
Example: {Bread} → {Butter}
Each rule is scored using:
- Support: How often this combo appears
- Confidence: How reliable this rule is
- Lift: How much stronger this rule is compared to random chance
You can filter rules to focus on high-impact patterns.
from mlxtend.frequent_patterns import association_rules
# Build rules from itemsets
rules = association_rules(frequent_itemsets,
metric="lift",
min_threshold=min_lift)
# Filter by confidence
rules = rules[rules['confidence'] >= min_confidence]
# Show top rules
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']] \
.sort_values(by='lift', ascending=False).head()
|   | antecedents      | consequents | support | confidence | lift     |
|---|------------------|-------------|---------|------------|----------|
| 8 | (Cereal, Apples) | (Cheese)    | 0.12    | 0.857143   | 2.380952 |
| 5 | (Eggs)           | (Yogurt)    | 0.14    | 0.777778   | 1.851852 |
| 6 | (Cheese, Cereal) | (Apples)    | 0.12    | 0.600000   | 1.764706 |
| 7 | (Cheese, Apples) | (Cereal)    | 0.12    | 0.750000   | 1.630435 |
| 2 | (Cheese)         | (Cereal)    | 0.20    | 0.555556   | 1.207729 |
🧪 Rule Evaluation¶
📏 Support, Confidence, Lift¶
📖 Click to Expand
These are the core metrics used to evaluate association rules:
- Support = % of total orders that contain both A and B
  → "How common is this combo?"
- Confidence = % of orders with A that also have B
  → "If a customer buys A, how often do they buy B too?"
- Lift = How much more likely B is, given A, compared to random
  → Lift > 1 = meaningful; Lift < 1 = suppressive
You’ll use these to filter useful rules from noise.
rules[['support', 'confidence', 'lift']].describe()
|       | support  | confidence | lift     |
|-------|----------|------------|----------|
| count | 6.000000 | 6.000000   | 6.000000 |
| mean  | 0.133333 | 0.682672   | 1.673901 |
| std   | 0.035024 | 0.128973   | 0.441895 |
| min   | 0.100000 | 0.555556   | 1.207729 |
| 25%   | 0.120000 | 0.566667   | 1.313406 |
| 50%   | 0.120000 | 0.675000   | 1.697570 |
| 75%   | 0.135000 | 0.770833   | 1.830065 |
| max   | 0.200000 | 0.857143   | 2.380952 |
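As a sanity check, the same three numbers can be recomputed by hand for a single rule. Here is a minimal sketch for the Eggs → Yogurt rule from the table above, assuming the binary basket matrix built earlier; the results should match mlxtend's columns.

# Recompute support, confidence, and lift by hand for {Eggs} → {Yogurt}
support_a = basket['Eggs'].mean()                        # P(Eggs)
support_b = basket['Yogurt'].mean()                      # P(Yogurt)
support_ab = (basket['Eggs'] & basket['Yogurt']).mean()  # P(Eggs and Yogurt)

confidence = support_ab / support_a                      # P(Yogurt | Eggs)
lift = confidence / support_b                            # vs. Yogurt's baseline rate

print(f"Support: {support_ab:.2f}, Confidence: {confidence:.2f}, Lift: {lift:.2f}")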
📐 Conviction, Leverage (Bonus)¶
📖 Click to Expand
These are bonus metrics that add nuance:
- Leverage = How far the observed support is from the expected support under independence
  → Bigger = more surprising
- Conviction = How much more often A would appear without B if the two were independent, compared to what's actually observed
  → Conviction of 1 = no info; higher = stronger rule
Most people stick to Support–Confidence–Lift, but these help in fine-tuning or ranking rules.
rules[['leverage', 'conviction']].describe()
|       | leverage | conviction |
|-------|----------|------------|
| count | 6.000000 | 6.000000   |
| mean  | 0.047333 | 2.221667   |
| std   | 0.019417 | 1.233550   |
| min   | 0.017200 | 1.215000   |
| 25%   | 0.037400 | 1.323750   |
| 50%   | 0.049200 | 1.905000   |
| 75%   | 0.061300 | 2.497500   |
| max   | 0.069600 | 4.480000   |
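The same hand-check works for these two metrics. A minimal sketch for Eggs → Yogurt using the standard definitions (leverage = observed minus expected co-occurrence; conviction = (1 - support(B)) / (1 - confidence)), again assuming the basket matrix from earlier:

# Recompute leverage and conviction by hand for {Eggs} → {Yogurt}
support_a = basket['Eggs'].mean()
support_b = basket['Yogurt'].mean()
support_ab = (basket['Eggs'] & basket['Yogurt']).mean()
confidence = support_ab / support_a

leverage = support_ab - support_a * support_b        # observed minus expected under independence
conviction = (1 - support_b) / (1 - confidence)      # > 1 means the rule beats chance

print(f"Leverage: {leverage:.4f}, Conviction: {conviction:.2f}")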
📋 Top Rules by Metric¶
📖 Click to Expand
Let’s pull out the most interesting rules based on each metric:
- High lift: Strongest associations (most meaningful)
- High confidence: Most reliable predictors
- High leverage: Most unexpected combos
This helps shortlist rules for action.
def display_top_rules(rules, metric, n=5):
    print(f"\n📌 Top {n} Rules by {metric}")
    return rules.sort_values(by=metric, ascending=False)[
        ['antecedents', 'consequents', 'support', 'confidence', 'lift', metric]
    ].head(n)

# Note: in a notebook cell, only the last call's returned DataFrame renders automatically;
# the print headers for all three calls still appear
display_top_rules(rules, 'lift')
display_top_rules(rules, 'confidence')
display_top_rules(rules, 'leverage')
📌 Top 5 Rules by lift
📌 Top 5 Rules by confidence
📌 Top 5 Rules by leverage
|   | antecedents      | consequents | support | confidence | lift     | leverage |
|---|------------------|-------------|---------|------------|----------|----------|
| 8 | (Cereal, Apples) | (Cheese)    | 0.12    | 0.857143   | 2.380952 | 0.0696   |
| 5 | (Eggs)           | (Yogurt)    | 0.14    | 0.777778   | 1.851852 | 0.0644   |
| 6 | (Cheese, Cereal) | (Apples)    | 0.12    | 0.600000   | 1.764706 | 0.0520   |
| 7 | (Cheese, Apples) | (Cereal)    | 0.12    | 0.750000   | 1.630435 | 0.0464   |
| 2 | (Cheese)         | (Cereal)    | 0.20    | 0.555556   | 1.207729 | 0.0344   |
🧠 Interpretation¶
🧭 Business Context for Rules¶
📖 Click to Expand
A rule like {Bread} → {Butter}
is just math — until you attach meaning.
Here's how to read it in a business context:
- Support = 0.18 → This combo shows up in 18% of orders. That’s fairly common.
- Confidence = 0.6 → If someone buys Bread, they buy Butter 60% of the time.
- Lift = 2.0 → This is twice as likely as random chance. That’s a strong relationship.
🔍 Now ask:
- Should you bundle these items in promotions?
- Is it worth adding cross-sell nudges in your UI?
- Do these combos vary by store, channel, or segment?
Raw rules are just the start — context makes them useful.
def explain_rules(rules_df, n=5):
    top = rules_df.sort_values(by='lift', ascending=False).head(n)
    summaries = []
    for _, row in top.iterrows():
        ant = ', '.join(list(row['antecedents']))
        con = ', '.join(list(row['consequents']))
        support = row['support']
        confidence = row['confidence']
        lift = row['lift']
        sentence = (f"If a customer buys {ant}, there's a {confidence:.0%} chance they'll also buy {con} "
                    f"(Lift: {lift:.2f}, Support: {support:.0%})")
        summaries.append(sentence)
    for s in summaries:
        print("🧠", s)

# Call the function to generate plain-English summaries for top 5 rules
explain_rules(rules)
🧠 If a customer buys Cereal, Apples, there's a 86% chance they'll also buy Cheese (Lift: 2.38, Support: 12%)
🧠 If a customer buys Eggs, there's a 78% chance they'll also buy Yogurt (Lift: 1.85, Support: 14%)
🧠 If a customer buys Cheese, Cereal, there's a 60% chance they'll also buy Apples (Lift: 1.76, Support: 12%)
🧠 If a customer buys Cheese, Apples, there's a 75% chance they'll also buy Cereal (Lift: 1.63, Support: 12%)
🧠 If a customer buys Cheese, there's a 56% chance they'll also buy Cereal (Lift: 1.21, Support: 20%)
📉 Redundant Rules / Filtering¶
📖 Click to Expand
Many rules repeat the same signal in longer forms. For example:
- {Bread} → {Butter}
- {Bread, Milk} → {Butter}
The second one isn't telling you much more than the first — it’s redundant.
You can clean up noise by:
- Filtering to rules with unique antecedents
- Removing rules nested inside others (subset/superset logic; a sketch of this appears at the end of this section)
- Using metrics like conviction or leverage to prioritize surprise
This improves signal quality — especially when presenting to non-technical teams.
# Filter: keep only rules with small antecedents (e.g. 1–2 items)
rules_filtered = rules[rules['antecedents'].apply(lambda x: len(x) <= 2)]
# Optional: drop rules that are subsets of others (basic deduplication)
unique_rules = rules_filtered.drop_duplicates(subset=['antecedents', 'consequents'])
print(f"🧹 Final set: {len(unique_rules)} rules after removing long/duplicated patterns")
unique_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()
🧹 Final set: 6 rules after removing long/duplicated patterns
|   | antecedents      | consequents | support | confidence | lift     |
|---|------------------|-------------|---------|------------|----------|
| 2 | (Cheese)         | (Cereal)    | 0.20    | 0.555556   | 1.207729 |
| 5 | (Eggs)           | (Yogurt)    | 0.14    | 0.777778   | 1.851852 |
| 6 | (Cheese, Cereal) | (Apples)    | 0.12    | 0.600000   | 1.764706 |
| 7 | (Cheese, Apples) | (Cereal)    | 0.12    | 0.750000   | 1.630435 |
| 8 | (Cereal, Apples) | (Cheese)    | 0.12    | 0.857143   | 2.380952 |
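The subset/superset logic mentioned above isn't covered by the basic filter in the previous cell. Here is a minimal sketch of one possible heuristic (not a standard mlxtend feature): drop a rule when a simpler rule exists with the same consequent, an antecedent that is a proper subset of this rule's antecedent, and at least as much lift.

def prune_superset_rules(rules_df):
    """Drop rules dominated by a simpler rule with the same consequent and >= lift."""
    keep = []
    for i, row in rules_df.iterrows():
        dominated = False
        for j, other in rules_df.iterrows():
            if i == j:
                continue
            same_consequent = other['consequents'] == row['consequents']
            simpler_antecedent = other['antecedents'] < row['antecedents']  # proper subset (frozensets)
            if same_consequent and simpler_antecedent and other['lift'] >= row['lift']:
                dominated = True
                break
        if not dominated:
            keep.append(i)
    return rules_df.loc[keep]

pruned_rules = prune_superset_rules(rules)
print(f"🧹 {len(rules)} rules → {len(pruned_rules)} rules after superset pruning")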
📊 Visualizations¶
🧱 Heatmaps, Matrix¶
import seaborn as sns
import matplotlib.pyplot as plt
co_matrix = basket.T @ basket  # item-by-item co-occurrence counts
plt.figure(figsize=(8, 6))
sns.heatmap(co_matrix, cmap="Blues", annot=False)
plt.title("🧱 Item Co-Occurrence Matrix")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
🕸 Network Graphs¶
📖 Click to Expand
This network graph visualizes association rules as nodes and edges:
- Nodes = items
- Edges = directional rule from A to B
Stronger rules = thicker or darker edges.
import networkx as nx
G = nx.DiGraph()
for _, row in rules.head(10).iterrows():
    for antecedent in row['antecedents']:
        for consequent in row['consequents']:
            G.add_edge(antecedent, consequent, weight=row['lift'])
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, k=0.5, seed=42)
edges = G.edges(data=True)
weights = [d['weight'] for _, _, d in edges]
nx.draw_networkx(G, pos,
with_labels=True,
node_color='lightblue',
edge_color=weights,
edge_cmap=plt.cm.Blues,
width=2.0
)
plt.title("🕸 Association Rule Network")
plt.tight_layout()
plt.show()
📊 Bar Chart of Top Rules¶
top_rules = rules.sort_values(by='lift', ascending=False).head(10)
labels = [f"{', '.join(ant)} → {', '.join(con)}"
for ant, con in zip(top_rules['antecedents'], top_rules['consequents'])]
plt.figure(figsize=(10, 4))
plt.barh(labels, top_rules['lift'], color='purple')
plt.xlabel("Lift")
plt.title("📊 Top 10 Rules by Lift")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()