import pandas as pd
import re
import nltk

# Download only what we still use
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read the review dataset, discard rows whose 'text' value is missing, and
# renumber the index so it is contiguous again.
df = (
    pd.read_csv('datasets/amazon.csv')
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

# Shared NLP resources read by clean_text: a WordNet lemmatizer and the
# English stopword list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# 🔧 Updated cleaning function (no NLTK tokenizer)
def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is deleted, words of three or more letters are extracted,
    stopwords are dropped, and the remaining words are lemmatized.

    Cleaning is best-effort: a failure is reported with ``print`` and an
    empty string is returned, so one bad row cannot abort an ``apply`` over
    a whole column.

    Args:
        text: The raw document. Any value that is not a ``str`` (``None``,
            ``NaN``, numbers, ...) is reported and treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when the input
        is not a string, yields no tokens, or cleaning fails.
    """
    if not isinstance(text, str):
        # Slicing a non-string inside the error message would itself raise,
        # so non-string values are reported through str() and skipped here.
        print(
            f"[Error cleaning text]: {str(text)[:50]}...\n"
            f"Reason: expected str, got {type(text).__name__}"
        )
        return ""
    try:
        lowered = text.lower()
        # Non-letters are deleted, not replaced by a space, so a contraction
        # such as "don't" collapses into the single token "dont".
        letters_only = re.sub(r'[^a-z\s]', '', lowered)
        tokens = re.findall(r'\b[a-z]{3,}\b', letters_only)  # Only words with 3+ letters
        tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
        return ' '.join(tokens)
    except Exception as e:
        # Deliberate catch-all: report the offending raw input and carry on.
        print(f"[Error cleaning text]: {text[:50]}...\nReason: {e}")
        return ""

# Apply cleaning: run clean_text on every value of the 'text' column and
# store the results in a new 'clean_text' column.
df['clean_text'] = df['text'].apply(clean_text)

# Preview the raw and cleaned text side by side for the first rows
df[['text', 'clean_text']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Documents to vectorize: the cleaned text only
texts = df['clean_text'].tolist()

# Both vectorizers are configured identically: max_df=0.95, min_df=2 and a
# vocabulary capped at 1000 features.
vectorizer_options = {'max_df': 0.95, 'min_df': 2, 'max_features': 1000}

# Document-term matrix of raw counts
count_vectorizer = CountVectorizer(**vectorizer_options)
dtm_count = count_vectorizer.fit_transform(texts)

# Document-term matrix of TF-IDF weights
tfidf_vectorizer = TfidfVectorizer(**vectorizer_options)
dtm_tfidf = tfidf_vectorizer.fit_transform(texts)

# Report the shape of each matrix
for label, matrix in (("Count DTM shape:", dtm_count), ("TF-IDF DTM shape:", dtm_tfidf)):
    print(label, matrix.shape)

Count DTM shape: (20000, 1000)
TF-IDF DTM shape: (20000, 1000)

from sklearn.decomposition import LatentDirichletAllocation

# The count matrix is rebuilt here with the same settings as the earlier
# vectorization step, so this cell also works when that step was not run.
from sklearn.feature_extraction.text import CountVectorizer

texts = df['clean_text'].tolist()

count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000)
dtm_count = count_vectorizer.fit_transform(texts)

# Fit a 10-topic LDA model on the counts; fit() returns the estimator itself.
# random_state is fixed so repeated runs produce the same topics.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42).fit(dtm_count)

# Display top words per topic
def display_topics(model, feature_names, top_n=10):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated row by row and
            each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        top_n: Number of terms to print per topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it and keep the leading top_n.
        ranked = weights.argsort()[::-1][:top_n]
        words = ', '.join(feature_names[j] for j in ranked)
        print(f"Topic {number}: {words}")

# Print the top terms (display_topics defaults to 10 per topic) of each topic
# in the fitted LDA model, using the count vectorizer's vocabulary.
print("🧠 Top Words per Topic:\n")
display_topics(lda_model, count_vectorizer.get_feature_names_out())

🧠 Top Words per Topic:

Topic 1: game, fun, play, like, level, time, great, get, playing, really
Topic 2: app, use, easy, love, great, bible, recommend, like, read, day
Topic 3: dont, get, app, like, game, know, bird, free, people, even
Topic 4: app, alarm, free, version, use, set, one, clock, great, ive
Topic 5: love, game, old, fun, kid, app, great, play, year, time
Topic 6: app, download, get, video, free, love, music, like, awesome, want
Topic 7: kindle, fire, work, app, great, got, problem, well, downloaded, close
Topic 8: app, like, used, good, one, great, work, better, ive, find
Topic 9: app, would, get, star, work, cant, time, review, amazon, even
Topic 10: app, phone, use, great, work, apps, one, need, calendar, like

import matplotlib.pyplot as plt

# Candidate topic counts: every even number from 2 through 20
k_values = list(range(2, 21, 2))
scores = []

# Fit one LDA model per candidate and record its perplexity.
# NOTE(review): perplexity is evaluated on dtm_count, the same matrix each
# model is fitted on, so these are in-sample scores rather than held-out ones.
for k in k_values:
    lda = LatentDirichletAllocation(n_components=k, random_state=42).fit(dtm_count)
    scores.append(lda.perplexity(dtm_count))

# Line chart of perplexity against the number of topics
plt.plot(k_values, scores, marker='o')
plt.title("Perplexity vs Number of Topics")
plt.xlabel("n_topics")
plt.ylabel("Perplexity (lower is better)")
plt.grid(True)
plt.show()

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix for NMF, rebuilt here so this cell can run without the
# earlier vectorization step.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)
dtm_tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])

# Fit a 10-component NMF model on the TF-IDF weights; fit() returns the
# estimator itself.
nmf_model = NMF(n_components=10, random_state=42).fit(dtm_tfidf)

# Display top words
def display_nmf_topics(model, feature_names, top_n=10):
    """Print, for each component of a fitted model, its strongest terms.

    Args:
        model: Object exposing ``components_``; it is iterated row by row and
            each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        top_n: Number of terms to list per topic.
    """
    for position, component in enumerate(model.components_):
        # Indices ordered from the largest weight to the smallest.
        descending = component.argsort()[::-1]
        selected = descending[:top_n]
        labels = [feature_names[col] for col in selected]
        joined = ', '.join(labels)
        print(f"Topic {position + 1}: {joined}")

# Print the top terms (display_nmf_topics defaults to 10 per topic) of each
# NMF component, using the TF-IDF vectorizer's vocabulary.
print("📌 NMF Top Words per Topic:\n")
display_nmf_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())

📌 NMF Top Words per Topic:

Topic 1: app, free, would, one, apps, good, day, best, download, got
Topic 2: game, play, playing, one, level, time, good, played, graphic, would
Topic 3: kindle, fire, work, downloaded, problem, doesnt, browser, video, got, well
Topic 4: use, easy, bible, simple, time, note, find, make, list, quick
Topic 5: love, old, kid, much, year, bird, awesome, also, angry, music
Topic 6: great, work, well, tablet, way, keep, phone, graphic, thanks, lot
Topic 7: fun, lot, play, much, really, bird, kid, challenging, playing, angry
Topic 8: like, really, good, one, would, look, thing, lot, better, didnt
Topic 9: get, dont, time, waste, even, know, cant, say, stupid, people
Topic 10: alarm, clock, set, wake, time, day, different, one, option, snooze

# Print both models' topics one after the other for a side-by-side read.
# Each model is paired with the vectorizer whose matrix it was fitted on.
print("🔁 Top Words from LDA Topics:\n")
display_topics(lda_model, count_vectorizer.get_feature_names_out())

print("\n🧮 Top Words from NMF Topics:\n")
display_nmf_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())

🔁 Top Words from LDA Topics:

Topic 1: game, fun, play, like, level, time, great, get, playing, really
Topic 2: app, use, easy, love, great, bible, recommend, like, read, day
Topic 3: dont, get, app, like, game, know, bird, free, people, even
Topic 4: app, alarm, free, version, use, set, one, clock, great, ive
Topic 5: love, game, old, fun, kid, app, great, play, year, time
Topic 6: app, download, get, video, free, love, music, like, awesome, want
Topic 7: kindle, fire, work, app, great, got, problem, well, downloaded, close
Topic 8: app, like, used, good, one, great, work, better, ive, find
Topic 9: app, would, get, star, work, cant, time, review, amazon, even
Topic 10: app, phone, use, great, work, apps, one, need, calendar, like

🧮 Top Words from NMF Topics:

Topic 1: app, free, would, one, apps, good, day, best, download, got
Topic 2: game, play, playing, one, level, time, good, played, graphic, would
Topic 3: kindle, fire, work, downloaded, problem, doesnt, browser, video, got, well
Topic 4: use, easy, bible, simple, time, note, find, make, list, quick
Topic 5: love, old, kid, much, year, bird, awesome, also, angry, music
Topic 6: great, work, well, tablet, way, keep, phone, graphic, thanks, lot
Topic 7: fun, lot, play, much, really, bird, kid, challenging, playing, angry
Topic 8: like, really, good, one, would, look, thing, lot, better, didnt
Topic 9: get, dont, time, waste, even, know, cant, say, stupid, people
Topic 10: alarm, clock, set, wake, time, day, different, one, option, snooze

# LDA assignments: transform() yields one row of topic weights per document;
# the column holding the largest weight becomes the document's dominant topic.
lda_doc_topic_dist = lda_model.transform(dtm_count)
df['lda_topic'] = lda_doc_topic_dist.argmax(axis=1) + 1  # +1 for human-friendly topic numbers

# NMF assignments, derived the same way from the TF-IDF matrix
nmf_doc_topic_dist = nmf_model.transform(dtm_tfidf)
df['nmf_topic'] = nmf_doc_topic_dist.argmax(axis=1) + 1
# NOTE(review): argmax picks the first column on ties, so a document whose
# weights are all equal (presumably one with no in-vocabulary terms) would be
# tagged as topic 1 — confirm that is acceptable.

# Preview tagged docs
df[['text', 'lda_topic', 'nmf_topic']].head(10)

from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Split each cleaned document back into its whitespace-separated tokens
tokenized_docs = list(map(str.split, df['clean_text']))

# Gensim dictionary (token <-> id) and bag-of-words corpus
id2word = Dictionary(tokenized_docs)
corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

# Describe every sklearn LDA topic by its 10 highest-weighted terms, which is
# the plain list-of-words form passed to CoherenceModel below. The feature
# names are fetched once instead of once per topic.
lda_vocab = count_vectorizer.get_feature_names_out()
lda_topics = [
    [lda_vocab[i] for i in weights.argsort()[::-1][:10]]
    for weights in lda_model.components_
]

# Compute c_v coherence of those word lists against the tokenized documents
coherence_lda = CoherenceModel(
    coherence='c_v',
    topics=lda_topics,
    texts=tokenized_docs,
    dictionary=id2word,
)

coherence_score = coherence_lda.get_coherence()
print(f"🧠 LDA Coherence Score (c_v): {coherence_score:.4f}")

🧠 LDA Coherence Score (c_v): 0.4363

import pyLDAvis
from sklearn.preprocessing import normalize

# Inline display
pyLDAvis.enable_notebook()

# Build the pyLDAvis inputs by hand from the sklearn model.
# prepare() takes probability distributions, i.e. rows that sum to 1.
# sklearn's normalize() defaults to the L2 norm, which does not give rows
# summing to 1, so the L1 norm is requested explicitly.
topic_term_dists = normalize(lda_model.components_, norm='l1')
doc_topic_dists = normalize(lda_model.transform(dtm_count), norm='l1')
# Row sums give the token count of each document; column sums give each
# term's corpus-wide count. .A.ravel() flattens the summed matrix to 1-D.
doc_lengths = dtm_count.sum(axis=1).A.ravel()
vocab = count_vectorizer.get_feature_names_out()
term_frequency = dtm_count.sum(axis=0).A.ravel()

# Use the raw prepare function exposed in main pyLDAvis module
vis_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency
)

# Show it
pyLDAvis.display(vis_data)

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_wordclouds(model, feature_names, n_topics=10, top_n=30):
    """Draw one word cloud per topic, sized by each term's topic weight.

    Args:
        model: Object exposing ``components_``; only its first ``n_topics``
            rows are plotted and each row must support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_topics: Maximum number of topics to plot.
        top_n: Number of highest-weighted terms included in each cloud.
    """
    for number, weights in enumerate(model.components_[:n_topics], start=1):
        # Highest-weighted term indices first.
        strongest = weights.argsort()[::-1][:top_n]
        frequencies = {}
        for col in strongest:
            frequencies[feature_names[col]] = weights[col]

        cloud = WordCloud(width=800, height=400, background_color='white')
        cloud = cloud.generate_from_frequencies(frequencies)

        plt.figure(figsize=(10, 4))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"Topic {number}")
        plt.show()

# Plot for LDA (using count_vectorizer): one cloud per topic, with the
# function's defaults of up to 10 topics and 30 terms each.
plot_wordclouds(lda_model, count_vectorizer.get_feature_names_out())

	text	clean_text
0	This is a one of the best apps acording to a b...	one best apps acording bunch people agree bomb...
1	This is a pretty good version of the game for ...	pretty good version game free lot different le...
2	this is a really cool game. there are a bunch ...	really cool game bunch level find golden egg s...
3	This is a silly game and can be frustrating, b...	silly game frustrating lot fun definitely reco...
4	This is a terrific game on any pad. Hrs of fun...	terrific game pad hr fun grandkids love great ...

Feature	LDA	NMF
Type	Probabilistic	Matrix factorization
Input	Count	TF-IDF
Output	Topic probabilities	Topic weights
Speed	Slower	Faster
Interpretability	Often better for LDA	Clean when using TF-IDF
Library	sklearn / gensim	sklearn

	text	lda_topic	nmf_topic
0	This is a one of the best apps acording to a b...	6	1
1	This is a pretty good version of the game for ...	1	2
2	this is a really cool game. there are a bunch ...	1	7
3	This is a silly game and can be frustrating, b...	1	7
4	This is a terrific game on any pad. Hrs of fun...	5	5
5	This is a very entertaining game! You don't h...	1	2
6	this is awesome and you don't need wi ti to pl...	3	9
7	this is awesome I bet no one even reads the re...	3	2
8	This is basicly the free version but with ads....	3	8
9	this is by far the best free app that is avail...	6	9

🧾 Topic Modeling¶

📄 Introduction¶

📦 Text Preprocessing for Topic Modeling¶

🧼 Cleaning for Unsupervised Models¶

🧹 Why Cleaning Matters More for Topic Modeling¶

🔁 Vectorization with Count/TF-IDF¶

🧾 Choosing Between Count and TF-IDF¶

📊 Latent Dirichlet Allocation (LDA)¶

🧠 LDA Intuition & Assumptions¶

🤯 What is LDA?¶

🧰 LDA Is Useful When:¶

⚙️ LDA Implementation (Sklearn / Gensim)¶

🧪 LDA via Scikit-Learn vs. Gensim¶

🔍 Tuning n_topics¶

🎛️ How Many Topics to Use?¶

🧮 Non-negative Matrix Factorization (NMF)¶

📘 NMF vs LDA¶

🔍 How NMF Differs from LDA¶

🔄 Summary: NMF vs LDA¶

⚙️ NMF Implementation¶

🛠️ NMF in Scikit-Learn¶

🧠 Interpreting Topics¶

📝 Top Words per Topic¶

🧾 What Makes a Topic Interpretable?¶

🧵 Assigning Topics to Documents¶

📌 Tagging Each Document with a Dominant Topic¶

📈 Topic Coherence & Quality Metrics¶

📊 Coherence Scores¶

📐 What is Topic Coherence?¶

🎯 Perplexity & Limitations¶

🎭 Why Perplexity Alone Isn’t Enough¶

🗂️ Visualizing Topics¶

📦 pyLDAvis¶

🔮 Interactive LDA Visualization¶

🖼️ Wordclouds by Topic¶

☁️ Wordclouds per Topic¶

🧪 Edge Cases & Troubleshooting¶

⚠️ Short Texts & Low Topic Separation¶

🪫 Why LDA Struggles with Short Texts¶

🧪 Poor Topic Coherence¶

🤷 Why Topics Sometimes Suck¶

🔍 Tuning `n_topics`¶