📄 Introduction¶
In Natural Language Processing (NLP), machines must convert raw text into a format they can understand — numbers. This process is called vectorization, where we represent words, phrases, or documents as numerical vectors.
This notebook explores two foundational vectorization techniques:
- Bag of Words (BoW) – represents text using raw word counts.
- TF-IDF (Term Frequency–Inverse Document Frequency) – adjusts raw counts by penalizing common words and rewarding unique ones.
These techniques are:
- Simple yet powerful
- Widely used in baseline models
- Often used as inputs to traditional ML algorithms like Logistic Regression or Naive Bayes
We’ll cover:
- How these methods work internally
- How to implement them with `sklearn`
- Practical considerations like vocabulary pruning, stopwords, and n-gram handling
This is your first step from “text” to “features” in any NLP pipeline.
📦 Bag of Words (BoW)¶
🏗️ CountVectorizer Mechanics¶
The `CountVectorizer` in `sklearn` transforms a corpus into a document-term matrix using raw word counts.
- Each row = a document
- Each column = a unique token
- Each cell = how often the token appears in that document
No weighting, no normalization — just raw frequency counts.
Common parameters:
- `stop_words='english'`: removes common English stopwords
- `max_features=1000`: restricts the vocabulary to the top 1,000 terms by frequency
- `min_df=2`: ignores words that appear in only one document
- `ngram_range=(1, 2)`: includes unigrams and bigrams
This forms the backbone of the BoW model, which is simple, interpretable, and fast — ideal for quick baselines.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# Sample corpus
corpus = [
"The cat sat on the mat.",
"The dog chased the cat.",
"The cat climbed the tree."
]
# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
# Fit-transform
X = vectorizer.fit_transform(corpus)
# View as DataFrame
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
🔢 N-grams¶
N-grams are contiguous sequences of n items (usually words) from text.
Instead of representing only individual words (unigrams), we can also capture bigrams, trigrams, etc.
Why use them?
- Capture word context: "New York" ≠ "New" + "York"
- Improve model accuracy, especially for short texts
Tradeoffs:
- Sharply increase dimensionality (many more unique token combinations)
- Need to manage sparsity and overfitting
Use `ngram_range=(1, 2)` to include unigrams and bigrams.
# Bigrams + unigrams
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2), stop_words='english')
X_ngram = vectorizer_ngram.fit_transform(corpus)
# View result
pd.DataFrame(X_ngram.toarray(), columns=vectorizer_ngram.get_feature_names_out())
|   | cat | cat climbed | cat sat | chased | chased cat | climbed | climbed tree | dog | dog chased | mat | sat | sat mat | tree |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 |
| 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| 2 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
🔠 Vocabulary Size & Pruning¶
Real-world corpora often contain:
- Rare words (typos, unique names)
- Overly common words (generic fillers)
To manage these:
- `min_df=2`: removes words that appear in fewer than 2 documents
- `max_df=0.9`: removes words that appear in more than 90% of documents
- `max_features=1000`: keeps only the top 1,000 most frequent terms
This reduces noise, speeds up training, and improves generalization.
# Loosen pruning thresholds for toy data
vectorizer_pruned = CountVectorizer(
stop_words='english',
min_df=1, # allow terms in at least 1 doc
max_df=1.0 # allow all terms
)
X_pruned = vectorizer_pruned.fit_transform(corpus)
# View result
pd.DataFrame(X_pruned.toarray(), columns=vectorizer_pruned.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
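For contrast, here is a minimal sketch of what the stricter thresholds described above would do on this tiny corpus (the variable name `vectorizer_strict` is just illustrative): with `min_df=2`, only "cat" appears in at least two documents, so every other term is pruned.
# Stricter pruning on the toy corpus: min_df=2 keeps only terms
# that appear in at least 2 of the 3 documents -- here, just "cat"
vectorizer_strict = CountVectorizer(stop_words='english', min_df=2)
X_strict = vectorizer_strict.fit_transform(corpus)
print(vectorizer_strict.get_feature_names_out())  # ['cat']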
📈 TF-IDF¶
📚 What is TF-IDF?¶
TF-IDF stands for Term Frequency–Inverse Document Frequency. It improves upon raw word counts by down-weighting frequent terms and up-weighting rare but informative ones.
Formula:
- $\text{TF}(t, d)$: frequency of term $t$ in document $d$
- $\text{IDF}(t) = \log(N / \text{df}_t)$: inverse document frequency, where $N$ = total number of documents and $\text{df}_t$ = number of documents containing term $t$

Final score: $ \text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t) $
Why use it?
- Common words like "the", "is" appear everywhere → low IDF → lower weight
- Rare but specific words get boosted → better features for classification
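To make the formula concrete, here is a minimal sketch that applies the textbook definition above to the stopword-filtered tokens of the toy corpus (the token lists are written out by hand). Note that sklearn's `TfidfVectorizer` uses a slightly different variant by default (smoothed IDF and L2 normalization), so its numbers in the next section will not match these exactly.
import math

# Stopword-filtered tokens of the toy corpus, written out by hand
docs_tokens = [
    ["cat", "sat", "mat"],
    ["dog", "chased", "cat"],
    ["cat", "climbed", "tree"],
]
N = len(docs_tokens)

def tf(term, doc):
    return doc.count(term)

def idf(term):
    df_t = sum(term in doc for doc in docs_tokens)
    return math.log(N / df_t)

# "cat" appears in every document -> IDF = log(3/3) = 0 -> weight 0
# "mat" appears in one document  -> IDF = log(3/1) ≈ 1.10 -> weight boosted
print(tf("cat", docs_tokens[0]) * idf("cat"))
print(tf("mat", docs_tokens[0]) * idf("mat"))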
🔧 Tfidf Vectorizer Implementation¶
`TfidfVectorizer` in `sklearn` combines:
- Tokenization and preprocessing (like CountVectorizer)
- TF-IDF weight calculation
By default:
- `use_idf=True`: enables IDF scaling
- `smooth_idf=True`: adds 1 to document frequencies to prevent divide-by-zero
- `sublinear_tf=False`: uses raw term counts instead of log-scaling
It’s the most common feature generator for classic ML pipelines in NLP.
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF with defaults
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
# View as DataFrame
pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 0.385372 | 0.000000 | 0.000000 | 0.000000 | 0.652491 | 0.652491 | 0.000000 |
| 1 | 0.385372 | 0.652491 | 0.000000 | 0.652491 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.385372 | 0.000000 | 0.652491 | 0.000000 | 0.000000 | 0.000000 | 0.652491 |
🎛️ Adjusting TF/IDF Weights¶
You can customize how TF and IDF are calculated in `TfidfVectorizer`:
- `sublinear_tf=True`: applies log(1 + tf) scaling
- `smooth_idf=False`: disables IDF smoothing
- `norm=None`: disables vector normalization
- `max_df`, `min_df`: prune common or rare terms
- `ngram_range`: applies TF-IDF over bigrams/trigrams
These adjustments can drastically change model performance. For sparse or noisy datasets, tuning `min_df`, `max_df`, and `sublinear_tf` is often crucial.
# Custom TF-IDF settings
tfidf_custom = TfidfVectorizer(
stop_words='english',
sublinear_tf=True,
smooth_idf=False,
norm=None,
min_df=1,
max_df=1.0
)
X_tfidf_custom = tfidf_custom.fit_transform(corpus)
# View results
pd.DataFrame(X_tfidf_custom.toarray(), columns=tfidf_custom.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.098612 | 2.098612 | 0.000000 |
| 1 | 1.0 | 2.098612 | 0.000000 | 2.098612 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 1.0 | 0.000000 | 2.098612 | 0.000000 | 0.000000 | 0.000000 | 2.098612 |
⚙️ Vectorizer Options & Preprocessing¶
🔤 Lowercasing, Strip Accents, Token Patterns¶
Both `CountVectorizer` and `TfidfVectorizer` offer options to control how raw text is preprocessed before vectorization.
Key arguments:
- `lowercase=True`: converts all text to lowercase (default)
- `strip_accents='unicode'`: removes accents, e.g. "é" → "e"
- `token_pattern=r'\b\w\w+\b'`: regex pattern to match tokens (e.g., words with 2+ characters)
These help normalize text and reduce vocabulary noise.
vectorizer_clean = CountVectorizer(
lowercase=True,
strip_accents='unicode',
token_pattern=r'\b\w\w+\b', # removes single-letter tokens
stop_words='english'
)
X_clean = vectorizer_clean.fit_transform(corpus)
pd.DataFrame(X_clean.toarray(), columns=vectorizer_clean.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
🧰 Custom Tokenizer / Analyzer¶
You can override `CountVectorizer`'s default tokenizer with a custom function.
- `tokenizer=...`: custom token-splitting logic
- `analyzer=...`: full custom analyzer pipeline (tokenize + process)
- `preprocessor=...`: raw string manipulation before tokenization
Useful for:
- Lemmatization or stemming
- Regex-based token splitting
- Handling emojis, hashtags, etc.
Set `token_pattern=None` when using a custom tokenizer; otherwise sklearn warns that the pattern will be ignored.
import re
from sklearn.feature_extraction.text import CountVectorizer
# Custom tokenizer: split by words and keep hashtags
def custom_tokenizer(text):
return re.findall(r'#?\b\w\w+\b', text.lower())
custom_vec = CountVectorizer(
tokenizer=custom_tokenizer,
token_pattern=None, # must be None when using custom tokenizer
stop_words='english'
)
X_custom = custom_vec.fit_transform(corpus)
pd.DataFrame(X_custom.toarray(), columns=custom_vec.get_feature_names_out())
|   | cat | chased | climbed | dog | mat | sat | tree |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 2 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
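The bullet list above also mentions stemming; here is a minimal sketch of a stemming tokenizer, assuming `nltk` is installed (the function name `stemming_tokenizer` is just illustrative).
from nltk.stem import PorterStemmer  # assumes nltk is installed

stemmer = PorterStemmer()

def stemming_tokenizer(text):
    # Regex-split into words of 2+ characters, then stem each token
    return [stemmer.stem(tok) for tok in re.findall(r'\b\w\w+\b', text.lower())]

stem_vec = CountVectorizer(tokenizer=stemming_tokenizer, token_pattern=None)
X_stem = stem_vec.fit_transform(corpus)
print(stem_vec.get_feature_names_out())  # e.g. "chased" -> "chase", "climbed" -> "climb"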
🪪 Sparse Matrix Handling¶
🧮 Memory vs Interpretability¶
Text vectorization often results in very large sparse matrices, especially with BoW or TF-IDF on real-world corpora.
- Thousands of features
- 95%+ zero entries
🧠 Tradeoff:
- Sparse matrices = memory-efficient, but hard to interpret directly
- Dense matrices = intuitive but expensive
Scikit-learn returns a `scipy.sparse.csr_matrix` by default. You can:
- Use `.shape` and `.nnz` for size/stats
- Use `.toarray()` to convert to dense (careful with large data)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_sparse = vectorizer.fit_transform(corpus)
print("Shape:", X_sparse.shape)
print("Non-zero entries:", X_sparse.nnz)
print("Sparsity: {:.2f}%".format(100 * (1 - X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1]))))
Shape: (3, 7) Non-zero entries: 9 Sparsity: 57.14%
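To put a number on the memory tradeoff, a quick sketch comparing the CSR storage to its dense equivalent (on this toy corpus the difference is negligible; on real corpora the dense version is typically orders of magnitude larger):
# CSR stores only the non-zero values plus their index arrays
sparse_bytes = X_sparse.data.nbytes + X_sparse.indices.nbytes + X_sparse.indptr.nbytes
dense_bytes = X_sparse.toarray().nbytes
print("Sparse storage (bytes):", sparse_bytes)
print("Dense storage (bytes): ", dense_bytes)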
🧹 Dimensionality Reduction (optional preview)¶
Sparse matrices from BoW/TF-IDF are often:
- High-dimensional
- Noisy
- Hard for downstream models to interpret
You can reduce dimensionality with:
- TruncatedSVD: great for sparse input (LSA-style)
- PCA: works only with dense input
- UMAP / t-SNE: for visualization, not modeling
This step is optional, but useful when:
- You want to visualize text clusters
- You want to feed reduced features into ML models
from sklearn.decomposition import TruncatedSVD
# Apply SVD to TF-IDF vectors
svd = TruncatedSVD(n_components=2, random_state=42)
X_reduced = svd.fit_transform(X_tfidf)
# View as DataFrame
pd.DataFrame(X_reduced, columns=['svd_1', 'svd_2'])
|   | svd_1 | svd_2 |
|---|---|---|
| 0 | 0.657526 | -1.687260e-17 |
| 1 | 0.657526 | -6.524909e-01 |
| 2 | 0.657526 | 6.524909e-01 |
🧠 Feature Interpretation¶
🔍 Top Words by Class¶
In classification tasks, analyzing the most frequent or distinctive words per class helps uncover patterns in the data. Common techniques:
- Use `CountVectorizer` or `TfidfVectorizer` with `fit_transform(X_train)`
- Split the data by class
- Sum the word counts within each class and sort
This gives insight into:
- Class-specific jargon
- Topic-related words
- Potential bias or leakage
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Fake labeled data for demo
docs = [
"great movie and fantastic acting",
"boring plot and bad acting",
"what a wonderful film",
"awful and dull",
"loved it, great performance",
"hated it, terrible movie"
]
labels = [1, 0, 1, 0, 1, 0] # 1 = positive, 0 = negative
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)
features = np.array(vectorizer.get_feature_names_out())
# Separate by class
pos_rows = X[np.array(labels) == 1]
neg_rows = X[np.array(labels) == 0]
# Sum term frequencies by class
pos_freq = np.asarray(pos_rows.sum(axis=0)).flatten()
neg_freq = np.asarray(neg_rows.sum(axis=0)).flatten()
# Top 5 per class
top_pos = features[np.argsort(pos_freq)[-5:]]
top_neg = features[np.argsort(neg_freq)[-5:]]
print("Top words in positive class:", top_pos[::-1])
print("Top words in negative class:", top_neg[::-1])
Top words in positive class: ['great' 'wonderful' 'performance' 'movie' 'loved'] Top words in negative class: ['terrible' 'plot' 'movie' 'hated' 'dull']
🎯 Important Features in Classification¶
After training a linear model like Logistic Regression, we can inspect feature weights to understand which words push predictions toward a class. High-weighted positive features = strong predictors for that class.
This is useful for:
- Model explainability
- Debugging data leakage
- Extracting interpretable rules
Works best with:
- TF-IDF inputs
- Linear models (LogisticRegression, SGDClassifier)
from sklearn.linear_model import LogisticRegression
# Use TF-IDF for modeling
tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf.fit_transform(docs)
model = LogisticRegression()
model.fit(X_tfidf, labels)
# Get feature weights
feature_names = np.array(tfidf.get_feature_names_out())
coefs = model.coef_[0]
top_pos_coef = feature_names[np.argsort(coefs)[-5:]]
top_neg_coef = feature_names[np.argsort(coefs)[:5]]
print("Most positive influence:", top_pos_coef[::-1])
print("Most negative influence:", top_neg_coef)
Most positive influence: ['great' 'wonderful' 'film' 'fantastic' 'performance'] Most negative influence: ['awful' 'dull' 'hated' 'terrible' 'bad']
🧪 Edge Cases & Practical Tips¶
📉 Extremely Sparse Inputs¶
When the vocabulary is large and most documents are short, vectorized matrices become extremely sparse — mostly zeros. Pitfalls:
- Models may overfit noise
- Similarity scores (e.g. cosine) become unstable
- Distance-based models like KNN perform poorly
Tips:
- Use `max_features`, `min_df` to limit vocabulary
- Try `TruncatedSVD` or `SelectKBest` for dimensionality reduction (see the `SelectKBest` sketch below)
- Prefer models that handle sparsity well (e.g., Logistic Regression, Naive Bayes)
print("Shape:", X_tfidf.shape)
print("Sparsity: {:.2f}%".format(100 * (1 - X_tfidf.nnz / (X_tfidf.shape[0] * X_tfidf.shape[1]))))
Shape: (6, 15) Sparsity: 80.00%
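As referenced in the tips above, here is a minimal `SelectKBest` sketch, assuming the small sentiment demo (`docs`, `labels`, `tfidf`, `X_tfidf`) from the Feature Interpretation section is still in scope.
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 5 terms most associated with the labels (chi-squared test on non-negative TF-IDF)
selector = SelectKBest(chi2, k=5)
X_selected = selector.fit_transform(X_tfidf, labels)
kept_terms = np.array(tfidf.get_feature_names_out())[selector.get_support()]
print("Shape after selection:", X_selected.shape)
print("Kept terms:", kept_terms)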