
🧾 Text Cleaning & Parsing¶

  • 🔍 Tokenization
  • 🧼 Text Cleaning
  • 🛑 Stopwords Removal
  • 🔁 Lemmatization vs Stemming
  • 🧠 POS Tagging & Named Entity Recognition
  • 🔗 Dependency Parsing & Chunking
  • 🧪 Edge Cases & Text Cleaning Tips

🔍 Tokenization¶

What is Tokenization?¶

Tokenization is the process of splitting a longer piece of text into smaller units — either sentences or words — so that we can analyze and process them more easily.

Types of Tokenization¶
  • Sentence Tokenization → Splits paragraphs into individual sentences.
  • Word Tokenization → Splits sentences into individual words.

This step helps us prepare raw text for downstream NLP tasks like vectorization, sentiment analysis, topic modeling, etc.

Example:¶

Input:
"This app is awesome. I use it daily!"

Sentence Tokens:
["This app is awesome.", "I use it daily!"]

Word Tokens:
["This", "app", "is", "awesome", "I", "use", "it", "daily"]

🧱 Word Tokenization¶

What is Word Tokenization?¶

Word tokenization splits a sentence into individual words, typically using whitespace or regex rules, and usually discards punctuation. Many classic NLP pipelines rely on word-level tokens as their base unit.

Example:¶

Input:
"This app is awesome!"

Output:
["This", "app", "is", "awesome"]

In [39]:
import pandas as pd
# Load the dataset
df = pd.read_csv("datasets/amazon.csv")

# Preview the data
print(df.shape)
df.head()
(20000, 2)
Out[39]:
text sentiment
0 This is a one of the best apps acording to a b... 1
1 This is a pretty good version of the game for ... 1
2 this is a really cool game. there are a bunch ... 1
3 This is a silly game and can be frustrating, b... 1
4 This is a terrific game on any pad. Hrs of fun... 1
In [40]:
import re

sample_review = df['text'].iloc[0]
print("Original Review:\n", sample_review)

# Word tokenization using regex
word_tokens = re.findall(r'\b\w+\b', sample_review.lower())

print("\nTokenized Words:")
print(word_tokens)
Original Review:
 This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff

Tokenized Words:
['this', 'is', 'a', 'one', 'of', 'the', 'best', 'apps', 'acording', 'to', 'a', 'bunch', 'of', 'people', 'and', 'i', 'agree', 'it', 'has', 'bombs', 'eggs', 'pigs', 'tnt', 'king', 'pigs', 'and', 'realustic', 'stuff']

✂️ Sentence Tokenization¶

What is Sentence Tokenization?¶

This breaks long text into individual sentences using punctuation like ., ?, and ! as delimiters. Useful when the meaning or structure varies sentence by sentence.

Example:¶

Input:
"This app is awesome. I use it every day!"

Output:
["This app is awesome.", "I use it every day!"]

In [41]:
# Sentence tokenization using regex
sentence_tokens = re.split(r'(?<=[.!?]) +', sample_review)

print("Tokenized Sentences:")
for i, sent in enumerate(sentence_tokens):
    print(f"{i+1}. {sent}")
Tokenized Sentences:
1. This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff
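Only one sentence comes back because the sample review has no sentence-ending punctuation, so there is nothing for the regex to split on. On a multi-sentence string (made up here), the same pattern behaves as expected:

In [ ]:
demo = "This app is awesome. I use it every day! Would you recommend it?"
print(re.split(r'(?<=[.!?]) +', demo))
# ['This app is awesome.', 'I use it every day!', 'Would you recommend it?']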

Back to the top


🧼 Text Cleaning¶

What is Text Cleaning?¶

Text data is messy — it contains typos, inconsistent casing, punctuation, symbols, and other noise. Cleaning makes the text more uniform and easier to analyze or model.

Common Steps:¶
  • Lowercasing (standardization)
  • Removing punctuation and special characters
  • Regex-based normalization (e.g., removing URLs, extra whitespace)
Example:¶

"This app is AWESOME!! 😍😍 www.example.com" → "this app is awesome"

🔤 Lowercasing and Normalization¶

What is Lowercasing?¶

Converts all characters in the text to lowercase to avoid treating "Good" and "good" as different words.

Example:¶

"This App is GREAT!" → "this app is great!"

This step also sometimes includes basic normalization like removing extra whitespace or fixing encoding artifacts.

In [42]:
# Lowercasing text column
df['text_lower'] = df['text'].str.lower()

# Show before and after for a single row
print("Original:", df['text'].iloc[0])
print("Lowercased:", df['text_lower'].iloc[0])
Original: This is a one of the best apps acording to a bunch of people and I agree it has bombs eggs pigs TNT king pigs and realustic stuff
Lowercased: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff

✨ Punctuation & Special Character Removal¶

Why Remove Punctuation?¶

Punctuation doesn’t usually add value in basic NLP tasks like sentiment analysis or topic modeling. Removing it simplifies the vocabulary.

Example:¶

"Wow!!! This app is amazing :)" → "Wow This app is amazing "

Emoji and symbols can also be removed unless they are part of your feature set.

In [43]:
import string

# Remove punctuation
df['text_no_punct'] = df['text_lower'].str.replace(f"[{re.escape(string.punctuation)}]", "", regex=True)

print("Before:", df['text_lower'].iloc[0])
print("After:", df['text_no_punct'].iloc[0])
Before: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff
After: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff
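As noted above, emoji and other symbols can also be stripped when they aren't part of the feature set; string.punctuation only covers ASCII punctuation, so they survive the step above. A rough sketch using a Unicode-range regex (the ranges are an assumption covering common emoji blocks and may need tuning for your data):

In [ ]:
# Ranges below cover common emoji/pictograph blocks (assumed; extend as needed)
emoji_pattern = re.compile(
    "[\U0001F300-\U0001FAFF\U00002600-\U000027BF\U0001F1E6-\U0001F1FF]+"
)

sample = "this app is awesome 😍😍"
print(emoji_pattern.sub("", sample).strip())  # -> "this app is awesome"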

🧪 Regex-Based Cleaning¶

What is Regex Cleaning?¶

Regular expressions (regex) let us target and clean specific patterns like:

  • URLs
  • HTML tags
  • Extra whitespace
  • Repeated characters
Example:¶

"Check this out! https://amazon.com 😍 " → "check this out"

In [44]:
# Regex-based cleaning: remove URLs and excess whitespace
df['text_cleaned'] = df['text_no_punct'].str.replace(r"http\S+|www\S+|https\S+", "", regex=True)
df['text_cleaned'] = df['text_cleaned'].str.replace(r"\s+", " ", regex=True).str.strip()

print("Before:", df['text_no_punct'].iloc[0])
print("After:", df['text_cleaned'].iloc[0])
Before: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff
After: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff
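The list above also mentions HTML tags and repeated characters, which this cell doesn't touch. A hedged sketch of those two extra patterns on a made-up string (note that in practice URL removal is usually done before punctuation stripping, since stripping punctuation first mangles the URLs this regex looks for):

In [ ]:
html_pattern = re.compile(r"<[^>]+>")      # HTML tags such as <br> or <p>
repeat_pattern = re.compile(r"(.)\1{2,}")  # runs of 3+ identical characters

demo = "<p>sooooo goooood</p>"
demo = html_pattern.sub(" ", demo)         # drop the tags
demo = repeat_pattern.sub(r"\1\1", demo)   # squash long runs down to two chars
print(demo.strip())                        # -> "soo good"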

Back to the top


🛑 Stopwords Removal¶

What are Stopwords?¶

Stopwords are common words like “the”, “is”, “and”, “in” — they appear frequently but carry little semantic value.

Removing them reduces noise, shrinks vocabulary size, and improves signal for tasks like classification or topic modeling.

Example:¶

"this is the best app in the world"
→ "best app world"

📚 Using NLTK/spaCy Stopword Lists¶

Why use built-in stopwords?¶

NLTK and spaCy both offer curated stopword lists that can be applied directly to cleaned tokens. These help quickly eliminate unimportant filler words.

Example:¶

"this app is one of the best"
→ "app one best"

In [45]:
# !pip3 install nltk
import nltk
from nltk.corpus import stopwords

# Load NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Remove stopwords from cleaned text
df['text_no_stopwords'] = df['text_cleaned'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_words])
)

print("Before:", df['text_cleaned'].iloc[0])
print("After:", df['text_no_stopwords'].iloc[0])
Before: this is a one of the best apps acording to a bunch of people and i agree it has bombs eggs pigs tnt king pigs and realustic stuff
After: one best apps acording bunch people agree bombs eggs pigs tnt king pigs realustic stuff
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
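spaCy's stopword list, mentioned above, works the same way; a minimal sketch, assuming spaCy is installed (no language model is needed just for the stopword set, and its list is somewhat larger than NLTK's, so results can differ slightly):

In [ ]:
# !pip3 install spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

df['text_no_stopwords_spacy'] = df['text_cleaned'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in spacy_stopwords])
)

print("NLTK :", df['text_no_stopwords'].iloc[0])
print("spaCy:", df['text_no_stopwords_spacy'].iloc[0])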

🧰 Custom Stopword Handling¶

Why use custom stopwords?¶

Depending on your dataset, some words (like “app”, “product”, “phone”) may occur frequently but not help with classification.

You can define a custom list based on domain knowledge or frequency analysis.

Example:¶

"this app is very helpful and awesome" → "helpful awesome"
(if “app”, “very”, “this” are in custom list)

In [46]:
# Example: Custom stopwords for Amazon reviews
custom_stopwords = set(["app", "product", "amazon", "device", "really", "very"])

# Remove custom stopwords
df['text_no_custom_stopwords'] = df['text_no_stopwords'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in custom_stopwords])
)

print("Before:", df['text_no_stopwords'].iloc[0])
print("After:", df['text_no_custom_stopwords'].iloc[0])
Before: one best apps acording bunch people agree bombs eggs pigs tnt king pigs realustic stuff
After: one best apps acording bunch people agree bombs eggs pigs tnt king pigs realustic stuff
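The custom list above is hand-picked (and doesn't happen to hit this particular review). As mentioned, it can also be driven by frequency analysis; a rough sketch that surfaces the most common tokens in the corpus as candidates for the custom list:

In [ ]:
from collections import Counter

# Count tokens across the already stopword-filtered column
token_counts = Counter(
    word for review in df['text_no_stopwords'] for word in review.split()
)

# Inspect the most frequent tokens and promote domain filler words to the custom list
print(token_counts.most_common(20))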

Back to the top


🔁 Lemmatization vs Stemming¶

What are Lemmatization and Stemming?¶

Both reduce words to a base/root form. This helps unify variants like “running”, “ran”, “runs” → “run”.

  • Stemming → crude chopping of suffixes (fast, less accurate)
  • Lemmatization → uses linguistic context and vocab to find valid dictionary form (slower, more accurate)
Example:¶

"running", "ran", "runs"
→ Stemmed: "run", "ran", "run"
→ Lemmatized: "run", "run", "run"

🌱 Stemming (Porter, Snowball)¶

What is Stemming?¶

Stemming strips suffixes without understanding meaning. It's fast and simple but may produce non-words.

Popular algorithms: Porter, Snowball, Lancaster

Example:¶

"connection", "connected", "connecting"
→ "connect", "connect", "connect"

In [47]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Apply stemming
df['text_stemmed'] = df['text_no_custom_stopwords'].apply(
    lambda x: ' '.join([stemmer.stem(word) for word in x.split()])
)

print("Before:", df['text_no_custom_stopwords'].iloc[0])
print("After:", df['text_stemmed'].iloc[0])
Before: one best apps acording bunch people agree bombs eggs pigs tnt king pigs realustic stuff
After: one best app acord bunch peopl agre bomb egg pig tnt king pig realust stuff
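Porter is used above; Snowball and Lancaster (also mentioned above) are available in NLTK too. A small side-by-side sketch on a few sample words to compare how aggressively each algorithm chops:

In [ ]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

sample_words = ["connection", "connected", "connecting", "running", "apps"]
for name, st in [("Porter", PorterStemmer()),
                 ("Snowball", SnowballStemmer("english")),
                 ("Lancaster", LancasterStemmer())]:
    print(f"{name:10}", [st.stem(w) for w in sample_words])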

🍃 Lemmatization (WordNet, spaCy)¶

What is Lemmatization?¶

Lemmatization reduces words to their base dictionary form (lemma) using vocabulary and POS tags.

Tools like NLTK’s WordNetLemmatizer or spaCy do this using linguistic knowledge.

Example:¶

"am", "are", "is" → "be", "be", "be"

"better" → "good"

In [48]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# Apply lemmatization
df['text_lemmatized'] = df['text_no_custom_stopwords'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)

print("Before:", df['text_no_custom_stopwords'].iloc[0])
print("After:", df['text_lemmatized'].iloc[0])
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ashrithreddy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Before: one best apps acording bunch people agree bombs eggs pigs tnt king pigs realustic stuff
After: one best apps acording bunch people agree bomb egg pig tnt king pig realustic stuff
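By default, WordNetLemmatizer treats every word as a noun, which is why the "better" → "good" example above only happens when an adjective POS is supplied. A hedged sketch of POS-aware lemmatization that maps Penn Treebank tags to WordNet tags (assumes the POS tagger resource is downloaded):

In [ ]:
nltk.download('averaged_perceptron_tagger')  # or 'averaged_perceptron_tagger_eng' on newer NLTK

def to_wordnet_pos(tag):
    """Map a Penn Treebank tag to one of WordNet's four POS classes."""
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_with_pos(text):
    tagged = nltk.pos_tag(text.split())
    return ' '.join(lemmatizer.lemmatize(word, to_wordnet_pos(tag)) for word, tag in tagged)

print(lemmatizer.lemmatize("better"))           # 'better' (treated as a noun)
print(lemmatizer.lemmatize("better", pos="a"))  # 'good'
print(lemmatize_with_pos(df['text_no_custom_stopwords'].iloc[0]))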

Back to the top


🧠 POS Tagging & Named Entity Recognition¶

What are POS & NER?¶

These are advanced parsing techniques that extract grammatical roles and real-world entities from text.

  • POS Tagging (Part-of-Speech) → assigns a tag like noun, verb, adjective to each word
  • NER (Named Entity Recognition) → identifies names, places, orgs, dates, etc.
Why it matters:¶
  • Helps with filtering nouns, extracting subjects, building features
  • Useful in recommendation systems, summarization, question answering, etc.
Example:¶

Text: "Amazon was founded in Seattle in 1994"
→ POS: ("Amazon", NNP), ("founded", VBD), ...
→ NER: ORG = Amazon, GPE = Seattle, DATE = 1994

🏷️ POS Tagging¶

What is POS Tagging?¶

Part-of-Speech tagging assigns grammatical labels to each word in a sentence — like noun, verb, adjective, etc.

It's useful for extracting only the verbs or nouns, or for filtering words by their syntactic role.

Example:¶

"this app works beautifully"
→ [("this", DT), ("app", NN), ("works", VBZ), ("beautifully", RB)]

In [49]:
import nltk
nltk.data.path.append('/Users/ashrithreddy/nltk_data')  # point NLTK at the local data directory

# If the tagger resource is missing, uncomment one of the downloads below:
# nltk.data.find('taggers/averaged_perceptron_tagger')  # raises LookupError if missing
# nltk.download('popular', download_dir='/Users/ashrithreddy/nltk_data')
# nltk.download('averaged_perceptron_tagger_eng')       # resource name on newer NLTK versions

# POS tagging on lemmatized text
df['pos_tags'] = df['text_lemmatized'].apply(lambda x: nltk.pos_tag(x.split()))

print("Sample Tagged Sentence:")
print(df['pos_tags'].iloc[0])
Sample Tagged Sentence:
[('one', 'CD'), ('best', 'JJS'), ('apps', 'NN'), ('acording', 'VBG'), ('bunch', 'JJ'), ('people', 'NNS'), ('agree', 'VBP'), ('bomb', 'NN'), ('egg', 'NN'), ('pig', 'NN'), ('tnt', 'NN'), ('king', 'VBG'), ('pig', 'JJ'), ('realustic', 'JJ'), ('stuff', 'NN')]
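As noted above, a common use of POS tags is to keep only certain word classes. A small sketch that keeps just the nouns (all Penn Treebank noun tags start with 'NN'):

In [ ]:
# Keep only noun tokens (NN, NNS, NNP, NNPS) from each tagged review
df['nouns_only'] = df['pos_tags'].apply(
    lambda tags: [word for word, tag in tags if tag.startswith('NN')]
)

print(df['nouns_only'].iloc[0])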

🧾 Named Entity Recognition (NER)¶

What is NER?¶

NER identifies named entities in text like organizations, people, countries, dates, etc.

It builds structure from unstructured text, enabling high-level features like “brand mentioned”, “location detected”, etc.

Example:¶

"Jeff Bezos founded Amazon in 1994"
→ PERSON = Jeff Bezos, ORG = Amazon, DATE = 1994

In [57]:
import nltk
import pickle
from nltk.chunk import ChunkParserI
from nltk import pos_tag
from nltk.tokenize import TreebankWordTokenizer

# Force NLTK to look in your custom path
nltk.data.path.clear()
nltk.data.path.append('/Users/ashrithreddy/nltk_data')

# Load the chunker manually using correct encoding
chunker_path = '/Users/ashrithreddy/nltk_data/chunkers/maxent_ne_chunker/english_ace_multiclass.pickle'
with open(chunker_path, 'rb') as f:
    chunker: ChunkParserI = pickle.load(f, encoding='latin1')  # 'rb' already returns a buffered reader

# Tokenize and tag
text = df['text_lemmatized'].iloc[0]
tokens = TreebankWordTokenizer().tokenize(text)
pos_tags = pos_tag(tokens)

# Run NER
ner_tree = chunker.parse(pos_tags)

# Display entities
print("Named Entities:")
for subtree in ner_tree:
    if hasattr(subtree, 'label'):
        entity = " ".join([token for token, pos in subtree])
        print(f"{entity} → {subtree.label()}")
Named Entities:
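No entities come back here, which is expected: the text was lowercased and stripped of punctuation, and this chunker relies heavily on capitalization cues. A hedged alternative sketch using the standard nltk.ne_chunk helper on the original, uncleaned review text (assumes the 'maxent_ne_chunker' and 'words' resources are downloaded; newer NLTK versions name the chunker 'maxent_ne_chunker_tab'):

In [ ]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

raw_text = df['text'].iloc[0]  # original casing preserved
raw_tree = nltk.ne_chunk(pos_tag(TreebankWordTokenizer().tokenize(raw_text)))

print("Named Entities:")
for subtree in raw_tree:
    if hasattr(subtree, 'label'):
        print(" ".join(token for token, pos in subtree), "→", subtree.label())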

Back to the top


🔗 Dependency Parsing & Chunking¶

What is Dependency Parsing?¶

Dependency parsing maps the grammatical structure of a sentence by identifying relationships between words — like subject, object, modifier, etc.

Phrase chunking (aka shallow parsing) extracts useful noun phrases, verb phrases, etc., without building a full parse tree.

Example:¶

"The quick brown fox jumps over the lazy dog"
→ fox (subject of jumps)
→ over the lazy dog (prepositional modifier)

These structures help with tasks like question answering, relation extraction, and knowledge graph construction.

🕸️ Dependency Parsing¶

What is Dependency Parsing?¶

Each word is linked to another as either a head or dependent, forming a tree of grammatical relationships.

spaCy returns each token together with its POS and its dependency tag (like nsubj, dobj, prep, etc.).

Example:¶

"Amazon ships products quickly"
→ Amazon → nsubj (subject of "ships")
→ products → dobj (object of "ships")

In [58]:
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    
    doc = nlp(df['text_lemmatized'].iloc[0])
    print("Dependency Parsing Output:\n")
    for token in doc:
        print(f"{token.text:15} → {token.dep_:10} → head: {token.head.text}")
except (ImportError, OSError):  # spaCy not installed or model missing
    print("spaCy model not found or not available. Dependency parsing skipped.")
spaCy model not found or not available. Dependency parsing skipped.
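The fallback fired because the en_core_web_sm model isn't installed; it can usually be added with python -m spacy download en_core_web_sm. Once the model is available, a minimal sketch on the example sentence from above (expected roughly: Amazon → nsubj, products → dobj):

In [ ]:
# !python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Amazon ships products quickly")

for token in doc:
    print(f"{token.text:10} → {token.dep_:8} → head: {token.head.text}")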

🧱 Phrase Chunking (Noun Phrases etc.)¶

What is Chunking?¶

Chunking extracts phrases from text — like noun phrases ("the big brown dog"), without deep parsing.

Useful for:

  • Keyword extraction
  • Headline generation
  • Phrase-level sentiment

spaCy provides .noun_chunks to extract base noun phrases.

Example:¶

"The big brown fox jumps"
→ ["The big brown fox"]

In [59]:
try:
    doc = nlp(df['text_lemmatized'].iloc[0])
    print("Noun Phrases:")
    for chunk in doc.noun_chunks:
        print("-", chunk.text)
except (NameError, OSError):  # nlp not defined (spaCy unavailable) or model missing
    print("spaCy model not found or not available. Chunking skipped.")
spaCy model not found or not available. Chunking skipped.
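When the spaCy model isn't available, NLTK's RegexpParser offers a lightweight alternative: it chunks on POS-tag patterns rather than a full parse. A hedged sketch that pulls base noun phrases (optional determiner, any adjectives, one or more nouns) from the POS tags computed earlier:

In [ ]:
from nltk import RegexpParser

# Grammar: optional determiner, any number of adjectives, then one or more nouns
np_parser = RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")

chunk_tree = np_parser.parse(df['pos_tags'].iloc[0])

print("Noun Phrases:")
for subtree in chunk_tree.subtrees(filter=lambda t: t.label() == "NP"):
    print("-", " ".join(word for word, tag in subtree.leaves()))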

Back to the top