🔢 Label Encoding
Label encoding assigns each category a unique integer. scikit-learn's `LabelEncoder` assigns codes in alphabetical order, so the numbers carry no meaningful rank.
- If the categories have a true order, prefer ordinal encoding with an explicit mapping (covered below)
- Can mislead linear models, which read the arbitrary integers as magnitudes
- Useful as a quick baseline for tree-based models
In [2]:
# Dummy dataset
import pandas as pd
df = pd.DataFrame({
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small'],
    'Color': ['Red', 'Blue', 'Green', 'Blue', 'Red'],
    'City': ['NY', 'LA', 'SF', 'NY', 'SF'],
    'Target': [0, 1, 0, 1, 0]
})
df
Out[2]:
|   | Size   | Color | City | Target |
|---|--------|-------|------|--------|
| 0 | Small  | Red   | NY   | 0 |
| 1 | Medium | Blue  | LA   | 1 |
| 2 | Large  | Green | SF   | 0 |
| 3 | Medium | Blue  | NY   | 1 |
| 4 | Small  | Red   | SF   | 0 |
In [ ]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns codes alphabetically: Large=0, Medium=1, Small=2
le = LabelEncoder()
df['Size_LabelEncoded'] = le.fit_transform(df['Size'])
df[['Size', 'Size_LabelEncoded']].sort_values(by='Size')
Out[ ]:
|   | Size   | Size_LabelEncoded |
|---|--------|-------------------|
| 2 | Large  | 0 |
| 1 | Medium | 1 |
| 3 | Medium | 1 |
| 0 | Small  | 2 |
| 4 | Small  | 2 |
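As a quick sanity check (our addition, using standard `LabelEncoder` attributes), the fitted encoder can map the integer codes back to the original labels:
In [ ]:
# The fitted encoder stores its classes in alphabetical order
print(le.classes_)                      # ['Large' 'Medium' 'Small']
print(le.inverse_transform([0, 1, 2]))  # ['Large' 'Medium' 'Small']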
🟦 One-Hot Encoding
One-hot encoding creates a binary column for each category, indicating its presence.
- Ideal for nominal variables with low cardinality
- Can cause the curse of dimensionality with high-cardinality features
- Works well with linear and tree-based models
In [25]:
df_onehot = pd.get_dummies(df, columns=['Color'], prefix='Color')
df_onehot
Out[25]:
|   | Size | City | Target | Size_LabelEncoded | Size_OrdinalEncoded | City_CountEncoded | City_TargetEncoded | Color_Blue | Color_Green | Color_Red |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Small | NY | 0 | 2 | 1 | 2 | 0.5 | 0 | 0 | 1 |
| 1 | Medium | LA | 1 | 1 | 2 | 1 | 1.0 | 1 | 0 | 0 |
| 2 | Large | SF | 0 | 0 | 3 | 2 | 0.0 | 0 | 1 | 0 |
| 3 | Medium | NY | 1 | 1 | 2 | 2 | 0.5 | 1 | 0 | 0 |
| 4 | Small | SF | 0 | 2 | 1 | 2 | 0.0 | 0 | 0 | 1 |
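`pd.get_dummies` only knows the categories present in the frame it is given, so a new category at inference time silently produces mismatched columns. A minimal sketch of the scikit-learn alternative, which is fitted once and reused (`handle_unknown='ignore'` turns unseen categories into all-zero rows; `sparse_output` requires scikit-learn >= 1.2, older versions use `sparse=False`):
In [ ]:
from sklearn.preprocessing import OneHotEncoder

# Fit on training data; unseen categories at transform time become all-zero rows
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
color_onehot = ohe.fit_transform(df[['Color']])
pd.DataFrame(color_onehot, columns=ohe.get_feature_names_out())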
Dummy encoding is a variant of one-hot encoding where one category is dropped to serve as the baseline (reference level).
- Prevents perfect multicollinearity (the dummy variable trap) in linear models
- Creates only k-1 columns for k categories
- The drop is arbitrary (pandas drops the alphabetically first category) unless explicitly defined; see the sketch after the output below
In [21]:
# Dummy encoding: One-hot encoding with drop_first=True
df_dummy = pd.get_dummies(df, columns=['Color'], prefix='Color', drop_first=True)
df_dummy
Out[21]:
|   | Size | City | Target | Size_LabelEncoded | Size_OrdinalEncoded | City_CountEncoded | City_TargetEncoded | Color_Green | Color_Red |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Small | NY | 0 | 2 | 1 | 2 | 0.5 | 0 | 1 |
| 1 | Medium | LA | 1 | 1 | 2 | 1 | 1.0 | 0 | 0 |
| 2 | Large | SF | 0 | 0 | 3 | 2 | 0.0 | 1 | 0 |
| 3 | Medium | NY | 1 | 1 | 2 | 2 | 0.5 | 0 | 0 |
| 4 | Small | SF | 0 | 2 | 1 | 2 | 0.0 | 0 | 1 |
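If the reference level matters for interpretation (regression coefficients are then read as contrasts against it), scikit-learn's `OneHotEncoder` lets you name the dropped category instead of relying on alphabetical order. A minimal sketch, assuming `'Red'` is the desired baseline:
In [ ]:
from sklearn.preprocessing import OneHotEncoder

# drop takes one category per encoded column; 'Red' becomes the reference level
ohe_ref = OneHotEncoder(drop=['Red'], sparse_output=False)
encoded = ohe_ref.fit_transform(df[['Color']])
pd.DataFrame(encoded, columns=ohe_ref.get_feature_names_out())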
🧮 Ordinal Encoding
Ordinal encoding maps categories to integers based on their meaningful rank.
- Use when categories have a clear order (e.g., small < medium < large)
- Manual ordering is critical to avoid misleading signals
- Works well with models that can leverage numeric relationships
In [ ]:
# Manually map Size: Small < Medium < Large
size_mapping = {'Small': 1, 'Medium': 2, 'Large': 3}
df['Size_OrdinalEncoded'] = df['Size'].map(size_mapping)
df[['Size', 'Size_OrdinalEncoded']]
Out[ ]:
|   | Size   | Size_OrdinalEncoded |
|---|--------|---------------------|
| 0 | Small  | 1 |
| 1 | Medium | 2 |
| 2 | Large  | 3 |
| 3 | Medium | 2 |
| 4 | Small  | 1 |
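The same mapping is available through scikit-learn for use inside pipelines. A minimal sketch; note that `OrdinalEncoder` needs the order passed explicitly via `categories` (the default is alphabetical, which would recreate the label-encoding problem) and that it starts codes at 0 rather than 1:
In [ ]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit order Small < Medium < Large -> codes 0, 1, 2
oe = OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])
oe.fit_transform(df[['Size']])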
📊 Frequency / Count Encoding
Replaces each category with its count (or relative frequency) in the dataset.
- Simple and efficient for high-cardinality features
- Works well with tree-based models
- Distinct categories with equal counts collapse to the same value, and linear models read the counts as magnitudes, which can bias them
In [ ]:
# Map each city to its row count
count_map = df['City'].value_counts().to_dict()
df['City_CountEncoded'] = df['City'].map(count_map)
df[['City', 'City_CountEncoded']]
Out[ ]:
|   | City | City_CountEncoded |
|---|------|-------------------|
| 0 | NY   | 2 |
| 1 | LA   | 1 |
| 2 | SF   | 2 |
| 3 | NY   | 2 |
| 4 | SF   | 2 |
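The frequency (proportion) variant of the same idea is a one-argument change: `value_counts(normalize=True)` returns shares instead of raw counts, which keeps the scale comparable across datasets of different sizes (the `City_FreqEncoded` column name is our own):
In [ ]:
# Relative frequency instead of raw count
freq_map = df['City'].value_counts(normalize=True).to_dict()
df['City_FreqEncoded'] = df['City'].map(freq_map)
df[['City', 'City_FreqEncoded']]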
🎯 Target Encoding
Each category is replaced with the mean of the target variable for that category.
- Effective for high-cardinality categorical features
- Prone to target leakage: the naive version lets each row see its own label
- Use smoothing or out-of-fold (cross-validated) encoding for safe use; see the sketch after the output below
In [8]:
# Mean target for each City
target_mean = df.groupby('City')['Target'].mean().to_dict()
df['City_TargetEncoded'] = df['City'].map(target_mean)
df[['City', 'City_TargetEncoded']]
Out[8]:
|   | City | City_TargetEncoded |
|---|------|--------------------|
| 0 | NY   | 0.5 |
| 1 | LA   | 1.0 |
| 2 | SF   | 0.0 |
| 3 | NY   | 0.5 |
| 4 | SF   | 0.0 |
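The version above computes each category's mean over the full dataset, so every row's encoding includes its own target value. A minimal sketch of smoothed target encoding, where `m` is a hypothetical smoothing weight that pulls rare categories toward the global mean; for real use you would additionally compute the encoding out-of-fold (e.g. with `sklearn.model_selection.KFold`):
In [ ]:
# Blend each category mean with the global mean, weighted by category size
m = 2  # hypothetical smoothing weight; larger m pulls harder toward the global mean
global_mean = df['Target'].mean()
stats = df.groupby('City')['Target'].agg(['mean', 'count'])
smoothed = (stats['count'] * stats['mean'] + m * global_mean) / (stats['count'] + m)
df['City_TargetSmoothed'] = df['City'].map(smoothed)
df[['City', 'City_TargetEncoded', 'City_TargetSmoothed']]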
#️⃣ Binary Encoding
Encodes categories as binary numbers, then splits the digits into columns.
- Reduces dimensionality vs. one-hot
- Good for medium-cardinality data
- Less interpretable than one-hot or label encoding
In [18]:
def binary_encode(series):
    # Map categories to integer codes (alphabetical order), then write each code in binary
    categories = series.astype('category').cat.codes
    max_len = int(categories.max()).bit_length()  # bits needed for the largest code
    binary_cols = categories.apply(lambda x: list(map(int, bin(int(x))[2:].zfill(max_len))))
    return pd.DataFrame(binary_cols.tolist(), columns=[f"{series.name}_bin_{i}" for i in range(max_len)])
df_binary = binary_encode(df['City'])
df_binary.head()
Out[18]:
|   | City_bin_0 | City_bin_1 |
|---|------------|------------|
| 0 | 0 | 1 |
| 1 | 0 | 0 |
| 2 | 1 | 0 |
| 3 | 0 | 1 |
| 4 | 1 | 0 |
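A quick usage note (our addition): the encoded block concatenates straight back onto the original frame, and the column count grows only as ceil(log2(k)) for k categories, which is the saving over one-hot:
In [ ]:
# 3 cities -> codes 0..2 -> 2 binary columns (vs. 3 one-hot columns)
pd.concat([df[['City']], df_binary], axis=1)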
💠 Hashing Encoding
Applies a hash function to map categories to a fixed number of columns.
- Useful for extremely high-cardinality data
- Prone to hash collisions, which can blur the signal
- Non-invertible: the original categories can't be recovered
In [26]:
import hashlib

def hash_encode(series, n_components=4):
    # Hash each category string and keep its last n_components bits as 0/1 columns
    def hash_string(val):
        h = int(hashlib.md5(val.encode()).hexdigest(), 16)
        return [int(b) for b in bin(h)[2:].zfill(n_components)[-n_components:]]
    hashed = series.astype(str).apply(hash_string)
    return pd.DataFrame(hashed.tolist(), columns=[f"{series.name}_hash_{i}" for i in range(n_components)])
hash_encode(df['City'], n_components=4)
Out[26]:
|   | City_hash_0 | City_hash_1 | City_hash_2 | City_hash_3 |
|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 1 |
| 2 | 1 | 1 | 0 | 1 |
| 3 | 1 | 0 | 0 | 0 |
| 4 | 1 | 1 | 0 | 1 |
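scikit-learn ships a production version of this idea. A minimal sketch using `FeatureHasher`, which applies signed hashing (entries can be -1, 0, or 1, so collisions partially cancel rather than stack); `n_features=4` only mirrors the toy example above, and real uses take much larger values (the `City_fh_{i}` column names are our own):
In [ ]:
from sklearn.feature_extraction import FeatureHasher

# input_type='string': each sample is a list of raw string features
hasher = FeatureHasher(n_features=4, input_type='string')
hashed = hasher.transform([[city] for city in df['City']])
pd.DataFrame(hashed.toarray(), columns=[f'City_fh_{i}' for i in range(4)])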