🗂️ Data Analysis¶

  • 🧮 Numpy

    • 📥 Creating Arrays
    • 🔍 Inspect Array
    • 🧾 Aggregate Functions
    • 🔄 Reshape
    • ✂️ Slice and Dice
    • 🛠️ Array Manipulation
    • ➕ Concatenate
  • 🐼 Pandas

    • 📥 Create a Series/DataFrame
    • 📊 Sort
    • 🔢 Index – by Row/Column Position
    • 🏷️ Index – by Row/Column Label
    • 🧭 Index – by Label/Position
    • ✅ Index – by Boolean Mask
    • 🧩 Access Columns
    • 🧮 Functions on DataFrame
    • 📊 Aggregate DataFrame
    • 🧬 Grouped Mutate
    • 🔄 Reshape

🧮 Numpy¶

📥 Creating Arrays¶

In [ ]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [ ]:
import numpy as np
np.array([1,2,3])
np.array(range(100))

a = np.array([1,2,3])
type(a) # numpy.ndarray
a.dtype
In [ ]:
b = np.array([(1.5,2,3), (4,5,6)]) # , dtype = float
type(b) # numpy.ndarray
b.dtype # float64

b2 = np.array([(1.5,2,3), (4,5,6)], dtype = float) # 
type(b2) # numpy.ndarray
b2.dtype # float64

c = np.array([[(1.5,2,3), (4,5,6)],[(3,2,1), (4,5,6)]], dtype = float)
type(c) # numpy.ndarray
c.dtype # float64
In [ ]:
np.zeros((3,4)) #Create an array of zeros

np.ones((3,4)) 
np.ones((2,3,4),dtype=np.int16) #Create an array of ones

np.arange(10,26,5)
d = np.arange(10,25,5) #Create an array of evenly spaced values (step value)
print(d)
In [ ]:
np.linspace(0,2,9) #Create an array of evenly spaced values (number of samples)

np.full((2),7) # tuple for dimensions
e = np.full((2,2),7) #Create a constant array
print(e)

f = np.eye(2) #Create a 2X2 identity matrix
f

np.random.random((2,2)) #Create an array with random values
# np.random.random((2,2), dtype = int) # dtype not allowed?

np.empty((3,2)) #Create an empty array (garbage numbers assigned)
In [ ]:
#### I/O
# np.save('my_array' , a)
# np.savez('array.npz', a, b)
# np.load('my_array.npy')

# np.info(np.ndarray.dtype)

🔍 Inspect Array¶

In [ ]:
a.shape #Array dimensions
len(a) #Length of array
b.ndim #Number of array dimensions
b.size # 3 x 2  = 6
e.size #Number of array elements
b.dtype #Data type of array elements
b.dtype.name #Name of data type
b.astype(int) #Convert an array to a different type
In [ ]:
a==b
a[1] # returns 2. type = numpy.int32 not np.ndarray
a[a>1] # retuns array [2,3]

b_temp = np.array([(1,2,3),(1,2,4) ])
b_temp 
b_temp  == a
a == b_temp
np.array_equal(a,b) # False
np.array_equal(a,a) # True

🧾 Aggregate Functions¶

In [ ]:
a
a.sum()
a.max()

b
b.max(axis = 0)
b.max(axis = 1)
In [ ]:
h = a.view()
a_copy = np.copy(a)
del(h)
h = a.copy()
h
In [ ]:
a = np.array([2,3,1])
a.sort() # sorts in place

a = np.array([2,3,1])
sorted(a) # does not solve in place, returns a copy
In [ ]:
b_copy = np.random.random((2,3))
b_copy 
b_copy.sort() # sorts each row left to right
b_copy 
In [ ]:
b_copy = np.random.random((2,3))
b_copy 
# sorted(b_copy)
b_copy.sort(axis=0) # sorts each row left to right
b_copy 

🔄 Reshape¶

In [ ]:
# flatten
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]])
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).flatten()
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).flatten(order = "F")

np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).reshape([1,9])
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).reshape((1,9))

✂️ Slice and Dice¶

In [ ]:
a = np.array([1,2,3,4,5,6,7])
a[2]
a[a>1]
a
a[0:2]
a[:4]
a[::-1]
a[5:0:-1]
# a.rev

b
b[1,2]
b[b<3]
b[:,2] # same as b[...,2]

🛠️ Array Manipulation¶

In [ ]:
b
b.T
np.transpose(b)

a = np.array([1,2,3])
a
b
g = a-b

b.ravel()
g.reshape(3,2)
g.reshape(3,-2)
In [ ]:
h = a.copy()
# h.resize((2,6))
h.resize((6,2))
h

h
g
np.append(h, g)

a = np.array([1,2,3])
a = np.insert(a,1,5) # returns a copy, not in place
a
a = np.delete(a, 1)
a

➕ Concatenate¶

In [ ]:
a
d
np.concatenate((a,d),axis=0)
np.concatenate((a,d),axis=1)

a
b
np.vstack((a,b))

np.vstack((a,a)) # stack them vertically, one below the other
np.c_[a,a] # stack them as columns

np.hstack((a,a)) # stack them hori, one besides the other
np.r_[a,a] # ?
In [ ]:
a
d
np.concatenate((a,d),axis=0)

a
b
np.vstack((a,b))

e
f
np.r_[e,f]
np.vstack((e,f))

np.hstack((e,f))
In [ ]:
a
d
np.column_stack((a,d)) # same as np.c_[a,d]

a
np.hsplit(a,3)

c
np.hsplit(c,2)

Back to the top


🐼 Pandas¶

📥 Create a Series/DataFrame¶

In [ ]:
import pandas as pd

pd.Series([3,-5,7,4])
pd.Series([3,-5,7,4], index = [11,12,13,14])
s = pd.Series([3,-5,7,4], index = ["a","b","c","d"])

s.drop(["a"]) # returns a value not inplace. Inplace = F by default

df = {
      "country" : ["belgium","india","brazil"],
      "capital" : ["brussles","delhi","brasilia"],
      "population":[10,30,20]      
      }
df = pd.DataFrame(df)
df

df.drop(1)
df.drop([1,2])

df.drop("capital", axis=1)
df.drop(["capital","country"], axis=1)

📊 Sort¶

In [ ]:
mtcars = pd.read_csv("data\\mtcars_rownames.csv", index_col = "Unnamed: 0")
mtcars = pd.read_excel("data\\mtcars_rownames.xlsx", index_col = "Unnamed: 0")

df
df.sort_values(by="country")
df.sort_values(["country"])
              
mtcars.sort_values(["cyl","vs","am","drat"])
mtcars.sort_index()

mtcars = pd.read_csv("data\\mtcars_rownames.csv", index_col = "Unnamed: 0").head(5)
iris = pd.read_csv("data\\iris_rownames.csv", index_col = "Unnamed: 0").head(5)

🔢 Index – by Row/Column Position¶

In [ ]:
# give integer row positions and column positions only
mtcars.iloc[1,1] # int
mtcars.iloc[1,:] # series, regardless of datatypes inside the series, colon optional
# iris.iloc[:1] # first row, not recommended due to complexity of syntax
iris.iloc[[1],:] # DataFrame, and not a series since the inputs is list and not vector

mtcars.iloc[:,1] # series, regardless of datatypes inside the series, colon mandatory

mtcars.iloc[[1,2] ,[1,2]] # DataFrame
mtcars.iloc[: ,[1,2]] # DataFrame

mtcars.iat[1,1] # int, iat needs both x and y position, not recommended to use
# mtcars.iat[1,] # do not work
# mtcars.iat[:,1] # do not work
# mtcars.iat[[1,2] ,[1,2]] # do not work
# mtcars.iat[: ,[1,2]] # do not work

🏷️ Index – by Row/Column Label¶

In [ ]:
# Input row names and column names only
mtcars.loc["Datsun 710","cyl"]
mtcars.loc["Datsun 710",] # series, colon optional
mtcars.loc["Datsun 710",:] # series, colon optional
# mtcars.loc[,"cyl"] # series, colon mandatory
mtcars.loc[:,"cyl"] # series, colon mandatory
# mtcars.loc[4,"cyl"] # failed, no rowname called 4
# mtcars.loc["Datsun 710","2"] # failed, no colname called 2

🧭 Index – by Label/Position¶

In [ ]:
# stopped working in newer version

✅ Index – by Boolean Mask¶

In [ ]:
mtcars.cyl # dtype int, type series 
type(mtcars.cyl)
(mtcars.cyl).dtypes

mtcars.cyl > 4 # dtype boolean, type series
type(mtcars.cyl > 4)
(mtcars.cyl > 4).dtypes

mtcars[mtcars.cyl > 4]
# mtcars[some_boolean_series]

s>3 #type series, dtype boolean
s[s>3] #type series, dtype int
s[(s>3)]
s[~(s>3)]
# s[!(s>3)] # does not work, negation happens using tilde ~
s[(s>3) | (s < -3) ]

🧩 Access Columns¶

In [ ]:
mtcars.hp
mtcars["hp"]

# s["b"]
# df[1]
# mtcars[1]

🧮 Functions on DataFrame¶

In [ ]:
mtcars.shape
mtcars.index # type = pandas.core.indexes.base.Index
mtcars.columns # # type = pandas.core.indexes.base.Index
mtcars.info()
mtcars.dtypes # a series
mtcars.count() # didnt understand

df.sum() # sum by each colunm, a series
mtcars.sum()
mtcars.cumsum() # a dataframe
mtcars.min() # a series
mtcars.idxmin() # rowlabel corresponding to lowest value in each column
mtcars.describe() # summary statistics
mtcars.mean() # mean of each colunm, a series

Apply function

In [ ]:
f = lambda x: x + 1000
mtcars.apply(f)

f_mean = lambda x: x.mean()
mtcars.apply(f_mean) # mean of each column

f = lambda x: x + 1000
mtcars.applymap(f)

📊 Aggregate DataFrame¶

In [ ]:
import pandas as pd
df = pd.read_csv(r"C:\Users\Ashrith Reddy\My Drive\02_learning\python\01_datatypes\mtcars.csv")

# aggregate entire dataset
f_mean = lambda x: x.mean()
df.apply(f_mean) # mean of each column, returns a series

df.groupby('am').apply(f_mean) # # mean of each column, for each group

df_grouped = df.groupby('am')
df_grouped
type(df)
type(df_grouped)
df_grouped.obj # a pandas dataframe
In [ ]:
# all attributes of grouped dataframe
df_grouped.__dict__.keys() # same as vars(df_grouped)

f_sum = lambda x: x.mean()
temp = df_grouped.apply(f_sum)
temp # wierd behavior: aggregate the grouping column
In [ ]:
df.groupby(['vs','am']).agg(sum).reset_index()
df.groupby(['vs','am']).agg("sum").reset_index() # quotes work for built-in functions

df.groupby(['vs','am']).agg(sum,max).reset_index() # fails
df.groupby(['vs','am']).agg(["sum","max"]).reset_index()
In [ ]:
my_sum = lambda x: x.sum()
# df.groupby(['vs','am']).agg("my_sum").reset_index() # quotes fail for UDF
df.groupby(['vs','am']).agg(my_sum).reset_index() # no quotes preferred
df.groupby(['vs','am']).agg(lambda x: x.sum()).reset_index()
In [ ]:
def my_sum_2(x):
    return x.sum()
df.groupby(['vs','am']).agg(my_sum_2).reset_index()
In [ ]:
# single groupby variable, single aggregating column
df['disp'].groupby('vs') # wont work
df['disp'].groupby(df.vs) # works
df['disp'].groupby(df['vs']) # works
df['disp'].groupby(df.vs).mean() 
In [ ]:
# multiple groupby variable, single aggregating column
df['disp'].groupby(df.vs, df.am) # fails
df['disp'].groupby([df.vs, df.am]) # works
df['disp'].groupby([df.vs, df.am]).mean().reset_index()
In [ ]:
# multiple groupby variable, multiple aggregating column
df[['disp','hp']].groupby([df.vs, df.am]).mean().reset_index()

# extras
df.groupby(['vs', 'am']).size().reset_index()

for group, group_data in df.groupby(['vs','am']):
    print("###############")
    print(group)
    print(group_data)

# interchangable order of writing
df[['disp','hp']].groupby([df.vs, df.am]).mean() # is same as 
df.groupby([df.vs, df.am])[['disp','hp']].mean() # is same as 
df.groupby(["vs", "am"])[['disp','hp']].mean() # preferred
In [ ]:
# multiple aggregating functions
df.groupby(["vs", "am"])[['disp','hp']].agg(max) # max applied on disp and hp
df.groupby(["vs", "am"])[['disp','hp']].agg([max,min]) # max applied on disp and hp, and min applied on disp and hp

df.groupby(["vs", "am"])[['disp','hp']].agg({'disp':'max', 'hp':'min'}) # works
df.groupby(["vs", "am"])[['disp','hp']].agg({'disp':max, 'hp':min}) # works
df.groupby(["vs", "am"]).agg({'disp':max, 'hp':min}) # works
In [ ]:
# multiple aggregating functions - different functions for each aggregating column
df.groupby(["vs", "am"]).agg({
    'disp':max, 
    'hp':min,
    'gear':lambda x: x.max() - x.min()
    })

df.groupby(["vs", "am"]).agg({
    'disp':[max, min, lambda x: x.max() - x.min()], 
    'hp':min,    
    })

df.groupby(["vs", "am"]).agg({
    'disp':lambda x: x.max() - x.min(), 
    'hp':lambda x: x.min() - x.max(),    
    })
In [ ]:
# rename columns after aggregating
temp = df.groupby(["vs", "am"]).agg({
    'disp':[max, min, lambda x: x.max() - x.min()], 
    'hp':min,    
    })
print(temp)
temp.columns = ['_'.join(x) for x in temp.columns]
print(temp.reset_index())
In [ ]:
# name columns while aggregating :)
df.groupby(["vs", "am"]).agg(
    min_disp=('disp','min'),
    max_disp=('disp','max'),
    max_hp=('hp','max'),
    min_hp=('hp',min),
    range_hp=('hp',lambda x: x.max()-x.min()),  # interestingly, a comma is allowed after last aggegation
    )
In [ ]:
# additional features
df.groupby(["vs", "am"], as_index = False).agg(
    min_disp=('disp','min'),
    )# avoid having to reset_index()

df.groupby(["vs", "am"]).agg(
    min_disp=('disp','min'),
    )# avoid having to reset_index()

🧬 Grouped Mutate¶

In [ ]:
# df.groupby("cyl").apply(max(hp))
df.groupby("cyl")["hp"].transform("sum") 
df.groupby("cyl")["hp"].transform(sum)
df.groupby("cyl")["hp"].transform(lambda x: sum(x)/(1000 + 10))
df.groupby("cyl")[["hp","mpg"]].transform(lambda x: sum(x))

def temp_fun(x):
    # print("###########################")
    answer = sum(x.vs) - sum(x.am)
    return answer
# df.groupby("cyl").transform(temp_fun)

temp = df.groupby("cyl").\
    apply(temp_fun).\
    to_frame().\
    rename(columns={0:"vs_as"}).\
    reset_index()
df.merge(temp)

🔄 Reshape¶

Back to the top