from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
np.array([1,2,3])
np.array(range(100))
a = np.array([1,2,3])
type(a) # numpy.ndarray
a.dtype
b = np.array([(1.5,2,3), (4,5,6)]) # , dtype = float
type(b) # numpy.ndarray
b.dtype # float64
b2 = np.array([(1.5,2,3), (4,5,6)], dtype = float) #
type(b2) # numpy.ndarray
b2.dtype # float64
c = np.array([[(1.5,2,3), (4,5,6)],[(3,2,1), (4,5,6)]], dtype = float)
type(c) # numpy.ndarray
c.dtype # float64
np.zeros((3,4)) #Create an array of zeros
np.ones((3,4))
np.ones((2,3,4),dtype=np.int16) #Create an array of ones
np.arange(10,26,5)
d = np.arange(10,25,5) #Create an array of evenly spaced values (step value)
print(d)
np.linspace(0,2,9) #Create an array of evenly spaced values (number of samples)
np.full((2),7) # tuple for dimensions
e = np.full((2,2),7) #Create a constant array
print(e)
f = np.eye(2) #Create a 2X2 identity matrix
f
np.random.random((2,2)) #Create an array with random values
# np.random.random((2,2), dtype = int) # dtype not allowed?
np.empty((3,2)) #Create an empty array (garbage numbers assigned)
#### I/O
# np.save('my_array' , a)
# np.savez('array.npz', a, b)
# np.load('my_array.npy')
# np.info(np.ndarray.dtype)
a.shape #Array dimensions
len(a) #Length of array
b.ndim #Number of array dimensions
b.size # 3 x 2 = 6
e.size #Number of array elements
b.dtype #Data type of array elements
b.dtype.name #Name of data type
b.astype(int) #Convert an array to a different type
a==b
a[1] # returns 2. type = numpy.int32 not np.ndarray
a[a>1] # retuns array [2,3]
b_temp = np.array([(1,2,3),(1,2,4) ])
b_temp
b_temp == a
a == b_temp
np.array_equal(a,b) # False
np.array_equal(a,a) # True
a
a.sum()
a.max()
b
b.max(axis = 0)
b.max(axis = 1)
h = a.view()
a_copy = np.copy(a)
del(h)
h = a.copy()
h
a = np.array([2,3,1])
a.sort() # sorts in place
a = np.array([2,3,1])
sorted(a) # does not solve in place, returns a copy
b_copy = np.random.random((2,3))
b_copy
b_copy.sort() # sorts each row left to right
b_copy
b_copy = np.random.random((2,3))
b_copy
# sorted(b_copy)
b_copy.sort(axis=0) # sorts each row left to right
b_copy
# flatten
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]])
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).flatten()
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).flatten(order = "F")
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).reshape([1,9])
np.array([[1, 2, 3], [2, 4, 5], [1, 2, 3]]).reshape((1,9))
a = np.array([1,2,3,4,5,6,7])
a[2]
a[a>1]
a
a[0:2]
a[:4]
a[::-1]
a[5:0:-1]
# a.rev
b
b[1,2]
b[b<3]
b[:,2] # same as b[...,2]
b
b.T
np.transpose(b)
a = np.array([1,2,3])
a
b
g = a-b
b.ravel()
g.reshape(3,2)
g.reshape(3,-2)
h = a.copy()
# h.resize((2,6))
h.resize((6,2))
h
h
g
np.append(h, g)
a = np.array([1,2,3])
a = np.insert(a,1,5) # returns a copy, not in place
a
a = np.delete(a, 1)
a
a
d
np.concatenate((a,d),axis=0)
np.concatenate((a,d),axis=1)
a
b
np.vstack((a,b))
np.vstack((a,a)) # stack them vertically, one below the other
np.c_[a,a] # stack them as columns
np.hstack((a,a)) # stack them hori, one besides the other
np.r_[a,a] # ?
a
d
np.concatenate((a,d),axis=0)
a
b
np.vstack((a,b))
e
f
np.r_[e,f]
np.vstack((e,f))
np.hstack((e,f))
a
d
np.column_stack((a,d)) # same as np.c_[a,d]
a
np.hsplit(a,3)
c
np.hsplit(c,2)
import pandas as pd
pd.Series([3,-5,7,4])
pd.Series([3,-5,7,4], index = [11,12,13,14])
s = pd.Series([3,-5,7,4], index = ["a","b","c","d"])
s.drop(["a"]) # returns a value not inplace. Inplace = F by default
df = {
"country" : ["belgium","india","brazil"],
"capital" : ["brussles","delhi","brasilia"],
"population":[10,30,20]
}
df = pd.DataFrame(df)
df
df.drop(1)
df.drop([1,2])
df.drop("capital", axis=1)
df.drop(["capital","country"], axis=1)
mtcars = pd.read_csv("data\\mtcars_rownames.csv", index_col = "Unnamed: 0")
mtcars = pd.read_excel("data\\mtcars_rownames.xlsx", index_col = "Unnamed: 0")
df
df.sort_values(by="country")
df.sort_values(["country"])
mtcars.sort_values(["cyl","vs","am","drat"])
mtcars.sort_index()
mtcars = pd.read_csv("data\\mtcars_rownames.csv", index_col = "Unnamed: 0").head(5)
iris = pd.read_csv("data\\iris_rownames.csv", index_col = "Unnamed: 0").head(5)
# give integer row positions and column positions only
mtcars.iloc[1,1] # int
mtcars.iloc[1,:] # series, regardless of datatypes inside the series, colon optional
# iris.iloc[:1] # first row, not recommended due to complexity of syntax
iris.iloc[[1],:] # DataFrame, and not a series since the inputs is list and not vector
mtcars.iloc[:,1] # series, regardless of datatypes inside the series, colon mandatory
mtcars.iloc[[1,2] ,[1,2]] # DataFrame
mtcars.iloc[: ,[1,2]] # DataFrame
mtcars.iat[1,1] # int, iat needs both x and y position, not recommended to use
# mtcars.iat[1,] # do not work
# mtcars.iat[:,1] # do not work
# mtcars.iat[[1,2] ,[1,2]] # do not work
# mtcars.iat[: ,[1,2]] # do not work
# Input row names and column names only
mtcars.loc["Datsun 710","cyl"]
mtcars.loc["Datsun 710",] # series, colon optional
mtcars.loc["Datsun 710",:] # series, colon optional
# mtcars.loc[,"cyl"] # series, colon mandatory
mtcars.loc[:,"cyl"] # series, colon mandatory
# mtcars.loc[4,"cyl"] # failed, no rowname called 4
# mtcars.loc["Datsun 710","2"] # failed, no colname called 2
# stopped working in newer version
mtcars.cyl # dtype int, type series
type(mtcars.cyl)
(mtcars.cyl).dtypes
mtcars.cyl > 4 # dtype boolean, type series
type(mtcars.cyl > 4)
(mtcars.cyl > 4).dtypes
mtcars[mtcars.cyl > 4]
# mtcars[some_boolean_series]
s>3 #type series, dtype boolean
s[s>3] #type series, dtype int
s[(s>3)]
s[~(s>3)]
# s[!(s>3)] # does not work, negation happens using tilde ~
s[(s>3) | (s < -3) ]
mtcars.hp
mtcars["hp"]
# s["b"]
# df[1]
# mtcars[1]
mtcars.shape
mtcars.index # type = pandas.core.indexes.base.Index
mtcars.columns # # type = pandas.core.indexes.base.Index
mtcars.info()
mtcars.dtypes # a series
mtcars.count() # didnt understand
df.sum() # sum by each colunm, a series
mtcars.sum()
mtcars.cumsum() # a dataframe
mtcars.min() # a series
mtcars.idxmin() # rowlabel corresponding to lowest value in each column
mtcars.describe() # summary statistics
mtcars.mean() # mean of each colunm, a series
Apply
function
f = lambda x: x + 1000
mtcars.apply(f)
f_mean = lambda x: x.mean()
mtcars.apply(f_mean) # mean of each column
f = lambda x: x + 1000
mtcars.applymap(f)
import pandas as pd
df = pd.read_csv(r"C:\Users\Ashrith Reddy\My Drive\02_learning\python\01_datatypes\mtcars.csv")
# aggregate entire dataset
f_mean = lambda x: x.mean()
df.apply(f_mean) # mean of each column, returns a series
df.groupby('am').apply(f_mean) # # mean of each column, for each group
df_grouped = df.groupby('am')
df_grouped
type(df)
type(df_grouped)
df_grouped.obj # a pandas dataframe
# all attributes of grouped dataframe
df_grouped.__dict__.keys() # same as vars(df_grouped)
f_sum = lambda x: x.mean()
temp = df_grouped.apply(f_sum)
temp # wierd behavior: aggregate the grouping column
df.groupby(['vs','am']).agg(sum).reset_index()
df.groupby(['vs','am']).agg("sum").reset_index() # quotes work for built-in functions
df.groupby(['vs','am']).agg(sum,max).reset_index() # fails
df.groupby(['vs','am']).agg(["sum","max"]).reset_index()
my_sum = lambda x: x.sum()
# df.groupby(['vs','am']).agg("my_sum").reset_index() # quotes fail for UDF
df.groupby(['vs','am']).agg(my_sum).reset_index() # no quotes preferred
df.groupby(['vs','am']).agg(lambda x: x.sum()).reset_index()
def my_sum_2(x):
return x.sum()
df.groupby(['vs','am']).agg(my_sum_2).reset_index()
# single groupby variable, single aggregating column
df['disp'].groupby('vs') # wont work
df['disp'].groupby(df.vs) # works
df['disp'].groupby(df['vs']) # works
df['disp'].groupby(df.vs).mean()
# multiple groupby variable, single aggregating column
df['disp'].groupby(df.vs, df.am) # fails
df['disp'].groupby([df.vs, df.am]) # works
df['disp'].groupby([df.vs, df.am]).mean().reset_index()
# multiple groupby variable, multiple aggregating column
df[['disp','hp']].groupby([df.vs, df.am]).mean().reset_index()
# extras
df.groupby(['vs', 'am']).size().reset_index()
for group, group_data in df.groupby(['vs','am']):
print("###############")
print(group)
print(group_data)
# interchangable order of writing
df[['disp','hp']].groupby([df.vs, df.am]).mean() # is same as
df.groupby([df.vs, df.am])[['disp','hp']].mean() # is same as
df.groupby(["vs", "am"])[['disp','hp']].mean() # preferred
# multiple aggregating functions
df.groupby(["vs", "am"])[['disp','hp']].agg(max) # max applied on disp and hp
df.groupby(["vs", "am"])[['disp','hp']].agg([max,min]) # max applied on disp and hp, and min applied on disp and hp
df.groupby(["vs", "am"])[['disp','hp']].agg({'disp':'max', 'hp':'min'}) # works
df.groupby(["vs", "am"])[['disp','hp']].agg({'disp':max, 'hp':min}) # works
df.groupby(["vs", "am"]).agg({'disp':max, 'hp':min}) # works
# multiple aggregating functions - different functions for each aggregating column
df.groupby(["vs", "am"]).agg({
'disp':max,
'hp':min,
'gear':lambda x: x.max() - x.min()
})
df.groupby(["vs", "am"]).agg({
'disp':[max, min, lambda x: x.max() - x.min()],
'hp':min,
})
df.groupby(["vs", "am"]).agg({
'disp':lambda x: x.max() - x.min(),
'hp':lambda x: x.min() - x.max(),
})
# rename columns after aggregating
temp = df.groupby(["vs", "am"]).agg({
'disp':[max, min, lambda x: x.max() - x.min()],
'hp':min,
})
print(temp)
temp.columns = ['_'.join(x) for x in temp.columns]
print(temp.reset_index())
# name columns while aggregating :)
df.groupby(["vs", "am"]).agg(
min_disp=('disp','min'),
max_disp=('disp','max'),
max_hp=('hp','max'),
min_hp=('hp',min),
range_hp=('hp',lambda x: x.max()-x.min()), # interestingly, a comma is allowed after last aggegation
)
# additional features
df.groupby(["vs", "am"], as_index = False).agg(
min_disp=('disp','min'),
)# avoid having to reset_index()
df.groupby(["vs", "am"]).agg(
min_disp=('disp','min'),
)# avoid having to reset_index()
# df.groupby("cyl").apply(max(hp))
df.groupby("cyl")["hp"].transform("sum")
df.groupby("cyl")["hp"].transform(sum)
df.groupby("cyl")["hp"].transform(lambda x: sum(x)/(1000 + 10))
df.groupby("cyl")[["hp","mpg"]].transform(lambda x: sum(x))
def temp_fun(x):
# print("###########################")
answer = sum(x.vs) - sum(x.am)
return answer
# df.groupby("cyl").transform(temp_fun)
temp = df.groupby("cyl").\
apply(temp_fun).\
to_frame().\
rename(columns={0:"vs_as"}).\
reset_index()
df.merge(temp)