Series - a one-dimensional labeled array; DataFrame - a two-dimensional table of columns
import pandas as pd
import numpy as np
# prepare some sample data - convert dictionary to dataframe
# Demo frame: two categorical key columns plus two columns of
# standard-normal random data, 5 rows.
sample = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = pd.DataFrame(sample)
# display dataframe
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# get dataframe dimension
df.shape
(5, 4)
# get list of dataframe's columns
list(df.columns)
['key1', 'key2', 'data1', 'data2']
# retrieve values from columns
df[['key1', 'data2']]
| key1 | data2 | |
|---|---|---|
| 0 | a | -0.667562 |
| 1 | a | 1.687897 |
| 2 | b | -0.170244 |
| 3 | b | 0.305859 |
| 4 | a | 0.977549 |
# get statistics
df.describe()
| data1 | data2 | |
|---|---|---|
| count | 5.000000 | 5.000000 |
| mean | -1.112897 | 0.426700 |
| std | 1.040536 | 0.930377 |
| min | -2.728922 | -0.667562 |
| 25% | -1.317664 | -0.170244 |
| 50% | -0.903895 | 0.305859 |
| 75% | -0.716133 | 0.977549 |
| max | 0.102131 | 1.687897 |
# get min and max for column data1
# show the extremes of column data1
lo = df['data1'].min()
hi = df['data1'].max()
print("Min: ", lo)
print("Max: ", hi)
Min: -2.7289215312993815 Max: 0.10213133931892565
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# get unique values - categorical
list(df['key1'].unique())
['a', 'b']
list(df['key2'].unique())
['one', 'two']
# group records by key1 and assign to new dataframe
# SeriesGroupBy: data1 values bucketed by the key1 labels
grouped = df.groupby(df['key1'])['data1']
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000020344161D30>
# do statistical operation on grouped/selected data
grouped.mean()
key1 a -0.643889 b -1.816408 Name: data1, dtype: float64
grouped.min()
key1 a -1.317664 b -2.728922 Name: data1, dtype: float64
grouped.max()
key1 a 0.102131 b -0.903895 Name: data1, dtype: float64
grouped.sum()
key1 a -1.931667 b -3.632817 Name: data1, dtype: float64
grouped.count()
key1 a 3 b 2 Name: data1, dtype: int64
# group data1 by 2 columns (key1, key2)
# mean of data1 per (key1, key2) group — result carries a two-level index
group_keys = [df['key1'], df['key2']]
data1_mean = df['data1'].groupby(group_keys).mean()
data1_mean
key1 key2
a one -0.607767
two -0.716133
b one -2.728922
two -0.903895
Name: data1, dtype: float64
# group data1 and data2 by columns key1 and key2
# per-(key1, key2) means of both data columns at once
data_mean = df.groupby([df['key1'], df['key2']])[['data1', 'data2']].mean()
data_mean
| data1 | data2 | ||
|---|---|---|---|
| key1 | key2 | ||
| a | one | -0.607767 | 0.154994 |
| two | -0.716133 | 1.687897 | |
| b | one | -2.728922 | -0.170244 |
| two | -0.903895 | 0.305859 |
data_mean = df['data1']
data_mean
0 -1.317664 1 -0.716133 2 -2.728922 3 -0.903895 4 0.102131 Name: data1, dtype: float64
data_mean = df[['data1', 'data2']]
data_mean
| data1 | data2 | |
|---|---|---|
| 0 | -1.317664 | -0.667562 |
| 1 | -0.716133 | 1.687897 |
| 2 | -2.728922 | -0.170244 |
| 3 | -0.903895 | 0.305859 |
| 4 | 0.102131 | 0.977549 |
data_mean = df[['key1', 'data1', 'data2']]
data_mean
| key1 | data1 | data2 | |
|---|---|---|---|
| 0 | a | -1.317664 | -0.667562 |
| 1 | a | -0.716133 | 1.687897 |
| 2 | b | -2.728922 | -0.170244 |
| 3 | b | -0.903895 | 0.305859 |
| 4 | a | 0.102131 | 0.977549 |
data_mean = df[['key1', 'data1', 'data2']].groupby(df['key1']).count()
data_mean
| key1 | data1 | data2 | |
|---|---|---|---|
| key1 | |||
| a | 3 | 3 | 3 |
| b | 2 | 2 | 2 |
data_mean = df[['key1', 'data1', 'data2']].groupby([df['key1'], df['key2']]).count()
data_mean
| key1 | data1 | data2 | ||
|---|---|---|---|---|
| key1 | key2 | |||
| a | one | 2 | 2 | 2 |
| two | 1 | 1 | 1 | |
| b | one | 1 | 1 | 1 |
| two | 1 | 1 | 1 |
# python list
values = [1, "one", True, 23.1]
values
[1, 'one', True, 23.1]
# prepare 2 new list of values
# parallel arrays describing state/year observations, plus random values
state_names = ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
year_list = [2005, 2005, 2006, 2005, 2006]
states = np.array(state_names)
years = np.array(year_list)
values = np.random.randn(5)
# combine all lists into 1 and create dataframe
# one row per observation: (state, year, value) tuples
combined = [*zip(states, years, values)]
combined
[('Ohio', 2005, -1.019193102821183),
('California', 2005, -1.1974693608132378),
('California', 2006, 1.5979945384609038),
('Ohio', 2005, 0.5262442086258409),
('Ohio', 2006, -1.4846124376831964)]
# build the frame from the row tuples, naming the columns explicitly
col_names = ['states', 'years', 'values']
data = pd.DataFrame(combined, columns=col_names)
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# assign lists to a dictionary and convert to dataframe
# same three arrays packed into a dict keyed by column name
data_dict = dict(states=states, years=years, values=values)
data_dict
{'states': array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10'),
'years': array([2005, 2005, 2006, 2005, 2006]),
'values': array([-1.0191931 , -1.19746936, 1.59799454, 0.52624421, -1.48461244])}
data = pd.DataFrame(data_dict)
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# group by states and get total values
data['values'].groupby(data['states']).sum()
states California 0.400525 Ohio -1.977561 Name: values, dtype: float64
# group by states and years and get total values
data['values'].groupby([data['states'], data['years']]).sum()
states years
California 2005 -1.197469
2006 1.597995
Ohio 2005 -0.492949
2006 -1.484612
Name: values, dtype: float64
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# group data1 and data2 by key1
# mean of data1 and data2 within each key1 group
data_mean = df.groupby(df['key1'])[['data1', 'data2']].mean()
data_mean
| data1 | data2 | |
|---|---|---|
| key1 | ||
| a | -0.643889 | 0.665961 |
| b | -1.816408 | 0.067808 |
# iterate/loop over group - 1 key
# each iteration yields (group label, sub-DataFrame of that group's rows)
for name, grouped in df.groupby('key1') :
    print(name)
    print(grouped)
a key1 key2 data1 data2 0 a one -1.317664 -0.667562 1 a two -0.716133 1.687897 4 a one 0.102131 0.977549 b key1 key2 data1 data2 2 b one -2.728922 -0.170244 3 b two -0.903895 0.305859
# iterate over group - 2 keys
# with two grouping keys the group label is a (key1, key2) tuple
for (k1, k2), grouped in df.groupby(['key1', 'key2']) :
    print((k1, k2))
    print(grouped)
('a', 'one')
key1 key2 data1 data2
0 a one -1.317664 -0.667562
4 a one 0.102131 0.977549
('a', 'two')
key1 key2 data1 data2
1 a two -0.716133 1.687897
('b', 'one')
key1 key2 data1 data2
2 b one -2.728922 -0.170244
('b', 'two')
key1 key2 data1 data2
3 b two -0.903895 0.305859
# split the frame into a dict keyed by the key1 group label
pieces = {label: frame for label, frame in df.groupby('key1')}
type(pieces)
dict
pieces
{'a': key1 key2 data1 data2
0 a one -1.317664 -0.667562
1 a two -0.716133 1.687897
4 a one 0.102131 0.977549,
'b': key1 key2 data1 data2
2 b one -2.728922 -0.170244
3 b two -0.903895 0.305859}
pieces['b']
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# select 1 column to display
df['data1']
0 -1.317664 1 -0.716133 2 -2.728922 3 -0.903895 4 0.102131 Name: data1, dtype: float64
# display 2 columns
df[['data1', 'data2']]
| data1 | data2 | |
|---|---|---|
| 0 | -1.317664 | -0.667562 |
| 1 | -0.716133 | 1.687897 |
| 2 | -2.728922 | -0.170244 |
| 3 | -0.903895 | 0.305859 |
| 4 | 0.102131 | 0.977549 |
# display all columns
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# also display all columns
df[:]
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# display all rows, but switched columns
df.iloc[:, [1, 0, 2, 3]]
| key2 | key1 | data1 | data2 | |
|---|---|---|---|---|
| 0 | one | a | -1.317664 | -0.667562 |
| 1 | two | a | -0.716133 | 1.687897 |
| 2 | one | b | -2.728922 | -0.170244 |
| 3 | two | b | -0.903895 | 0.305859 |
| 4 | one | a | 0.102131 | 0.977549 |
df.iloc[:, [2, 3, 0, 1]]
| data1 | data2 | key1 | key2 | |
|---|---|---|---|---|
| 0 | -1.317664 | -0.667562 | a | one |
| 1 | -0.716133 | 1.687897 | a | two |
| 2 | -2.728922 | -0.170244 | b | one |
| 3 | -0.903895 | 0.305859 | b | two |
| 4 | 0.102131 | 0.977549 | a | one |
df[['data1', 'data2', 'key1', 'key2']]
| data1 | data2 | key1 | key2 | |
|---|---|---|---|---|
| 0 | -1.317664 | -0.667562 | a | one |
| 1 | -0.716133 | 1.687897 | a | two |
| 2 | -2.728922 | -0.170244 | b | one |
| 3 | -0.903895 | 0.305859 | b | two |
| 4 | 0.102131 | 0.977549 | a | one |
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
data[data['years'] == 2005]
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 3 | Ohio | 2005 | 0.526244 |
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# add new column
# attach a third column of standard-normal values in place
df['data3'] = np.random.standard_normal(5)
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
# add new row
# single-row frame (index label 5) with the same columns as df,
# data columns filled with fresh random draws
rand_vals = [np.random.randn(1)[0] for _ in range(3)]
newrow = pd.DataFrame(
    [['a', 'one', *rand_vals]],
    columns=['key1', 'key2', 'data1', 'data2', 'data3'],
    index=[5],
)
newrow
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 5 | a | one | -0.550009 | 0.967206 | -1.987207 |
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack frames row-wise.
df = pd.concat([df, newrow])
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
| 5 | a | one | -0.550009 | 0.967206 | -1.987207 |
# delete rows from dataframe
# remove the appended row by its index label
df = df.drop(index=5)
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
# check the data types of dataframe
df.dtypes
key1 object key2 object data1 float64 data2 float64 data3 float64 dtype: object
data.dtypes
states object years int32 values float64 dtype: object
# create another dataframe that contains duplicates
# sample data containing duplicates (rows 0 and 1 are identical)
noodle_data = {
    'brand': ['Maggi', 'Maggi', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5],
}
items = pd.DataFrame(noodle_data)
items
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# by default, remove duplicate rows based on all columns
# keep the first and remove the rest
items.drop_duplicates()
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# remove and keep the last
items.drop_duplicates(keep='last')
| brand | style | rating | |
|---|---|---|---|
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
items
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# remove duplicates based on selected column
items.drop_duplicates(subset=['brand'], keep='last')
| brand | style | rating | |
|---|---|---|---|
| 1 | Maggi | cup | 4.0 |
| 4 | Indomie | pack | 5.0 |
items.drop_duplicates(subset=['brand'])
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
# remove duplicates based on 2 columns
items.drop_duplicates(subset=['brand', 'style'])
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |