Series - a one-dimensional labeled array; DataFrame - a two-dimensional table of columns
import pandas as pd
import numpy as np
# prepare some sample data - convert dictionary to dataframe
# Demo frame: two categorical key columns plus two columns of
# standard-normal random data, 5 rows.
sample = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = pd.DataFrame(sample)
# display dataframe
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# get dataframe dimension
df.shape
(5, 4)
# get list of dataframe's columns
list(df.columns)
['key1', 'key2', 'data1', 'data2']
# retrieve values from columns
df[['key1', 'data2']]
| key1 | data2 | |
|---|---|---|
| 0 | a | -0.667562 |
| 1 | a | 1.687897 |
| 2 | b | -0.170244 |
| 3 | b | 0.305859 |
| 4 | a | 0.977549 |
# get statistics
df.describe()
| data1 | data2 | |
|---|---|---|
| count | 5.000000 | 5.000000 |
| mean | -1.112897 | 0.426700 |
| std | 1.040536 | 0.930377 |
| min | -2.728922 | -0.667562 |
| 25% | -1.317664 | -0.170244 |
| 50% | -0.903895 | 0.305859 |
| 75% | -0.716133 | 0.977549 |
| max | 0.102131 | 1.687897 |
# get min and max for column data1
# show the extremes of column data1
lo = df['data1'].min()
hi = df['data1'].max()
print("Min: ", lo)
print("Max: ", hi)
Min: -2.7289215312993815 Max: 0.10213133931892565
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# get unique values - categorical
list(df['key1'].unique())
['a', 'b']
list(df['key2'].unique())
['one', 'two']
# group records by key1 and assign to new dataframe
# SeriesGroupBy: data1 values bucketed by the key1 labels
grouped = df.groupby(df['key1'])['data1']
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000020344161D30>
# do statistical operation on grouped/selected data
grouped.mean()
key1 a -0.643889 b -1.816408 Name: data1, dtype: float64
grouped.min()
key1 a -1.317664 b -2.728922 Name: data1, dtype: float64
grouped.max()
key1 a 0.102131 b -0.903895 Name: data1, dtype: float64
grouped.sum()
key1 a -1.931667 b -3.632817 Name: data1, dtype: float64
grouped.count()
key1 a 3 b 2 Name: data1, dtype: int64
# group data1 by 2 columns (key1, key2)
# mean of data1 per (key1, key2) group — result carries a two-level index
group_keys = [df['key1'], df['key2']]
data1_mean = df['data1'].groupby(group_keys).mean()
data1_mean
key1 key2
a one -0.607767
two -0.716133
b one -2.728922
two -0.903895
Name: data1, dtype: float64
# group data1 and data2 by columns key1 and key2
# per-(key1, key2) means of both data columns at once
data_mean = df.groupby([df['key1'], df['key2']])[['data1', 'data2']].mean()
data_mean
| data1 | data2 | ||
|---|---|---|---|
| key1 | key2 | ||
| a | one | -0.607767 | 0.154994 |
| two | -0.716133 | 1.687897 | |
| b | one | -2.728922 | -0.170244 |
| two | -0.903895 | 0.305859 |
data_mean = df['data1']
data_mean
0 -1.317664 1 -0.716133 2 -2.728922 3 -0.903895 4 0.102131 Name: data1, dtype: float64
data_mean = df[['data1', 'data2']]
data_mean
| data1 | data2 | |
|---|---|---|
| 0 | -1.317664 | -0.667562 |
| 1 | -0.716133 | 1.687897 |
| 2 | -2.728922 | -0.170244 |
| 3 | -0.903895 | 0.305859 |
| 4 | 0.102131 | 0.977549 |
data_mean = df[['key1', 'data1', 'data2']]
data_mean
| key1 | data1 | data2 | |
|---|---|---|---|
| 0 | a | -1.317664 | -0.667562 |
| 1 | a | -0.716133 | 1.687897 |
| 2 | b | -2.728922 | -0.170244 |
| 3 | b | -0.903895 | 0.305859 |
| 4 | a | 0.102131 | 0.977549 |
data_mean = df[['key1', 'data1', 'data2']].groupby(df['key1']).count()
data_mean
| key1 | data1 | data2 | |
|---|---|---|---|
| key1 | |||
| a | 3 | 3 | 3 |
| b | 2 | 2 | 2 |
data_mean = df[['key1', 'data1', 'data2']].groupby([df['key1'], df['key2']]).count()
data_mean
| key1 | data1 | data2 | ||
|---|---|---|---|---|
| key1 | key2 | |||
| a | one | 2 | 2 | 2 |
| two | 1 | 1 | 1 | |
| b | one | 1 | 1 | 1 |
| two | 1 | 1 | 1 |
# python list
values = [1, "one", True, 23.1]
values
[1, 'one', True, 23.1]
# prepare 2 new list of values
# parallel arrays describing state/year observations, plus random values
state_names = ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
year_list = [2005, 2005, 2006, 2005, 2006]
states = np.array(state_names)
years = np.array(year_list)
values = np.random.randn(5)
# combine all lists into 1 and create dataframe
# one row per observation: (state, year, value) tuples
combined = [*zip(states, years, values)]
combined
[('Ohio', 2005, -1.019193102821183),
('California', 2005, -1.1974693608132378),
('California', 2006, 1.5979945384609038),
('Ohio', 2005, 0.5262442086258409),
('Ohio', 2006, -1.4846124376831964)]
# build the frame from the row tuples, naming the columns explicitly
col_names = ['states', 'years', 'values']
data = pd.DataFrame(combined, columns=col_names)
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# assign lists to a dictionary and convert to dataframe
# same three arrays packed into a dict keyed by column name
data_dict = dict(states=states, years=years, values=values)
data_dict
{'states': array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'], dtype='<U10'),
'years': array([2005, 2005, 2006, 2005, 2006]),
'values': array([-1.0191931 , -1.19746936, 1.59799454, 0.52624421, -1.48461244])}
data = pd.DataFrame(data_dict)
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# group by states and get total values
data['values'].groupby(data['states']).sum()
states California 0.400525 Ohio -1.977561 Name: values, dtype: float64
# group by states and years and get total values
data['values'].groupby([data['states'], data['years']]).sum()
states years
California 2005 -1.197469
2006 1.597995
Ohio 2005 -0.492949
2006 -1.484612
Name: values, dtype: float64
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
# group data1 and data2 by key1
# mean of data1 and data2 within each key1 group
data_mean = df.groupby(df['key1'])[['data1', 'data2']].mean()
data_mean
| data1 | data2 | |
|---|---|---|
| key1 | ||
| a | -0.643889 | 0.665961 |
| b | -1.816408 | 0.067808 |
# iterate/loop over group - 1 key
# each iteration yields (group label, sub-DataFrame of that group's rows)
for name, grouped in df.groupby('key1') :
    print(name)
    print(grouped)
a key1 key2 data1 data2 0 a one -1.317664 -0.667562 1 a two -0.716133 1.687897 4 a one 0.102131 0.977549 b key1 key2 data1 data2 2 b one -2.728922 -0.170244 3 b two -0.903895 0.305859
# iterate over group - 2 keys
# with two grouping keys the group label is a (key1, key2) tuple
for (k1, k2), grouped in df.groupby(['key1', 'key2']) :
    print((k1, k2))
    print(grouped)
('a', 'one')
key1 key2 data1 data2
0 a one -1.317664 -0.667562
4 a one 0.102131 0.977549
('a', 'two')
key1 key2 data1 data2
1 a two -0.716133 1.687897
('b', 'one')
key1 key2 data1 data2
2 b one -2.728922 -0.170244
('b', 'two')
key1 key2 data1 data2
3 b two -0.903895 0.305859
# split the frame into a dict keyed by the key1 group label
pieces = {label: frame for label, frame in df.groupby('key1')}
type(pieces)
dict
pieces
{'a': key1 key2 data1 data2
0 a one -1.317664 -0.667562
1 a two -0.716133 1.687897
4 a one 0.102131 0.977549,
'b': key1 key2 data1 data2
2 b one -2.728922 -0.170244
3 b two -0.903895 0.305859}
pieces['b']
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# select 1 column to display
df['data1']
0 -1.317664 1 -0.716133 2 -2.728922 3 -0.903895 4 0.102131 Name: data1, dtype: float64
# display 2 columns
df[['data1', 'data2']]
| data1 | data2 | |
|---|---|---|
| 0 | -1.317664 | -0.667562 |
| 1 | -0.716133 | 1.687897 |
| 2 | -2.728922 | -0.170244 |
| 3 | -0.903895 | 0.305859 |
| 4 | 0.102131 | 0.977549 |
# display all columns
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# also display all columns
df[:]
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# display all rows, but switched columns
df.iloc[:, [1, 0, 2, 3]]
| key2 | key1 | data1 | data2 | |
|---|---|---|---|---|
| 0 | one | a | -1.317664 | -0.667562 |
| 1 | two | a | -0.716133 | 1.687897 |
| 2 | one | b | -2.728922 | -0.170244 |
| 3 | two | b | -0.903895 | 0.305859 |
| 4 | one | a | 0.102131 | 0.977549 |
df.iloc[:, [2, 3, 0, 1]]
| data1 | data2 | key1 | key2 | |
|---|---|---|---|---|
| 0 | -1.317664 | -0.667562 | a | one |
| 1 | -0.716133 | 1.687897 | a | two |
| 2 | -2.728922 | -0.170244 | b | one |
| 3 | -0.903895 | 0.305859 | b | two |
| 4 | 0.102131 | 0.977549 | a | one |
df[['data1', 'data2', 'key1', 'key2']]
| data1 | data2 | key1 | key2 | |
|---|---|---|---|---|
| 0 | -1.317664 | -0.667562 | a | one |
| 1 | -0.716133 | 1.687897 | a | two |
| 2 | -2.728922 | -0.170244 | b | one |
| 3 | -0.903895 | 0.305859 | b | two |
| 4 | 0.102131 | 0.977549 | a | one |
data
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 2 | California | 2006 | 1.597995 |
| 3 | Ohio | 2005 | 0.526244 |
| 4 | Ohio | 2006 | -1.484612 |
data[data['years'] == 2005]
| states | years | values | |
|---|---|---|---|
| 0 | Ohio | 2005 | -1.019193 |
| 1 | California | 2005 | -1.197469 |
| 3 | Ohio | 2005 | 0.526244 |
df
| key1 | key2 | data1 | data2 | |
|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 |
| 1 | a | two | -0.716133 | 1.687897 |
| 2 | b | one | -2.728922 | -0.170244 |
| 3 | b | two | -0.903895 | 0.305859 |
| 4 | a | one | 0.102131 | 0.977549 |
# add new column
# attach a third column of standard-normal values in place
df['data3'] = np.random.standard_normal(5)
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
# add new row
# single-row frame (index label 5) with the same columns as df,
# data columns filled with fresh random draws
rand_vals = [np.random.randn(1)[0] for _ in range(3)]
newrow = pd.DataFrame(
    [['a', 'one', *rand_vals]],
    columns=['key1', 'key2', 'data1', 'data2', 'data3'],
    index=[5],
)
newrow
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 5 | a | one | -0.550009 | 0.967206 | -1.987207 |
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack frames row-wise.
df = pd.concat([df, newrow])
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
| 5 | a | one | -0.550009 | 0.967206 | -1.987207 |
# delete rows from dataframe
# remove the appended row by its index label
df = df.drop(index=5)
df
| key1 | key2 | data1 | data2 | data3 | |
|---|---|---|---|---|---|
| 0 | a | one | -1.317664 | -0.667562 | 0.237096 |
| 1 | a | two | -0.716133 | 1.687897 | -0.571505 |
| 2 | b | one | -2.728922 | -0.170244 | 1.988587 |
| 3 | b | two | -0.903895 | 0.305859 | -2.926296 |
| 4 | a | one | 0.102131 | 0.977549 | 0.807568 |
# check the data types of dataframe
df.dtypes
key1 object key2 object data1 float64 data2 float64 data3 float64 dtype: object
data.dtypes
states object years int32 values float64 dtype: object
# create another dataframe that contains duplicates
# sample data containing duplicates (rows 0 and 1 are identical)
noodle_data = {
    'brand': ['Maggi', 'Maggi', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5],
}
items = pd.DataFrame(noodle_data)
items
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# by default, remove duplicate rows based on all columns
# keep the first and remove the rest
items.drop_duplicates()
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# remove and keep the last
items.drop_duplicates(keep='last')
| brand | style | rating | |
|---|---|---|---|
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
items
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 1 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
# remove duplicates based on selected column
items.drop_duplicates(subset=['brand'], keep='last')
| brand | style | rating | |
|---|---|---|---|
| 1 | Maggi | cup | 4.0 |
| 4 | Indomie | pack | 5.0 |
items.drop_duplicates(subset=['brand'])
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
# remove duplicates based on 2 columns
items.drop_duplicates(subset=['brand', 'style'])
| brand | style | rating | |
|---|---|---|---|
| 0 | Maggi | cup | 4.0 |
| 2 | Indomie | cup | 3.5 |
| 3 | Indomie | pack | 15.0 |