Taken from Chapter 3 of
Python Data Science Handbook,
by Jake VanderPlas,
O'Reilly Media, Inc
2016
Finding and removing null values
import numpy as np
import pandas as pd
# Some data with NaN & None
data = pd.Series([1, np.nan, 'hello', None])
data
0        1
1      NaN
2    hello
3     None
dtype: object
# Finding all NaN & None locations
data.isnull()
0    False
1     True
2    False
3     True
dtype: bool
# Using boolean indexing
data[data.notnull()]
0        1
2    hello
dtype: object
# Simpler way to drop NaN & None
x = data.dropna()
x
0        1
2    hello
dtype: object
# DataFrame with NaN
df = pd.DataFrame([[6, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
df
0 | 1 | 2 | |
---|---|---|---|
0 | 6.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
# On DataFrame drop rows with NaN or None
df.dropna()
0 | 1 | 2 | |
---|---|---|---|
1 | 2.0 | 3.0 | 5 |
# Drop columns with NaN or None
df.dropna(axis='columns')
2 | |
---|---|
0 | 2 |
1 | 5 |
2 | 6 |
# Modifying the data to show the how & thresh options
df[3] = np.nan
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 6.0 | NaN | 2 | NaN |
1 | 2.0 | 3.0 | 5 | NaN |
2 | NaN | 4.0 | 6 | NaN |
# Drop columns that are all NaN; how accepts 'all' or 'any' (default)
df.dropna(axis='columns', how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 6.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
# thresh = minimum number of non-NaN values required to keep a row/column
df.dropna(axis='rows', thresh=3)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
1 | 2.0 | 3.0 | 5 | NaN |
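Not on the original slide: dropna also accepts a subset argument that limits the NaN check to particular columns. A small sketch using the same df:
# Keep a row as long as column 0 is not NaN (other columns are ignored)
df.dropna(subset=[0])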
Some operations require that indices be sorted
# Series with an unsorted MultiIndex
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
char  int
a     1      0.532697
      2      0.620827
c     1      0.435243
      2      0.142022
b     1      0.408163
      2      0.309437
dtype: float64
# Can't slice due to the indices being out of order
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)
<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'
# sort the index
data = data.sort_index()
data
char  int
a     1      0.532697
      2      0.620827
b     1      0.408163
      2      0.309437
c     1      0.435243
      2      0.142022
dtype: float64
# Now can slice
data['a':'b']
char  int
a     1      0.532697
      2      0.620827
b     1      0.408163
      2      0.309437
dtype: float64
Multi-indexes allow DataFrames to represent higher-dimensional data
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 50.0 | 38.2 | 39.0 | 37.7 | 48.0 | 37.8 |
2 | 31.0 | 35.9 | 25.0 | 36.0 | 24.0 | 37.1 | |
2014 | 1 | 23.0 | 34.4 | 48.0 | 35.5 | 29.0 | 36.1 |
2 | 31.0 | 37.2 | 42.0 | 38.3 | 37.0 | 35.8 |
health_data['Bob']
type | HR | Temp | |
---|---|---|---|
year | visit | ||
2013 | 1 | 50.0 | 38.2 |
2 | 31.0 | 35.9 | |
2014 | 1 | 23.0 | 34.4 |
2 | 31.0 | 37.2 |
health_data['Bob'].mean()
type
HR      33.750
Temp    36.425
dtype: float64
# Our data
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 50.0 | 38.2 | 39.0 | 37.7 | 48.0 | 37.8 |
2 | 31.0 | 35.9 | 25.0 | 36.0 | 24.0 | 37.1 | |
2014 | 1 | 23.0 | 34.4 | 48.0 | 35.5 | 29.0 | 36.1 |
2 | 31.0 | 37.2 | 42.0 | 38.3 | 37.0 | 35.8 |
# mean of everyone's values per year
data_mean = health_data.mean(level='year')
data_mean
subject | Bob | Guido | Sue | |||
---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp |
year | ||||||
2013 | 40.5 | 37.05 | 32.0 | 36.85 | 36.0 | 37.45 |
2014 | 27.0 | 35.80 | 45.0 | 36.90 | 33.0 | 35.95 |
# Going further: average across the 'subject' level too, keeping only 'type'
data_mean.mean(axis=1, level='type')
type | HR | Temp |
---|---|---|
year | ||
2013 | 36.166667 | 37.116667 |
2014 | 35.000000 | 36.216667 |
Combining Series is straightforward
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
A helper function used to keep the slides small
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
# example DataFrame
make_df('ABC', range(3))
A | B | C | |
---|---|---|---|
0 | A0 | B0 | C0 |
1 | A1 | B1 | C1 |
2 | A2 | B2 | C2 |
# More helper functions
def println(obj):
    print(obj)
    print("\n")

def show_concat(df1, df2):
    println(df1)
    println(df2)
    println(pd.concat([df1, df2]))
# Add rows (concatenate along the index)
AB12 = make_df('AB', [1, 2])
AB34 = make_df('AB', [3, 4])
show_concat(AB12, AB34)
    A   B
1  A1  B1
2  A2  B2


    A   B
3  A3  B3
4  A4  B4


    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
# Add columns (concatenate along axis=1)
AB01 = make_df('AB', [0, 1])
CD01 = make_df('CD', [0, 1])
println(AB01)
println(CD01)
pd.concat([AB01, CD01],axis=1)
    A   B
0  A0  B0
1  A1  B1


    C   D
0  C0  D0
1  C1  D1
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
# Concat keeps duplicate indices
AB01 = make_df('AB', [0, 1])
AB23 = make_df('AB', [2, 3])
AB23.index = AB01.index # make duplicate indices!
show_concat(AB01, AB23)
    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3


    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3
# Raise an error on duplicate indices
try:
    pd.concat([AB01, AB23], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')
# Ignore the existing row indices and reindex the result
println(AB01)
print(AB23)
pd.concat([AB01, AB23], ignore_index=True)
    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
2 | A2 | B2 |
3 | A3 | B3 |
# concat keeps all columns, filling missing entries with NaN
AB01 = make_df('AB', [0, 1])
BC34 = make_df('BC', [3, 4])
println(AB01)
println(BC34)
pd.concat([AB01, BC34])
    A   B
0  A0  B0
1  A1  B1


    B   C
3  B3  C3
4  B4  C4
A | B | C | |
---|---|---|---|
0 | A0 | B0 | NaN |
1 | A1 | B1 | NaN |
3 | NaN | B3 | C3 |
4 | NaN | B4 | C4 |
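To keep only the columns that both inputs share, concat accepts a join option. A small sketch (not on the original slide) reusing AB01 and BC34 from above:
# join='inner' keeps only the intersection of the columns (here just B)
pd.concat([AB01, BC34], join='inner')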
# Add keys to create an extra index level, making the result hierarchical
AB01 = make_df('AB', [0, 1])
BC23 = make_df('BC', [2, 3])
println(AB01)
println(BC23)
println(pd.concat([AB01, BC23], keys=['AB', 'BC']))
    A   B
0  A0  B0
1  A1  B1


    B   C
2  B2  C2
3  B3  C3


        A   B    C
AB 0   A0  B0  NaN
   1   A1  B1  NaN
BC 2  NaN  B2   C2
   3  NaN  B3   C3
Types of merge relationships:
One-to-one
One-to-many
Many-to-many
# Sample Data
group = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
hire_date = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
println(group); println(hire_date)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR


  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
# Combine two Dataframes into one
group_hire_date = pd.merge(group, hire_date)
group_hire_date
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
pd.merge(hire_date, group)
employee | hire_date | group | |
---|---|---|---|
0 | Lisa | 2004 | Engineering |
1 | Bob | 2008 | Accounting |
2 | Jake | 2012 | Engineering |
3 | Sue | 2014 | HR |
supervisor = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
println(group_hire_date); println(supervisor);
pd.merge(group_hire_date, supervisor)
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
employee | group | hire_date | supervisor | |
---|---|---|---|---|
0 | Bob | Accounting | 2008 | Carly |
1 | Jake | Engineering | 2012 | Guido |
2 | Lisa | Engineering | 2004 | Guido |
3 | Sue | HR | 2014 | Steve |
skills = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
skills
group | skills | |
---|---|---|
0 | Accounting | math |
1 | Accounting | spreadsheets |
2 | Engineering | coding |
3 | Engineering | linux |
4 | HR | spreadsheets |
5 | HR | organization |
println(group); println(skills)
pd.merge(group, skills)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR


         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
employee | group | skills | |
---|---|---|---|
0 | Bob | Accounting | math |
1 | Bob | Accounting | spreadsheets |
2 | Jake | Engineering | coding |
3 | Jake | Engineering | linux |
4 | Lisa | Engineering | coding |
5 | Lisa | Engineering | linux |
6 | Sue | HR | spreadsheets |
7 | Sue | HR | organization |
Specify which columns or rows to merge on
How to handle the case when column names are not the same in the two DataFrames
group
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
salary = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
salary
name | salary | |
---|---|---|
0 | Bob | 70000 |
1 | Jake | 80000 |
2 | Lisa | 120000 |
3 | Sue | 90000 |
Column names do not match
# Specify which columns to merge on
# Can provide a list of columns
pd.merge(group, salary, left_on="employee", right_on="name")
employee | group | name | salary | |
---|---|---|---|---|
0 | Bob | Accounting | Bob | 70000 |
1 | Jake | Engineering | Jake | 80000 |
2 | Lisa | Engineering | Lisa | 120000 |
3 | Sue | HR | Sue | 90000 |
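The merged result keeps both key columns; a common follow-up is to drop the redundant one (a small sketch):
pd.merge(group, salary, left_on="employee", right_on="name").drop('name', axis=1)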
What if multiple column names are the same?
How to specify which column to merge on?
# Use on to specify column(s) to merge on
pd.merge(group, hire_date, on='employee')
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
First some data
# Make a dataframe
def make_int_df(cols, ind, increment):
    """Quickly make a DataFrame"""
    data = {c: [i + increment for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
abc = make_int_df('ABC', [1, 2],0)
abc
A | B | C | |
---|---|---|---|
1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 |
# Note the columns do not fully match
bcd = make_int_df('BCD', [1, 2],0)
bcd
B | C | D | |
---|---|---|---|
1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 |
# Merge on the shared column 'B'; the other shared column C gets _x/_y suffixes
pd.merge(abc, bcd, on='B')
A | B | C_x | C_y | D | |
---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 |
1 | 2 | 2 | 2 | 2 | 2 |
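The _x/_y suffixes on the overlapping column C can be customized with the suffixes keyword. A small sketch, not on the original slide:
pd.merge(abc, bcd, on='B', suffixes=('_left', '_right'))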
# Make one of the columns the index
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
group = df1.set_index('employee')
group
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
hire_date = df2.set_index('employee')
hire_date
hire_date | |
---|---|
employee | |
Lisa | 2004 |
Bob | 2008 |
Jake | 2012 |
Sue | 2014 |
# Now merge
pd.merge(group, hire_date, left_index=True, right_index=True)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
# join merges on the indices by default
# Note the switch from function to method
group.join(hire_date)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
Inner join - only the data in common
Outer join - all the data
Left join - all rows from the left table, plus matching data from the right
Right join - all rows from the right table, plus matching data from the left
By changing the order of the DataFrames, a left join becomes a right join
# Some Data
food = pd.DataFrame(
{'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
food
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
drink = pd.DataFrame(
{'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
drink
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
Inner Join - on Mary
Outer Join - all
pd.merge(drink, food)
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
# how='inner' is the default
pd.merge(drink,food, how='inner')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
pd.merge(drink,food, how='outer')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
1 | Joseph | beer | NaN |
2 | Peter | NaN | fish |
3 | Paul | NaN | beans |
# Order of dataframes changes order of rows & columns
pd.merge(food,drink, how='outer')
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
3 | Joseph | NaN | beer |
# Keep all rows from food (the left table)
println(food)
pd.merge(food,drink, how='left')
    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
# Keep all rows from drink (the right table)
pd.merge(food,drink, how='right')
name | food | drink | |
---|---|---|---|
0 | Mary | bread | wine |
1 | Joseph | NaN | beer |
# Note relationship between left & right
pd.merge(drink,food, how='left')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
1 | Joseph | beer | NaN |
Pandas does most of its work in compiled C code
But at times a computation cannot be expressed as a single call into C
So pandas makes multiple calls into C and stores the intermediate results
Allocating memory for those intermediates slows down the computation
# Vector addition is done in C
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(1_000_000)
y = rng.rand(1_000_000)
%timeit x + y
1.97 ms ± 94.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# Here we do the addition element by element in pure Python
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)),dtype=x.dtype, count=len(x))
367 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mask = (x > 0.5) & (y < 0.5)
is equivalent to
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2
So space must be allocated for the temporaries tmp1 & tmp2
eval will perform the whole expression in compiled code, without the full-size temporaries
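Under the hood, pd.eval uses the numexpr library (when installed) to evaluate such compound expressions without allocating full-size temporary arrays. A minimal sketch, assuming numexpr is available and x, y from the timing example above:
import numexpr
mask = (x > 0.5) & (y < 0.5)
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)   # True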
# Some data to add together
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
for i in range(4))
df1.tail()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
99995 | 0.071979 | 0.439323 | 0.188588 | 0.586705 | 0.640611 | 0.662409 | 0.318503 | 0.600419 | 0.609742 | 0.390592 | ... | 0.122887 | 0.491140 | 0.032855 | 0.567250 | 0.428673 | 0.421092 | 0.021024 | 0.398596 | 0.405897 | 0.869783 |
99996 | 0.313411 | 0.010490 | 0.469216 | 0.600825 | 0.451085 | 0.496918 | 0.983128 | 0.422056 | 0.719077 | 0.045588 | ... | 0.072444 | 0.715574 | 0.300257 | 0.087290 | 0.130703 | 0.549202 | 0.287877 | 0.589258 | 0.516884 | 0.254370 |
99997 | 0.560873 | 0.647396 | 0.043068 | 0.282439 | 0.042950 | 0.346690 | 0.954034 | 0.603182 | 0.447768 | 0.888498 | ... | 0.880079 | 0.508377 | 0.442052 | 0.621332 | 0.314942 | 0.131085 | 0.697310 | 0.111705 | 0.397560 | 0.988347 |
99998 | 0.710115 | 0.067999 | 0.611329 | 0.136199 | 0.054724 | 0.018160 | 0.911428 | 0.762005 | 0.245312 | 0.891027 | ... | 0.249632 | 0.894231 | 0.342761 | 0.844330 | 0.659797 | 0.835561 | 0.117920 | 0.211202 | 0.931760 | 0.296913 |
99999 | 0.116834 | 0.461155 | 0.754556 | 0.250272 | 0.864631 | 0.588966 | 0.358260 | 0.655541 | 0.946702 | 0.145819 | ... | 0.298095 | 0.496018 | 0.990856 | 0.368719 | 0.837910 | 0.244971 | 0.061637 | 0.933295 | 0.245149 | 0.388890 |
5 rows × 100 columns
%timeit df1 + df2 + df3 + df4
82 ms ± 3.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit pd.eval('df1 + df2 + df3 + df4')
38.9 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Operations supported by pd.eval include:
arithmetic operators: +, -, *, /, **, %, //
boolean operations: | (or), & (and), and ~ (not)
and, or, and not with the same semantics as the corresponding bitwise operators
Series and DataFrame objects are supported and behave as they would with plain ol’ Python evaluation.
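A small sketch of comparison and boolean operators inside pd.eval (the local names dfa and dfb are mine, not from the slides):
rng = np.random.RandomState(42)
dfa, dfb = (pd.DataFrame(rng.rand(100, 3)) for i in range(2))
result1 = (dfa < 0.5) & (dfb > 0.3)
result2 = pd.eval('(dfa < 0.5) & (dfb > 0.3)')
np.allclose(result1, result2)   # True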
# Some data
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
A | B | C | |
---|---|---|---|
0 | 0.374540 | 0.950714 | 0.731994 |
1 | 0.598658 | 0.156019 | 0.155995 |
2 | 0.058084 | 0.866176 | 0.601115 |
3 | 0.708073 | 0.020584 | 0.969910 |
4 | 0.832443 | 0.212339 | 0.181825 |
# Normal operations & Using eval
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)
True
# Can just use column names
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)
True
# Can assign to columns, new or existing
df.eval('D = (A + B) / C', inplace=True)
df.head()
eval can access the values of local Python variables using the @ character
column_mean = df.mean(1) # column_mean is in Python memory space
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')  # @ marks a local Python variable
np.allclose(result1, result2)
# Our data
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
A | B | C | |
---|---|---|---|
0 | 0.374540 | 0.950714 | 0.731994 |
1 | 0.598658 | 0.156019 | 0.155995 |
2 | 0.058084 | 0.866176 | 0.601115 |
3 | 0.708073 | 0.020584 | 0.969910 |
4 | 0.832443 | 0.212339 | 0.181825 |
# Sample expression
df[(df.A < 0.5) & (df.B < 0.5)].tail()
A | B | C | |
---|---|---|---|
970 | 0.230417 | 0.001474 | 0.729345 |
973 | 0.302931 | 0.325295 | 0.712621 |
987 | 0.345342 | 0.335610 | 0.978525 |
990 | 0.380518 | 0.163035 | 0.786206 |
999 | 0.309788 | 0.290046 | 0.871414 |
# Using pd.eval
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
True
# This filtering expression cannot be written with DataFrame.eval
# But DataFrame.query works
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)
True
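query also supports the @ prefix for local Python variables, just like DataFrame.eval. A small sketch (the threshold variable is mine):
threshold = 0.5
df.query('A < @threshold and B < @threshold')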
DataFrame.index and DataFrame.columns are available in the query namespace
pd.eval is used to evaluate the expression
The result of the evaluation is first passed to DataFrame.loc
If that fails, it is passed to DataFrame.__getitem__()
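A quick sketch of the first point, assuming the default (unnamed) integer index used above: the index can be referenced by name inside the expression.
df.query('index < 5')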
Two considerations when deciding whether to use eval/query:
Memory
Computational time
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(2000, 3), columns=['A', 'B', 'C'])
df.values.nbytes
48000
x = df[(df.A < 0.5) & (df.B < 0.5)]
same as
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
tmp1.values.nbytes
2000
tmp2.values.nbytes == tmp3.nbytes == tmp1.values.nbytes
True
x.values.nbytes
11592
# Our data
# Following are shell commands to download the data
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
#!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
#!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv
population = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
population.head()
state/region | ages | year | population | |
---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 |
1 | AL | total | 2012 | 4817528.0 |
2 | AL | under18 | 2010 | 1130966.0 |
3 | AL | total | 2010 | 4785570.0 |
4 | AL | under18 | 2011 | 1125763.0 |
areas.head()
state | area (sq. mi) | |
---|---|---|
0 | Alabama | 52423 |
1 | Alaska | 656425 |
2 | Arizona | 114006 |
3 | Arkansas | 53182 |
4 | California | 163707 |
abbrevs.head()
state | abbreviation | |
---|---|---|
0 | Alabama | AL |
1 | Alaska | AK |
2 | Arizona | AZ |
3 | Arkansas | AR |
4 | California | CA |
# First add full names to population data
# So we can display the full state name
pop_full_name = pd.merge(
population, abbrevs, how='outer',
left_on='state/region', right_on='abbreviation')
pop_full_name = pop_full_name.drop('abbreviation', axis=1)  # drop duplicate info
pop_full_name.head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama |
1 | AL | total | 2012 | 4817528.0 | Alabama |
2 | AL | under18 | 2010 | 1130966.0 | Alabama |
3 | AL | total | 2010 | 4785570.0 | Alabama |
4 | AL | under18 | 2011 | 1125763.0 | Alabama |
# Any NaN or None?
pop_full_name.isnull().any()
state/region    False
ages            False
year            False
population       True
state            True
dtype: bool
# Which population values are missing?
#
# pop_full_name['population'].isnull()
# returns True on rows where population is null, False otherwise
pop_full_name[pop_full_name['population'].isnull()].head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
2448 | PR | under18 | 1990 | NaN | NaN |
2449 | PR | total | 1990 | NaN | NaN |
2450 | PR | total | 1991 | NaN | NaN |
2451 | PR | under18 | 1991 | NaN | NaN |
2452 | PR | total | 1993 | NaN | NaN |
# What about state
pop_full_name[pop_full_name['state'].isnull()].head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
2448 | PR | under18 | 1990 | NaN | NaN |
2449 | PR | total | 1990 | NaN | NaN |
2450 | PR | total | 1991 | NaN | NaN |
2451 | PR | under18 | 1991 | NaN | NaN |
2452 | PR | total | 1993 | NaN | NaN |
# Is that all? Let's see
# Get all such rows and show only the unique values
# So get the state/region column where state is null
# loc indexing: [row selector, column]
pop_full_name.loc[
pop_full_name['state'].isnull(), 'state/region'].unique()
array(['PR', 'USA'], dtype=object)
# Missing state values for PR (Puerto Rico) and USA
# So add them
# pop_full_name['state/region'] == 'PR' returns a boolean Series selecting those rows
pop_full_name.loc[
pop_full_name['state/region'] == 'PR', 'state'] = 'Puerto Rico'
pop_full_name.loc[
pop_full_name['state/region'] == 'USA', 'state'] = 'United States'
pop_full_name.isnull().any()
# Now adding area of each state
with_areas = pd.merge(pop_full_name, areas, on='state', how='left')
with_areas.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama | 52423.0 |
1 | AL | total | 2012 | 4817528.0 | Alabama | 52423.0 |
2 | AL | under18 | 2010 | 1130966.0 | Alabama | 52423.0 |
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
4 | AL | under18 | 2011 | 1125763.0 | Alabama | 52423.0 |
# Check for nulls
with_areas.isnull().any()
state/region     False
ages             False
year             False
population        True
state             True
area (sq. mi)     True
dtype: bool
# Which states are missing area information
# First find the NaN rows and then get the unique values
with_areas['state'][with_areas['area (sq. mi)'].isnull()].unique()
array([nan], dtype=object)
# Since we do not care about the entire country,
# just remove those rows
with_areas.dropna(inplace=True)
with_areas.isnull().any()
state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool
# Select year 2010 and just total populations
data2010 = with_areas.query("year == 2010 & ages == 'total'")
data2010.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
91 | AK | total | 2010 | 713868.0 | Alaska | 656425.0 |
101 | AZ | total | 2010 | 6408790.0 | Arizona | 114006.0 |
189 | AR | total | 2010 | 2922280.0 | Arkansas | 53182.0 |
197 | CA | total | 2010 | 37333601.0 | California | 163707.0 |
# Make the state column the index
data2010.set_index('state', inplace=True)
data2010
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-27-0a126f82f26e> in <module>
      1 # Make the state column the index
      2
----> 3 data2010.set_index('state', inplace=True)
      4 data2010
~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
   4723         if missing:
-> 4724             raise KeyError(f"None of {missing} are in the columns")
KeyError: "None of ['state'] are in the columns"
(This error appears when the cell is run a second time: after the first run, 'state' is already the index and no longer a column.)
# compute the density
density = data2010['population']/data2010['area (sq. mi)']
density.to_frame().head()
0 | |
---|---|
state | |
Alabama | 91.287603 |
Alaska | 1.087509 |
Arizona | 56.214497 |
Arkansas | 54.948667 |
California | 228.051342 |
# Sort
density.sort_values(ascending=False, inplace=True)
density.head()
state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64
# If you have ever been in these states, you will believe these numbers
density.tail()
state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64
# Finally find the rank of a particular state
density.index.tolist().index('California')
11
rng = np.random.RandomState(42)
series_data = pd.Series(rng.rand(5))
series_data
0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64
series_data.sum()
2.811925491708157
series_data.mean()
0.5623850983416314
df_data = pd.DataFrame({'A': rng.rand(5),
'B': rng.rand(5)})
df_data
A | B | |
---|---|---|
0 | 0.155995 | 0.020584 |
1 | 0.058084 | 0.969910 |
2 | 0.866176 | 0.832443 |
3 | 0.601115 | 0.212339 |
4 | 0.708073 | 0.181825 |
# On DataFrames, aggregation is computed per column by default
df_data.mean()
A    0.477888
B    0.443420
dtype: float64
# Aggregate each row
df_data.mean(axis='columns')
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64
Split
Apply
Combine
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data': range(6)}, columns=['key', 'data'])
df
key | data | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | C | 2 |
3 | A | 3 |
4 | B | 4 |
5 | C | 5 |
# No computation is done yet; df is wrapped in a DataFrameGroupBy object
# Computation happens when a function is called on the DataFrameGroupBy object
df.groupby('key')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11be224d0>
# Compute the sum of values for each key
df.groupby('key').sum()
data | |
---|---|
key | |
A | 3 |
B | 5 |
C | 7 |
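A column can also be selected from the DataFrameGroupBy object before aggregating, which restricts the computation to that column (a small sketch using the same df):
df.groupby('key')['data'].sum()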
# Our data
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
The aggregate method allows us to apply multiple functions at once
# Using one function
df.groupby('key').median()
data1 | data2 | |
---|---|---|
key | ||
A | 1.5 | 4.0 |
B | 2.5 | 3.5 |
C | 3.5 | 6.0 |
# Applying three functions at once
df.groupby('key').aggregate(['min', np.median, max])
data1 | data2 | |||||
---|---|---|---|---|---|---|
min | median | max | min | median | max | |
key | ||||||
A | 0 | 1.5 | 3 | 3 | 4.0 | 5 |
B | 1 | 2.5 | 4 | 0 | 3.5 | 7 |
C | 2 | 3.5 | 5 | 3 | 6.0 | 9 |
# Applying different functions on each column
df.groupby('key').aggregate({'data1': 'min',
'data2': 'max'})
data1 | data2 | |
---|---|---|
key | ||
A | 0 | 5 |
B | 1 | 7 |
C | 2 | 9 |
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# We have some large standard deviations
df.groupby('key').std()
data1 | data2 | |
---|---|---|
key | ||
A | 2.12132 | 1.414214 |
B | 2.12132 | 4.949747 |
C | 2.12132 | 4.242641 |
# Show all keys that have standard deviation greater than 4 in data2
def outlier(x):
    return x['data2'].std() > 4
df.groupby('key').filter(outlier)
key | data1 | data2 | |
---|---|---|---|
1 | B | 1 | 0 |
2 | C | 2 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
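The same filter can be written inline with a lambda (a sketch equivalent to the outlier function above):
df.groupby('key').filter(lambda x: x['data2'].std() > 4)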
Transform: change the data within each group, keeping the original shape
# Our data
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# Subtract the group mean from the values in each group, column by column
df.groupby('key').transform(lambda x: x - x.mean())
data1 | data2 | |
---|---|---|
0 | -1.5 | 1.0 |
1 | -1.5 | -3.5 |
2 | -1.5 | -3.0 |
3 | 1.5 | -1.0 |
4 | 1.5 | 3.5 |
5 | 1.5 | 3.0 |
Apply an arbitrary function to each group
The function takes a DataFrame of group values as its argument
It may return a DataFrame, a Series, or a scalar
The combine operation used depends on the return type
# Same dataset
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# Divide each value of 'data1' by the sum of 'data2' within the same group
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)
key | data1 | data2 | |
---|---|---|---|
0 | A | 0.000000 | 5 |
1 | B | 0.142857 | 0 |
2 | C | 0.166667 | 3 |
3 | A | 0.375000 | 3 |
4 | B | 0.571429 | 7 |
5 | C | 0.416667 | 9 |
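When the applied function returns a scalar rather than a DataFrame, the combine step produces a Series indexed by group. A small sketch, not on the original slide:
# one number per key: the sum of 'data1' within each group
df.groupby('key').apply(lambda x: x['data1'].sum())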