2D data structure
Rows & Columns labeled
Columns can have different data types
Create from
import numpy as np
import pandas as pd
def inc(n,m):
return n + m
x = map(inc, [1,3], [2,9])
list(x)
[3, 12]
data = [1, 2, 3, 4, 5, 6]
large = filter(lambda x : x > 3, data)
for x in large:
print(x)
4 5 6
large = filter(lambda x : x > 3, data)
large
<filter at 0x11d5590d0>
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)
sample_df
one | two | |
---|---|---|
a | 1.0 | 1.5 |
b | 2.0 | 2.5 |
c | 3.0 | 3.5 |
d | NaN | 4.0 |
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data, index=['d','c','b'])
sample_df
one | two | |
---|---|---|
d | NaN | 4.0 |
c | 3.0 | 3.5 |
b | 2.0 | 2.5 |
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data, index=['d','c','b'], columns=['two', 'one'])
sample_df
two | one | |
---|---|---|
d | 4.0 | NaN |
c | 3.5 | 3.0 |
b | 2.5 | 2.0 |
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)
sample_df
one | two | |
---|---|---|
a | 1.0 | 1.5 |
b | 2.0 | 2.5 |
c | 3.0 | 3.5 |
d | NaN | 4.0 |
sample_df['one']['a']
1.0
sample_df['one', 'a']
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: ('one', 'a') The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-21-578b219e595b> in <module> ----> 1 sample_df['one', 'a'] ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key) 3022 if self.columns.nlevels > 1: 3023 return self._getitem_multilevel(key) -> 3024 indexer = self.columns.get_loc(key) 3025 if is_integer(indexer): 3026 indexer = [indexer] ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: ('one', 'a')
sample_df['one']['a'] = 42
sample_df
one | two | |
---|---|---|
a | 42.0 | 1.5 |
b | 2.0 | 2.5 |
c | 3.0 | 3.5 |
d | NaN | 4.0 |
sample_df.one
a 42.0 b 2.0 c 3.0 d NaN Name: one, dtype: float64
sample_df.two
a 1.5 b 2.5 c 3.5 d 4.0 Name: two, dtype: float64
loc, iloc
sample_df.loc['a'][1]
1.5
sample_df.loc['a','two']
1.5
sample_df.loc['a':'c']
one | two | |
---|---|---|
a | 1.0 | 1.5 |
b | 2.0 | 2.5 |
c | 3.0 | 3.5 |
sample_df.loc['a']["two"]
1.5
sample_df.loc['a'][1]
1.5
sample_df.iloc[0][1]
1.5
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)
sample_df.at['a','one']
1.0
sample_df.iat[0,0]
1.0
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)
sample_df.index = ['v','w','x','z'] # row index
sample_df.columns = ['cat', 'dog']
sample_df
cat | dog | |
---|---|---|
v | 1.0 | 1.5 |
w | 2.0 | 2.5 |
x | 3.0 | 3.5 |
z | NaN | 4.0 |
# From Series
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5, 4.5], index=['a', 'b', 'c', 'd'])}
pd.DataFrame(d)
one | two | |
---|---|---|
a | 1.0 | 1.5 |
b | 2.0 | 2.5 |
c | 3.0 | 3.5 |
d | NaN | 4.5 |
data = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 2.0 | Hello |
1 | 2 | 3.0 | World |
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)
a | b | c | |
---|---|---|---|
0 | 1 | 2 | NaN |
1 | 5 | 10 | 20.0 |
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})
a | b | |||||
---|---|---|---|---|---|---|
b | a | c | a | b | ||
A | B | 1.0 | 4.0 | 5.0 | 8.0 | 10.0 |
C | 2.0 | 3.0 | 6.0 | 7.0 | NaN | |
D | NaN | NaN | NaN | NaN | 9.0 |
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data
area | pop | |
---|---|---|
California | 423967 | 38332521 |
Texas | 695662 | 26448193 |
New York | 141297 | 19651127 |
Florida | 170312 | 19552860 |
Illinois | 149995 | 12882135 |
data['density'] = data['pop']/data['area']
data
area | pop | density | |
---|---|---|---|
California | 423967 | 38332521 | 90.413926 |
Texas | 695662 | 26448193 | 38.018740 |
New York | 141297 | 19651127 | 139.076746 |
Florida | 170312 | 19552860 | 114.806121 |
Illinois | 149995 | 12882135 | 85.883763 |
data.T # Transpose
California | Texas | New York | Florida | Illinois | |
---|---|---|---|---|---|
area | 4.239670e+05 | 6.956620e+05 | 1.412970e+05 | 1.703120e+05 | 1.499950e+05 |
pop | 3.833252e+07 | 2.644819e+07 | 1.965113e+07 | 1.955286e+07 | 1.288214e+07 |
density | 9.041393e+01 | 3.801874e+01 | 1.390767e+02 | 1.148061e+02 | 8.588376e+01 |
data.plot() # Sort of meaningless
<AxesSubplot:>
data.to_csv()
',area,pop,density\nCalifornia,423967,38332521,90.41392608386974\nTexas,695662,26448193,38.01874042279153\nNew York,141297,19651127,139.07674614464568\nFlorida,170312,19552860,114.80612053173\nIllinois,149995,12882135,85.88376279209307\n'
Formats
from io import StringIO
data = ('col1,col2,col3\n'
'a,b,1\n'
'a,b,2\n'
'c,d,3')
pd.read_csv(StringIO(data))
col1 | col2 | col3 | |
---|---|---|---|
0 | a | b | 1 |
1 | a | b | 2 |
2 | c | d | 3 |
state,area,pop,density
California,423967,38332521,90.41392608386974
Texas,695662,26448193,38.01874042279153
New York,141297,19651127,139.07674614464568
Florida,170312,19552860,114.80612053173
Illinois,149995,12882135,85.88376279209307
pd.read_csv("states.csv")
state | area | pop | density | |
---|---|---|---|---|
0 | California | 423967 | 38332521 | 90.413926 |
1 | Texas | 695662 | 26448193 | 38.018740 |
2 | New York | 141297 | 19651127 | 139.076746 |
3 | Florida | 170312 | 19552860 | 114.806121 |
4 | Illinois | 149995 | 12882135 | 85.883763 |
pd.read_csv("states.csv", header=0, index_col=0)
area | pop | density | |
---|---|---|---|
state | |||
California | 423967 | 38332521 | 90.413926 |
Texas | 695662 | 26448193 | 38.018740 |
New York | 141297 | 19651127 | 139.076746 |
Florida | 170312 | 19552860 | 114.806121 |
Illinois | 149995 | 12882135 | 85.883763 |
Given measurements of A Sepal and a petal can we determine which iris?
Dataset contains 150 rows of measurements
Three types of iris
flower_data_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
flower_data = pd.read_csv(flower_data_url, index_col=0)
flower_data.head()
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Returns a copy of the dataframe
iris = pd.read_csv(flower_data_url, index_col=0)
(iris
.assign(sepal_ratio =iris['Sepal.Width'] / iris['Sepal.Length'])
.head())
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | sepal_ratio | |
---|---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0.686275 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0.612245 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0.680851 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0.673913 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0.720000 |
Using Lambda
(iris
.assign(sepal_ratio=lambda x: (x['Sepal.Width'] / x['Sepal.Length']))
.head())
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | sepal_ratio | |
---|---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0.686275 |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0.612245 |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0.680851 |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0.673913 |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0.720000 |
(iris
.assign(SepalRatio=lambda x: x['Sepal.Width'] / x['Sepal.Length'],
PetalRatio=lambda x: x['Petal.Width'] / x['Petal.Length'])
.plot(kind='scatter', x='SepalRatio', y='PetalRatio'))
<AxesSubplot:xlabel='SepalRatio', ylabel='PetalRatio'>
Some syntax does not support dots in names
iris = pd.read_csv(flower_data_url, index_col=0)
iris.Sepal.Width
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-59-a6ef89976732> in <module> 1 iris = pd.read_csv(flower_data_url, index_col=0) ----> 2 iris.Sepal.Width ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name) 5460 if self._info_axis._can_hold_identifiers_and_holds_name(name): 5461 return self[name] -> 5462 return object.__getattribute__(self, name) 5463 5464 def __setattr__(self, name: str, value) -> None: AttributeError: 'DataFrame' object has no attribute 'Sepal'
iris.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
iris.head()
SepalLength | SepalWidth | PetalLength | PetalWidth | Species | |
---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
iris.SepalWidth.head(3)
1 3.5 2 3.0 3 3.2 Name: SepalWidth, dtype: float64
Done element by element
data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1.5, 2.5, 3.5], index=['a', 'b', 'c'])}
sample_df = pd.DataFrame(data)
sample_df + 10
one | two | |
---|---|---|
a | 11.0 | 11.5 |
b | 12.0 | 12.5 |
c | 13.0 | 13.5 |
np.sin(sample_df)
one | two | |
---|---|---|
a | 0.841471 | 0.997495 |
b | 0.909297 | 0.598472 |
c | 0.141120 | -0.350783 |
sample_df + sample_df
one | two | |
---|---|---|
a | 2.0 | 3.0 |
b | 4.0 | 5.0 |
c | 6.0 | 7.0 |
Default is row-wise
a = pd.DataFrame([[1,2,3], [4,5,6],[7,8,9]])
a
0 | 1 | 2 | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
a + 1
0 | 1 | 2 | |
---|---|---|---|
0 | 2 | 3 | 4 |
1 | 5 | 6 | 7 |
2 | 8 | 9 | 10 |
def inc(x):
return x + 1
inc(a)
0 | 1 | 2 | |
---|---|---|---|
0 | 2 | 3 | 4 |
1 | 5 | 6 | 7 |
2 | 8 | 9 | 10 |
a = pd.DataFrame([[1,2,3],
[4,5,6],
[7,8,9]])
b = pd.Series([1,2,3])
a.subtract(b, axis=0)
0 | 1 | 2 | |
---|---|---|---|
0 | 0 | 1 | 2 |
1 | 2 | 3 | 4 |
2 | 4 | 5 | 6 |
a = pd.DataFrame([[1,2,3], [4,5,6],[7,8,9]], columns = ["A","B","C"])
a
A | B | C | |
---|---|---|---|
0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 |
2 | 7 | 8 | 9 |
a.agg(np.sum)
A 12 B 15 C 18 dtype: int64
mixed = pd.DataFrame([[1,2,"Cat"], [4,5,"Dog"],[7,8,"Mouse"]], columns = ["A","B","C"])
mixed
A | B | C | |
---|---|---|---|
0 | 1 | 2 | Cat |
1 | 4 | 5 | Dog |
2 | 7 | 8 | Mouse |
mixed.agg(np.sum)
A 12 B 15 C CatDogMouse dtype: object
Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure
df = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]})
df
X | Y | |
---|---|---|
0 | B | 1 |
1 | B | 2 |
2 | A | 3 |
3 | A | 4 |
df.groupby("X")
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbb786e6a30>
df.groupby("X").sum()
Y | |
---|---|
X | |
A | 7 |
B | 3 |
flower_data_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
flower_data = pd.read_csv(flower_data_url, index_col=0)
flower_data.head()
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
flower_data.groupby("Species").mean()
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
---|---|---|---|---|
Species | ||||
setosa | 5.006 | 3.428 | 1.462 | 0.246 |
versicolor | 5.936 | 2.770 | 4.260 | 1.326 |
virginica | 6.588 | 2.974 | 5.552 | 2.026 |
flower_data.groupby("Species").boxplot()
setosa AxesSubplot(0.1,0.559091;0.363636x0.340909) versicolor AxesSubplot(0.536364,0.559091;0.363636x0.340909) virginica AxesSubplot(0.1,0.15;0.363636x0.340909) dtype: object