import numpy as np
import pandas as pd

def inc(n,m):
    return n + m

x = map(inc, [1,3], [2,9])

list(x)

[3, 12]


data = [1, 2, 3, 4, 5, 6]

large = filter(lambda x : x > 3, data)

for x in large:
    print(x)

4
5
6


large = filter(lambda x : x > 3, data)

large

<filter at 0x11d5590d0>


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}

sample_df = pd.DataFrame(data)
sample_df


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}

sample_df = pd.DataFrame(data, index=['d','c','b'])
sample_df


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}

sample_df = pd.DataFrame(data, index=['d','c','b'], columns=['two', 'one'])
sample_df


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)

sample_df


sample_df['one']['a']

1.0


sample_df['one', 'a']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3079             try:
-> 3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: ('one', 'a')

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-21-578b219e595b> in <module>
----> 1 sample_df['one', 'a']

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   3022             if self.columns.nlevels > 1:
   3023                 return self._getitem_multilevel(key)
-> 3024             indexer = self.columns.get_loc(key)
   3025             if is_integer(indexer):
   3026                 indexer = [indexer]

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:
-> 3082                 raise KeyError(key) from err
   3083 
   3084         if tolerance is not None:

KeyError: ('one', 'a')


sample_df['one']['a'] = 42
sample_df


sample_df.one

a    42.0
b     2.0
c     3.0
d     NaN
Name: one, dtype: float64


sample_df.two

a    1.5
b    2.5
c    3.5
d    4.0
Name: two, dtype: float64


sample_df.loc['a'][1]

1.5


sample_df.loc['a','two']

1.5


sample_df.loc['a':'c']


sample_df.loc['a']["two"]

1.5


sample_df.loc['a'][1]

1.5


sample_df.iloc[0][1]

1.5


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)

sample_df.at['a','one']

1.0


sample_df.iat[0,0]

1.0


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5, 4.], index=['a', 'b', 'c', 'd'])}
sample_df = pd.DataFrame(data)

sample_df.index = ['v','w','x','z'] # row index
sample_df.columns = ['cat', 'dog']
sample_df


# From Series

d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1.5, 2.5, 3.5, 4.5], index=['a', 'b', 'c', 'd'])}
pd.DataFrame(d)


data = [(1, 2., 'Hello'), (2, 3., "World")]
pd.DataFrame(data)


data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)


pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
              ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
              ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
              ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
              ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})


area = pd.Series({'California': 423967, 'Texas': 695662,
                   'New York': 141297, 'Florida': 170312,
                   'Illinois': 149995}) 
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data


data['density'] = data['pop']/data['area']
data


data.T    # Transpose


data.plot()    # Sort of meaningless

<AxesSubplot:>


data.to_csv()

',area,pop,density\nCalifornia,423967,38332521,90.41392608386974\nTexas,695662,26448193,38.01874042279153\nNew York,141297,19651127,139.07674614464568\nFlorida,170312,19552860,114.80612053173\nIllinois,149995,12882135,85.88376279209307\n'


from io import StringIO
data = ('col1,col2,col3\n'
        'a,b,1\n'
        'a,b,2\n'
        'c,d,3')

pd.read_csv(StringIO(data))


pd.read_csv("states.csv")


pd.read_csv("states.csv", header=0, index_col=0)


flower_data_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
flower_data = pd.read_csv(flower_data_url, index_col=0)
flower_data.head()


iris = pd.read_csv(flower_data_url, index_col=0)

(iris
 .assign(sepal_ratio =iris['Sepal.Width'] / iris['Sepal.Length'])
 .head())


(iris
 .assign(sepal_ratio=lambda x: (x['Sepal.Width'] / x['Sepal.Length']))
 .head())


 (iris
  .assign(SepalRatio=lambda x: x['Sepal.Width'] / x['Sepal.Length'],
          PetalRatio=lambda x: x['Petal.Width'] / x['Petal.Length'])
  .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))

<AxesSubplot:xlabel='SepalRatio', ylabel='PetalRatio'>


iris = pd.read_csv(flower_data_url, index_col=0)
iris.Sepal.Width

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-59-a6ef89976732> in <module>
      1 iris = pd.read_csv(flower_data_url, index_col=0)
----> 2 iris.Sepal.Width

~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5460             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5461                 return self[name]
-> 5462             return object.__getattribute__(self, name)
   5463 
   5464     def __setattr__(self, name: str, value) -> None:

AttributeError: 'DataFrame' object has no attribute 'Sepal'


iris.columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']
iris.head()


iris.SepalWidth.head(3)

1    3.5
2    3.0
3    3.2
Name: SepalWidth, dtype: float64


data = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
        'two': pd.Series([1.5, 2.5, 3.5], index=['a', 'b', 'c'])}
sample_df = pd.DataFrame(data)
sample_df + 10


np.sin(sample_df)


sample_df + sample_df


a = pd.DataFrame([[1,2,3], [4,5,6],[7,8,9]])
a


a + 1


def inc(x):
    return x + 1

inc(a)


a = pd.DataFrame([[1,2,3], 
                  [4,5,6],
                  [7,8,9]])
b = pd.Series([1,2,3])
a.subtract(b, axis=0)


a = pd.DataFrame([[1,2,3], [4,5,6],[7,8,9]], columns = ["A","B","C"])
a


a.agg(np.sum)

A    12
B    15
C    18
dtype: int64


mixed = pd.DataFrame([[1,2,"Cat"], [4,5,"Dog"],[7,8,"Mouse"]], columns = ["A","B","C"])
mixed


mixed.agg(np.sum)

A             12
B             15
C    CatDogMouse
dtype: object


df = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]})
df


df.groupby("X")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fbb786e6a30>


df.groupby("X").sum()


flower_data_url = "https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv"
flower_data = pd.read_csv(flower_data_url, index_col=0)
flower_data.head()


flower_data.groupby("Species").mean()


flower_data.groupby("Species").boxplot()

setosa             AxesSubplot(0.1,0.559091;0.363636x0.340909)
versicolor    AxesSubplot(0.536364,0.559091;0.363636x0.340909)
virginica              AxesSubplot(0.1,0.15;0.363636x0.340909)
dtype: object

	California	Texas	New York	Florida	Illinois
area	4.239670e+05	6.956620e+05	1.412970e+05	1.703120e+05	1.499950e+05
pop	3.833252e+07	2.644819e+07	1.965113e+07	1.955286e+07	1.288214e+07
density	9.041393e+01	3.801874e+01	1.390767e+02	1.148061e+02	8.588376e+01

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species
1	5.1	3.5	1.4	0.2	setosa
2	4.9	3.0	1.4	0.2	setosa
3	4.7	3.2	1.3	0.2	setosa
4	4.6	3.1	1.5	0.2	setosa
5	5.0	3.6	1.4	0.2	setosa

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species	sepal_ratio
1	5.1	3.5	1.4	0.2	setosa	0.686275
2	4.9	3.0	1.4	0.2	setosa	0.612245
3	4.7	3.2	1.3	0.2	setosa	0.680851
4	4.6	3.1	1.5	0.2	setosa	0.673913
5	5.0	3.6	1.4	0.2	setosa	0.720000

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width	Species	sepal_ratio
1	5.1	3.5	1.4	0.2	setosa	0.686275
2	4.9	3.0	1.4	0.2	setosa	0.612245
3	4.7	3.2	1.3	0.2	setosa	0.680851
4	4.6	3.1	1.5	0.2	setosa	0.673913
5	5.0	3.6	1.4	0.2	setosa	0.720000

	SepalLength	SepalWidth	PetalLength	PetalWidth	Species
1	5.1	3.5	1.4	0.2	setosa
2	4.9	3.0	1.4	0.2	setosa
3	4.7	3.2	1.3	0.2	setosa
4	4.6	3.1	1.5	0.2	setosa
5	5.0	3.6	1.4	0.2	setosa

DataFrame¶

Accessing Elements¶

at and iat¶

Changing Row and Column Index¶

Creating DataFrame¶

Calculation¶

Reading Data¶

Contents of local file states.csv¶

R Datasets¶

Iris Dataset¶

Sepal and Petal¶

Iris Dataset¶

Assign¶

Prolems with dots in names¶

Operations on DataFrames¶

DataFrame & Series Operations¶

Column-wise Operations with Series¶

Aggregation¶

Group by: split apply combine¶

	area	pop
California	423967	38332521
Texas	695662	26448193
New York	141297	19651127
Florida	170312	19552860
Illinois	149995	12882135

	area	pop	density
state
California	423967	38332521	90.413926
Texas	695662	26448193	38.018740
New York	141297	19651127	139.076746
Florida	170312	19552860	114.806121
Illinois	149995	12882135	85.883763

	Sepal.Length	Sepal.Width	Petal.Length	Petal.Width
Species
setosa	5.006	3.428	1.462	0.246
versicolor	5.936	2.770	4.260	1.326
virginica	6.588	2.974	5.552	2.026

	0	1	2
0	1	2.0	Hello
1	2	3.0	World

	a	b	c
0	1	2	NaN
1	5	10	20.0

	col1	col2	col3
0	a	b	1
1	a	b	2
2	c	d	3

	one	two
a	11.0	11.5
b	12.0	12.5
c	13.0	13.5