Taken from Chapter 3 of
Python Data Science Handbook,
by Jake VanderPlas,
O'Reilly Media, Inc
2016
Finding and removing null values
import numpy as np
import pandas as pd
# Some data with NaN & None
data = pd.Series([1, np.nan, 'hello', None])
data
0        1
1      NaN
2    hello
3     None
dtype: object
# Finding all NaN & None locations
data.isnull()
0    False
1     True
2    False
3     True
dtype: bool
# Using boolean indexing
data[data.notnull()]
0        1
2    hello
dtype: object
# Simpler way to drop NaN & None
x = data.dropna()
x
0        1
2    hello
dtype: object
# DataFrame with NaN
df = pd.DataFrame([[6, np.nan, 2],
[2, 3, 5],
[np.nan, 4, 6]])
df
0 | 1 | 2 | |
---|---|---|---|
0 | 6.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
# On DataFrame drop rows with NaN or None
df.dropna()
0 | 1 | 2 | |
---|---|---|---|
1 | 2.0 | 3.0 | 5 |
# Drop columns with NaN or None
df.dropna(axis='columns')
2 | |
---|---|
0 | 2 |
1 | 5 |
2 | 6 |
# Modifying the data to show the how & thresh options
df[3] = np.nan
df
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 6.0 | NaN | 2 | NaN |
1 | 2.0 | 3.0 | 5 | NaN |
2 | NaN | 4.0 | 6 | NaN |
# Drop columns that are all NaN; how accepts 'all' or 'any' (default)
df.dropna(axis='columns', how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 6.0 | NaN | 2 |
1 | 2.0 | 3.0 | 5 |
2 | NaN | 4.0 | 6 |
# thresh = minimum number of non-NaN values required to keep a row/column
df.dropna(axis='rows', thresh=3)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
1 | 2.0 | 3.0 | 5 | NaN |
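Not on the original slide: dropna also accepts a subset argument that limits the NaN check to particular columns. A small sketch using the same df:
# Keep a row as long as column 0 is not NaN (other columns are ignored)
df.dropna(subset=[0])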
Some operations require that indices be sorted
# Series with an unsorted MultiIndex
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data
char  int
a     1      0.532697
      2      0.620827
c     1      0.435243
      2      0.142022
b     1      0.408163
      2      0.309437
dtype: float64
# Can't slice due to the indices being out of order
try:
    data['a':'b']
except KeyError as e:
    print(type(e))
    print(e)
<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'
# sort the index
data = data.sort_index()
data
char  int
a     1      0.532697
      2      0.620827
b     1      0.408163
      2      0.309437
c     1      0.435243
      2      0.142022
dtype: float64
# Now can slice
data['a':'b']
char  int
a     1      0.532697
      2      0.620827
b     1      0.408163
      2      0.309437
dtype: float64
Multi-indexes allow DataFrames to represent higher-dimensional data
# hierarchical indices and columns
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
names=['subject', 'type'])
# mock some data
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
# create the DataFrame
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 50.0 | 38.2 | 39.0 | 37.7 | 48.0 | 37.8 |
2 | 31.0 | 35.9 | 25.0 | 36.0 | 24.0 | 37.1 | |
2014 | 1 | 23.0 | 34.4 | 48.0 | 35.5 | 29.0 | 36.1 |
2 | 31.0 | 37.2 | 42.0 | 38.3 | 37.0 | 35.8 |
health_data['Bob']
type | HR | Temp | |
---|---|---|---|
year | visit | ||
2013 | 1 | 50.0 | 38.2 |
2 | 31.0 | 35.9 | |
2014 | 1 | 23.0 | 34.4 |
2 | 31.0 | 37.2 |
health_data['Bob'].mean()
type
HR      33.750
Temp    36.425
dtype: float64
# Our data
health_data
subject | Bob | Guido | Sue | ||||
---|---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp | |
year | visit | ||||||
2013 | 1 | 50.0 | 38.2 | 39.0 | 37.7 | 48.0 | 37.8 |
2 | 31.0 | 35.9 | 25.0 | 36.0 | 24.0 | 37.1 | |
2014 | 1 | 23.0 | 34.4 | 48.0 | 35.5 | 29.0 | 36.1 |
2 | 31.0 | 37.2 | 42.0 | 38.3 | 37.0 | 35.8 |
# mean of everyone's values per year
data_mean = health_data.mean(level='year')
data_mean
subject | Bob | Guido | Sue | |||
---|---|---|---|---|---|---|
type | HR | Temp | HR | Temp | HR | Temp |
year | ||||||
2013 | 40.5 | 37.05 | 32.0 | 36.85 | 36.0 | 37.45 |
2014 | 27.0 | 35.80 | 45.0 | 36.90 | 33.0 | 35.95 |
# Going further: average across the 'subject' level too, keeping only 'type'
data_mean.mean(axis=1, level='type')
type | HR | Temp |
---|---|---|
year | ||
2013 | 36.166667 | 37.116667 |
2014 | 35.000000 | 36.216667 |
Combining Series is straightforward
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object
A helper function used to keep the slides small
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
# example DataFrame
make_df('ABC', range(3))
A | B | C | |
---|---|---|---|
0 | A0 | B0 | C0 |
1 | A1 | B1 | C1 |
2 | A2 | B2 | C2 |
# More helper functions
def println(obj):
    print(obj)
    print("\n")

def show_concat(df1, df2):
    println(df1)
    println(df2)
    println(pd.concat([df1, df2]))
# Add rows (concatenate along the index)
AB12 = make_df('AB', [1, 2])
AB34 = make_df('AB', [3, 4])
show_concat(AB12, AB34)
    A   B
1  A1  B1
2  A2  B2


    A   B
3  A3  B3
4  A4  B4


    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
# Add columns (concatenate along axis=1)
AB01 = make_df('AB', [0, 1])
CD01 = make_df('CD', [0, 1])
println(AB01)
println(CD01)
pd.concat([AB01, CD01],axis=1)
    A   B
0  A0  B0
1  A1  B1


    C   D
0  C0  D0
1  C1  D1
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
# Concat keeps duplicate indices
AB01 = make_df('AB', [0, 1])
AB23 = make_df('AB', [2, 3])
AB23.index = AB01.index # make duplicate indices!
show_concat(AB01, AB23)
    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3


    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3
# Raise an error on duplicate indices
try:
    pd.concat([AB01, AB23], verify_integrity=True)
except ValueError as e:
    print("ValueError:", e)
ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')
# Ignore the existing row indices and reindex the result
println(AB01)
print(AB23)
pd.concat([AB01, AB23], ignore_index=True)
    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3
A | B | |
---|---|---|
0 | A0 | B0 |
1 | A1 | B1 |
2 | A2 | B2 |
3 | A3 | B3 |
# concat keeps all columns, filling missing entries with NaN
AB01 = make_df('AB', [0, 1])
BC34 = make_df('BC', [3, 4])
println(AB01)
println(BC34)
pd.concat([AB01, BC34])
    A   B
0  A0  B0
1  A1  B1


    B   C
3  B3  C3
4  B4  C4
A | B | C | |
---|---|---|---|
0 | A0 | B0 | NaN |
1 | A1 | B1 | NaN |
3 | NaN | B3 | C3 |
4 | NaN | B4 | C4 |
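To keep only the columns that both inputs share, concat accepts a join option. A small sketch (not on the original slide) reusing AB01 and BC34 from above:
# join='inner' keeps only the intersection of the columns (here just B)
pd.concat([AB01, BC34], join='inner')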
# Add keys to create an extra index level, making the result hierarchical
AB01 = make_df('AB', [0, 1])
BC23 = make_df('BC', [2, 3])
println(AB01)
println(BC23)
println(pd.concat([AB01, BC23], keys=['AB', 'BC']))
    A   B
0  A0  B0
1  A1  B1


    B   C
2  B2  C2
3  B3  C3


        A   B    C
AB 0   A0  B0  NaN
   1   A1  B1  NaN
BC 2  NaN  B2   C2
   3  NaN  B3   C3
Types of merge relationships:
One-to-one
One-to-many
Many-to-many
# Sample Data
group = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
hire_date = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
println(group); println(hire_date)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR


  employee  hire_date
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
# Combine two Dataframes into one
group_hire_date = pd.merge(group, hire_date)
group_hire_date
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
pd.merge(hire_date, group)
employee | hire_date | group | |
---|---|---|---|
0 | Lisa | 2004 | Engineering |
1 | Bob | 2008 | Accounting |
2 | Jake | 2012 | Engineering |
3 | Sue | 2014 | HR |
supervisor = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
println(group_hire_date); println(supervisor);
pd.merge(group_hire_date, supervisor)
  employee        group  hire_date
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
employee | group | hire_date | supervisor | |
---|---|---|---|---|
0 | Bob | Accounting | 2008 | Carly |
1 | Jake | Engineering | 2012 | Guido |
2 | Lisa | Engineering | 2004 | Guido |
3 | Sue | HR | 2014 | Steve |
skills = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
skills
group | skills | |
---|---|---|
0 | Accounting | math |
1 | Accounting | spreadsheets |
2 | Engineering | coding |
3 | Engineering | linux |
4 | HR | spreadsheets |
5 | HR | organization |
println(group); println(skills)
pd.merge(group, skills)
  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR


         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization
employee | group | skills | |
---|---|---|---|
0 | Bob | Accounting | math |
1 | Bob | Accounting | spreadsheets |
2 | Jake | Engineering | coding |
3 | Jake | Engineering | linux |
4 | Lisa | Engineering | coding |
5 | Lisa | Engineering | linux |
6 | Sue | HR | spreadsheets |
7 | Sue | HR | organization |
Specify which columns or rows to merge on
How to handle the case when column names are not the same in the two DataFrames
group
employee | group | |
---|---|---|
0 | Bob | Accounting |
1 | Jake | Engineering |
2 | Lisa | Engineering |
3 | Sue | HR |
salary = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
salary
name | salary | |
---|---|---|
0 | Bob | 70000 |
1 | Jake | 80000 |
2 | Lisa | 120000 |
3 | Sue | 90000 |
Column names do not match
# Specify which columns to merge on
# Can provide a list of columns
pd.merge(group, salary, left_on="employee", right_on="name")
employee | group | name | salary | |
---|---|---|---|---|
0 | Bob | Accounting | Bob | 70000 |
1 | Jake | Engineering | Jake | 80000 |
2 | Lisa | Engineering | Lisa | 120000 |
3 | Sue | HR | Sue | 90000 |
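The merged result keeps both key columns; a common follow-up is to drop the redundant one (a small sketch):
pd.merge(group, salary, left_on="employee", right_on="name").drop('name', axis=1)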
What if multiple column names are the same?
How to specify which column to merge on?
# Use on to specify column(s) to merge on
pd.merge(group, hire_date, on='employee')
employee | group | hire_date | |
---|---|---|---|
0 | Bob | Accounting | 2008 |
1 | Jake | Engineering | 2012 |
2 | Lisa | Engineering | 2004 |
3 | Sue | HR | 2014 |
First some data
# Make a dataframe
def make_int_df(cols, ind, increment):
    """Quickly make a DataFrame"""
    data = {c: [i + increment for i in ind]
            for c in cols}
    return pd.DataFrame(data, ind)
abc = make_int_df('ABC', [1, 2],0)
abc
A | B | C | |
---|---|---|---|
1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 |
# Note the columns do not fully match
bcd = make_int_df('BCD', [1, 2],0)
bcd
B | C | D | |
---|---|---|---|
1 | 1 | 1 | 1 |
2 | 2 | 2 | 2 |
# Merge on the shared column 'B'; the other shared column C gets _x/_y suffixes
pd.merge(abc, bcd, on='B')
A | B | C_x | C_y | D | |
---|---|---|---|---|---|
0 | 1 | 1 | 1 | 1 | 1 |
1 | 2 | 2 | 2 | 2 | 2 |
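The _x/_y suffixes on the overlapping column C can be customized with the suffixes keyword. A small sketch, not on the original slide:
pd.merge(abc, bcd, on='B', suffixes=('_left', '_right'))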
# Make one of the columns the index
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
group = df1.set_index('employee')
group
group | |
---|---|
employee | |
Bob | Accounting |
Jake | Engineering |
Lisa | Engineering |
Sue | HR |
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
hire_date = df2.set_index('employee')
hire_date
hire_date | |
---|---|
employee | |
Lisa | 2004 |
Bob | 2008 |
Jake | 2012 |
Sue | 2014 |
# Now merge
pd.merge(group, hire_date, left_index=True, right_index=True)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
# join merges on the indices by default
# Note the switch from function to method
group.join(hire_date)
group | hire_date | |
---|---|---|
employee | ||
Bob | Accounting | 2008 |
Jake | Engineering | 2012 |
Lisa | Engineering | 2004 |
Sue | HR | 2014 |
Inner join - only the data in common
Outer join - all the data
Left join - all rows from the left table, plus matching data from the right
Right join - all rows from the right table, plus matching data from the left
By changing the order of the DataFrames, a left join becomes a right join
# Some Data
food = pd.DataFrame(
{'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
food
name | food | |
---|---|---|
0 | Peter | fish |
1 | Paul | beans |
2 | Mary | bread |
drink = pd.DataFrame(
{'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
drink
name | drink | |
---|---|---|
0 | Mary | wine |
1 | Joseph | beer |
Inner Join - on Mary
Outer Join - all
pd.merge(drink, food)
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
# how='inner' is the default
pd.merge(drink,food, how='inner')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
pd.merge(drink,food, how='outer')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
1 | Joseph | beer | NaN |
2 | Peter | NaN | fish |
3 | Paul | NaN | beans |
# Order of dataframes changes order of rows & columns
pd.merge(food,drink, how='outer')
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
3 | Joseph | NaN | beer |
# Keep all rows from food (the left table)
println(food)
pd.merge(food,drink, how='left')
    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
name | food | drink | |
---|---|---|---|
0 | Peter | fish | NaN |
1 | Paul | beans | NaN |
2 | Mary | bread | wine |
# Keep all rows from drink (the right table)
pd.merge(food,drink, how='right')
name | food | drink | |
---|---|---|---|
0 | Mary | bread | wine |
1 | Joseph | NaN | beer |
# Note relationship between left & right
pd.merge(drink,food, how='left')
name | drink | food | |
---|---|---|---|
0 | Mary | wine | bread |
1 | Joseph | beer | NaN |
Pandas does most of its work in compiled C code
But at times a computation cannot be expressed as a single call into C
So pandas makes multiple calls into C and stores the intermediate results
Allocating memory for those intermediates slows down the computation
# Vector addition is done in C
import numpy as np
rng = np.random.RandomState(42)
x = rng.rand(1_000_000)
y = rng.rand(1_000_000)
%timeit x + y
1.97 ms ± 94.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# Here we do the addition element by element in pure Python
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)),dtype=x.dtype, count=len(x))
367 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
mask = (x > 0.5) & (y < 0.5)
is equivalent to
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2
So space must be allocated for the temporaries tmp1 & tmp2
eval will perform the whole expression in compiled code, without the full-size temporaries
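Under the hood, pd.eval uses the numexpr library (when installed) to evaluate such compound expressions without allocating full-size temporary arrays. A minimal sketch, assuming numexpr is available and x, y from the timing example above:
import numexpr
mask = (x > 0.5) & (y < 0.5)
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)   # True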
# Some data to add together
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
for i in range(4))
df1.tail()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
99995 | 0.071979 | 0.439323 | 0.188588 | 0.586705 | 0.640611 | 0.662409 | 0.318503 | 0.600419 | 0.609742 | 0.390592 | ... | 0.122887 | 0.491140 | 0.032855 | 0.567250 | 0.428673 | 0.421092 | 0.021024 | 0.398596 | 0.405897 | 0.869783 |
99996 | 0.313411 | 0.010490 | 0.469216 | 0.600825 | 0.451085 | 0.496918 | 0.983128 | 0.422056 | 0.719077 | 0.045588 | ... | 0.072444 | 0.715574 | 0.300257 | 0.087290 | 0.130703 | 0.549202 | 0.287877 | 0.589258 | 0.516884 | 0.254370 |
99997 | 0.560873 | 0.647396 | 0.043068 | 0.282439 | 0.042950 | 0.346690 | 0.954034 | 0.603182 | 0.447768 | 0.888498 | ... | 0.880079 | 0.508377 | 0.442052 | 0.621332 | 0.314942 | 0.131085 | 0.697310 | 0.111705 | 0.397560 | 0.988347 |
99998 | 0.710115 | 0.067999 | 0.611329 | 0.136199 | 0.054724 | 0.018160 | 0.911428 | 0.762005 | 0.245312 | 0.891027 | ... | 0.249632 | 0.894231 | 0.342761 | 0.844330 | 0.659797 | 0.835561 | 0.117920 | 0.211202 | 0.931760 | 0.296913 |
99999 | 0.116834 | 0.461155 | 0.754556 | 0.250272 | 0.864631 | 0.588966 | 0.358260 | 0.655541 | 0.946702 | 0.145819 | ... | 0.298095 | 0.496018 | 0.990856 | 0.368719 | 0.837910 | 0.244971 | 0.061637 | 0.933295 | 0.245149 | 0.388890 |
5 rows × 100 columns
%timeit df1 + df2 + df3 + df4
82 ms ± 3.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit pd.eval('df1 + df2 + df3 + df4')
38.9 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Operations supported by pd.eval include:
arithmetic operators: +, -, *, /, **, %, //
boolean operations: | (or), & (and), and ~ (not)
and, or, and not with the same semantics as the corresponding bitwise operators
Series and DataFrame objects are supported and behave as they would with plain ol’ Python evaluation.
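A small sketch of comparison and boolean operators inside pd.eval (the local names dfa and dfb are mine, not from the slides):
rng = np.random.RandomState(42)
dfa, dfb = (pd.DataFrame(rng.rand(100, 3)) for i in range(2))
result1 = (dfa < 0.5) & (dfb > 0.3)
result2 = pd.eval('(dfa < 0.5) & (dfb > 0.3)')
np.allclose(result1, result2)   # True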
# Some data
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
A | B | C | |
---|---|---|---|
0 | 0.374540 | 0.950714 | 0.731994 |
1 | 0.598658 | 0.156019 | 0.155995 |
2 | 0.058084 | 0.866176 | 0.601115 |
3 | 0.708073 | 0.020584 | 0.969910 |
4 | 0.832443 | 0.212339 | 0.181825 |
# Normal operations & Using eval
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)
True
# Can just use column names
result3 = df.eval('(A + B) / (C - 1)')
np.allclose(result1, result3)
True
# Can assign to columns, new or existing
df.eval('D = (A + B) / C', inplace=True)
df.head()
eval can access the values of local Python variables using the @ character
column_mean = df.mean(1) # column_mean is in Python memory space
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')  # @ marks a local Python variable
np.allclose(result1, result2)
# Our data
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df.head()
A | B | C | |
---|---|---|---|
0 | 0.374540 | 0.950714 | 0.731994 |
1 | 0.598658 | 0.156019 | 0.155995 |
2 | 0.058084 | 0.866176 | 0.601115 |
3 | 0.708073 | 0.020584 | 0.969910 |
4 | 0.832443 | 0.212339 | 0.181825 |
# Sample expression
df[(df.A < 0.5) & (df.B < 0.5)].tail()
A | B | C | |
---|---|---|---|
970 | 0.230417 | 0.001474 | 0.729345 |
973 | 0.302931 | 0.325295 | 0.712621 |
987 | 0.345342 | 0.335610 | 0.978525 |
990 | 0.380518 | 0.163035 | 0.786206 |
999 | 0.309788 | 0.290046 | 0.871414 |
# Using pd.eval
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
True
# This filtering expression cannot be written with DataFrame.eval
# But DataFrame.query works
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)
True
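query also supports the @ prefix for local Python variables, just like DataFrame.eval. A small sketch (the threshold variable is mine):
threshold = 0.5
df.query('A < @threshold and B < @threshold')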
DataFrame.index and DataFrame.columns are available in the query namespace
pd.eval is used to evaluate the expression
The result of the evaluation is first passed to DataFrame.loc
If that fails, it is passed to DataFrame.__getitem__()
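A quick sketch of the first point, assuming the default (unnamed) integer index used above: the index can be referenced by name inside the expression.
df.query('index < 5')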
Two considerations when deciding whether to use eval/query:
Memory
Computational time
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.rand(2000, 3), columns=['A', 'B', 'C'])
df.values.nbytes
48000
x = df[(df.A < 0.5) & (df.B < 0.5)]
same as
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]
tmp1.values.nbytes
2000
tmp2.values.nbytes == tmp3.nbytes == tmp1.values.nbytes
True
x.values.nbytes
11592
# Our data
# Following are shell commands to download the data
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
#!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
#!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv
population = pd.read_csv('state-population.csv')
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
population.head()
state/region | ages | year | population | |
---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 |
1 | AL | total | 2012 | 4817528.0 |
2 | AL | under18 | 2010 | 1130966.0 |
3 | AL | total | 2010 | 4785570.0 |
4 | AL | under18 | 2011 | 1125763.0 |
areas.head()
state | area (sq. mi) | |
---|---|---|
0 | Alabama | 52423 |
1 | Alaska | 656425 |
2 | Arizona | 114006 |
3 | Arkansas | 53182 |
4 | California | 163707 |
abbrevs.head()
state | abbreviation | |
---|---|---|
0 | Alabama | AL |
1 | Alaska | AK |
2 | Arizona | AZ |
3 | Arkansas | AR |
4 | California | CA |
# First add full names to population data
# So we can display the full state name
pop_full_name = pd.merge(
population, abbrevs, how='outer',
left_on='state/region', right_on='abbreviation')
pop_full_name = pop_full_name.drop('abbreviation', axis=1)  # drop duplicate info
pop_full_name.head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama |
1 | AL | total | 2012 | 4817528.0 | Alabama |
2 | AL | under18 | 2010 | 1130966.0 | Alabama |
3 | AL | total | 2010 | 4785570.0 | Alabama |
4 | AL | under18 | 2011 | 1125763.0 | Alabama |
# Any NaN or None?
pop_full_name.isnull().any()
state/region    False
ages            False
year            False
population       True
state            True
dtype: bool
# Which population values are missing?
#
# pop_full_name['population'].isnull()
# returns True on rows where population is null, False otherwise
pop_full_name[pop_full_name['population'].isnull()].head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
2448 | PR | under18 | 1990 | NaN | NaN |
2449 | PR | total | 1990 | NaN | NaN |
2450 | PR | total | 1991 | NaN | NaN |
2451 | PR | under18 | 1991 | NaN | NaN |
2452 | PR | total | 1993 | NaN | NaN |
# What about state
pop_full_name[pop_full_name['state'].isnull()].head()
state/region | ages | year | population | state | |
---|---|---|---|---|---|
2448 | PR | under18 | 1990 | NaN | NaN |
2449 | PR | total | 1990 | NaN | NaN |
2450 | PR | total | 1991 | NaN | NaN |
2451 | PR | under18 | 1991 | NaN | NaN |
2452 | PR | total | 1993 | NaN | NaN |
# Is that all? Let's see
# Get all such rows and show only the unique values
# So get the state/region column where state is null
# loc indexing: [row selector, column]
pop_full_name.loc[
pop_full_name['state'].isnull(), 'state/region'].unique()
array(['PR', 'USA'], dtype=object)
# Missing state values for PR (Puerto Rico) and USA
# So add them
# pop_full_name['state/region'] == 'PR' returns a boolean Series selecting those rows
pop_full_name.loc[
pop_full_name['state/region'] == 'PR', 'state'] = 'Puerto Rico'
pop_full_name.loc[
pop_full_name['state/region'] == 'USA', 'state'] = 'United States'
pop_full_name.isnull().any()
# Now adding area of each state
with_areas = pd.merge(pop_full_name, areas, on='state', how='left')
with_areas.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
0 | AL | under18 | 2012 | 1117489.0 | Alabama | 52423.0 |
1 | AL | total | 2012 | 4817528.0 | Alabama | 52423.0 |
2 | AL | under18 | 2010 | 1130966.0 | Alabama | 52423.0 |
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
4 | AL | under18 | 2011 | 1125763.0 | Alabama | 52423.0 |
# Check for nulls
with_areas.isnull().any()
state/region     False
ages             False
year             False
population        True
state             True
area (sq. mi)     True
dtype: bool
# Which states are missing area information
# First find the NaN rows and then get the unique values
with_areas['state'][with_areas['area (sq. mi)'].isnull()].unique()
array([nan], dtype=object)
# Since we do not care about the entire country,
# just remove those rows
with_areas.dropna(inplace=True)
with_areas.isnull().any()
state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool
# Select year 2010 and just total populations
data2010 = with_areas.query("year == 2010 & ages == 'total'")
data2010.head()
state/region | ages | year | population | state | area (sq. mi) | |
---|---|---|---|---|---|---|
3 | AL | total | 2010 | 4785570.0 | Alabama | 52423.0 |
91 | AK | total | 2010 | 713868.0 | Alaska | 656425.0 |
101 | AZ | total | 2010 | 6408790.0 | Arizona | 114006.0 |
189 | AR | total | 2010 | 2922280.0 | Arkansas | 53182.0 |
197 | CA | total | 2010 | 37333601.0 | California | 163707.0 |
# Make the state column the index
data2010.set_index('state', inplace=True)
data2010
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-27-0a126f82f26e> in <module>
      1 # Make the state column the index
      2
----> 3 data2010.set_index('state', inplace=True)
      4 data2010
~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in set_index(self, keys, drop, append, inplace, verify_integrity)
   4723         if missing:
-> 4724             raise KeyError(f"None of {missing} are in the columns")
KeyError: "None of ['state'] are in the columns"
(This error appears when the cell is run a second time: after the first run, 'state' is already the index and no longer a column.)
# compute the density
density = data2010['population']/data2010['area (sq. mi)']
density.to_frame().head()
0 | |
---|---|
state | |
Alabama | 91.287603 |
Alaska | 1.087509 |
Arizona | 56.214497 |
Arkansas | 54.948667 |
California | 228.051342 |
# Sort
density.sort_values(ascending=False, inplace=True)
density.head()
state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64
# If you have ever been in these states, you will believe these numbers
density.tail()
state
South Dakota    10.583512
North Dakota     9.537565
Montana          6.736171
Wyoming          5.768079
Alaska           1.087509
dtype: float64
# Finally find the rank of a particular state
density.index.tolist().index('California')
11
rng = np.random.RandomState(42)
series_data = pd.Series(rng.rand(5))
series_data
0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64
series_data.sum()
2.811925491708157
series_data.mean()
0.5623850983416314
df_data = pd.DataFrame({'A': rng.rand(5),
'B': rng.rand(5)})
df_data
A | B | |
---|---|---|
0 | 0.155995 | 0.020584 |
1 | 0.058084 | 0.969910 |
2 | 0.866176 | 0.832443 |
3 | 0.601115 | 0.212339 |
4 | 0.708073 | 0.181825 |
# On DataFrames, aggregation is computed per column by default
df_data.mean()
A    0.477888
B    0.443420
dtype: float64
# Aggregate each row
df_data.mean(axis='columns')
0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64
Split
Apply
Combine
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data': range(6)}, columns=['key', 'data'])
df
key | data | |
---|---|---|
0 | A | 0 |
1 | B | 1 |
2 | C | 2 |
3 | A | 3 |
4 | B | 4 |
5 | C | 5 |
# No computation is done yet; df is wrapped in a DataFrameGroupBy object
# Computation happens when a function is called on the DataFrameGroupBy object
df.groupby('key')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11be224d0>
# Compute the sum of values for each key
df.groupby('key').sum()
data | |
---|---|
key | |
A | 3 |
B | 5 |
C | 7 |
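A column can also be selected from the DataFrameGroupBy object before aggregating, which restricts the computation to that column (a small sketch using the same df):
df.groupby('key')['data'].sum()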
# Our data
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
The aggregate method allows us to apply multiple functions at once
# Using one function
df.groupby('key').median()
data1 | data2 | |
---|---|---|
key | ||
A | 1.5 | 4.0 |
B | 2.5 | 3.5 |
C | 3.5 | 6.0 |
# Applying three functions at once
df.groupby('key').aggregate(['min', np.median, max])
data1 | data2 | |||||
---|---|---|---|---|---|---|
min | median | max | min | median | max | |
key | ||||||
A | 0 | 1.5 | 3 | 3 | 4.0 | 5 |
B | 1 | 2.5 | 4 | 0 | 3.5 | 7 |
C | 2 | 3.5 | 5 | 3 | 6.0 | 9 |
# Applying different functions on each column
df.groupby('key').aggregate({'data1': 'min',
'data2': 'max'})
data1 | data2 | |
---|---|---|
key | ||
A | 0 | 5 |
B | 1 | 7 |
C | 2 | 9 |
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# We have some large standard deviations
df.groupby('key').std()
data1 | data2 | |
---|---|---|
key | ||
A | 2.12132 | 1.414214 |
B | 2.12132 | 4.949747 |
C | 2.12132 | 4.242641 |
# Show all keys that have standard deviation greater than 4 in data2
def outlier(x):
    return x['data2'].std() > 4
df.groupby('key').filter(outlier)
key | data1 | data2 | |
---|---|---|---|
1 | B | 1 | 0 |
2 | C | 2 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
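The same filter can be written inline with a lambda (a sketch equivalent to the outlier function above):
df.groupby('key').filter(lambda x: x['data2'].std() > 4)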
Transform: change the data within each group, keeping the original shape
# Our data
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# Subtract the group mean from the values in each group, column by column
df.groupby('key').transform(lambda x: x - x.mean())
data1 | data2 | |
---|---|---|
0 | -1.5 | 1.0 |
1 | -1.5 | -3.5 |
2 | -1.5 | -3.0 |
3 | 1.5 | -1.0 |
4 | 1.5 | 3.5 |
5 | 1.5 | 3.0 |
Apply an arbitrary function to each group
The function takes a DataFrame of group values as its argument
It may return a DataFrame, a Series, or a scalar
The combine operation used depends on the return type
# Same dataset
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df
key | data1 | data2 | |
---|---|---|---|
0 | A | 0 | 5 |
1 | B | 1 | 0 |
2 | C | 2 | 3 |
3 | A | 3 | 3 |
4 | B | 4 | 7 |
5 | C | 5 | 9 |
# Divide each value of 'data1' by the sum of 'data2' within the same group
def norm_by_data2(x):
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
df.groupby('key').apply(norm_by_data2)
key | data1 | data2 | |
---|---|---|---|
0 | A | 0.000000 | 5 |
1 | B | 0.142857 | 0 |
2 | C | 0.166667 | 3 |
3 | A | 0.375000 | 3 |
4 | B | 0.571429 | 7 |
5 | C | 0.416667 | 9 |
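When the applied function returns a scalar rather than a DataFrame, the combine step produces a Series indexed by group. A small sketch, not on the original slide:
# one number per key: the sum of 'data1' within each group
df.groupby('key').apply(lambda x: x['data1'].sum())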