library(reticulate)
use_python("/anaconda3/bin/python3.7")
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.switch_backend('agg') # Very Important in R Markdown with Rocker/VBox
mpg = pd.read_csv("mpg.csv")
sns.scatterplot(x='displ', y='hwy', data=mpg);
sns.scatterplot(x='displ', y='hwy', hue='class', data=mpg);
sns.scatterplot(x='displ', y='hwy', size='class', data=mpg);
sns.scatterplot(x='displ', y='hwy', style='class', data=mpg);
g = sns.FacetGrid(mpg, col='class', col_wrap=4)
g.map(plt.scatter, 'displ', 'hwy').add_legend()
## <seaborn.axisgrid.FacetGrid object at 0x1a2331d5c0>
plt.show()
g = sns.FacetGrid(mpg, col='cyl', row='drv')
g.map(plt.scatter, 'displ', 'hwy').add_legend()
## <seaborn.axisgrid.FacetGrid object at 0x10dc09710>
plt.show()
plt.hist(mpg['cyl']);
## (array([81., 0., 4., 0., 0., 79., 0., 0., 0., 70.]), array([4. , 4.4, 4.8, 5.2, 5.6, 6. , 6.4, 6.8, 7.2, 7.6, 8. ]), <a list of 10 Patch objects>)
plt.show()
plt.hist(mpg['hwy'], alpha=0.3)
## (array([17., 48., 26., 27., 61., 40., 7., 5., 0., 3.]), array([12. , 15.2, 18.4, 21.6, 24.8, 28. , 31.2, 34.4, 37.6, 40.8, 44. ]), <a list of 10 Patch objects>)
plt.hist(mpg['cty'], alpha=0.3)
## (array([25., 48., 43., 62., 34., 12., 5., 3., 0., 2.]), array([ 9. , 11.6, 14.2, 16.8, 19.4, 22. , 24.6, 27.2, 29.8, 32.4, 35. ]), <a list of 10 Patch objects>)
plt.show()
sns.pairplot(mpg[['cty', 'drv']], hue='drv', diag_kind='hist', height=5, palette='Set2')
## <seaborn.axisgrid.PairGrid object at 0x1a26515828>
plt.show()
sns.countplot(x='class', data=mpg)
plt.show()
ax = sns.barplot(x='class', y='year', data=mpg, estimator=lambda x: len(x) / len(mpg))
plt.show()
mpg.groupby(['class', 'drv']).size().unstack().plot(kind='bar', stacked=True, figsize=(15, 7))
dft = mpg.groupby(['class', 'drv']).size().unstack().fillna(0)
total = dft['4'] + dft['r'] + dft['f']
dft['4'] = round(dft['4'] / total * 100, 0)
dft['r'] = round(dft['r'] / total * 100, 0)
dft['f'] = 100 - dft['4'] - dft['r']
dft[['4', 'r', 'f']].plot(kind='bar', stacked=True, figsize=(14, 7))
mpg.groupby(['class', 'drv']).size().unstack().plot(kind='bar')
sns.boxplot(x='class', y='hwy', data=mpg)
sns.boxplot(x='hwy', y='class', data=mpg, orient='h')
sns.boxplot(x='class', y="hwy", hue='drv', data=mpg)
sns.boxplot(x='class', y='hwy', data=mpg)
sns.swarmplot(x='class', y='hwy', data=mpg, color='.25')
## <matplotlib.axes._subplots.AxesSubplot object at 0x1a2af902e8>
plt.show()
sns.catplot(x='drv', y='hwy',
col='class',
data=mpg, kind='box',
height=4, col_wrap=3, aspect=0.9);
## <seaborn.axisgrid.FacetGrid object at 0x1a2a0bf780>
plt.show()
flights = pd.read_csv('flights.csv')
flights.loc[(flights['month'] == 1) & (flights['day'] == 1)]
## Unnamed: 0 year month day ... distance hour minute time_hour
## 0 1 2013 1 1 ... 1400 5 15 2013-01-01 05:00:00
## 1 2 2013 1 1 ... 1416 5 29 2013-01-01 05:00:00
## 2 3 2013 1 1 ... 1089 5 40 2013-01-01 05:00:00
## 3 4 2013 1 1 ... 1576 5 45 2013-01-01 05:00:00
## 4 5 2013 1 1 ... 762 6 0 2013-01-01 06:00:00
## .. ... ... ... ... ... ... ... ... ...
## 837 838 2013 1 1 ... 1576 23 59 2013-01-01 23:00:00
## 838 839 2013 1 1 ... 416 16 30 2013-01-01 16:00:00
## 839 840 2013 1 1 ... 1389 19 35 2013-01-01 19:00:00
## 840 841 2013 1 1 ... 1096 15 0 2013-01-01 15:00:00
## 841 842 2013 1 1 ... 1069 6 0 2013-01-01 06:00:00
##
## [842 rows x 20 columns]
flights.loc[flights['month'].isin([11, 12])]
## Unnamed: 0 year month ... hour minute time_hour
## 55893 55894 2013 11 ... 23 59 2013-11-01 23:00:00
## 55894 55895 2013 11 ... 22 50 2013-11-01 22:00:00
## 55895 55896 2013 11 ... 5 0 2013-11-01 05:00:00
## 55896 55897 2013 11 ... 5 45 2013-11-01 05:00:00
## 55897 55898 2013 11 ... 5 45 2013-11-01 05:00:00
## ... ... ... ... ... ... ... ...
## 111291 111292 2013 12 ... 7 5 2013-12-31 07:00:00
## 111292 111293 2013 12 ... 8 25 2013-12-31 08:00:00
## 111293 111294 2013 12 ... 16 15 2013-12-31 16:00:00
## 111294 111295 2013 12 ... 6 0 2013-12-31 06:00:00
## 111295 111296 2013 12 ... 8 30 2013-12-31 08:00:00
##
## [55403 rows x 20 columns]
flights.loc[~((flights['arr_delay'] > 120) | (flights['dep_delay'] > 120))]
## Unnamed: 0 year month ... hour minute time_hour
## 0 1 2013 1 ... 5 15 2013-01-01 05:00:00
## 1 2 2013 1 ... 5 29 2013-01-01 05:00:00
## 2 3 2013 1 ... 5 40 2013-01-01 05:00:00
## 3 4 2013 1 ... 5 45 2013-01-01 05:00:00
## 4 5 2013 1 ... 6 0 2013-01-01 06:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 2013 9 ... 14 55 2013-09-30 14:00:00
## 336772 336773 2013 9 ... 22 0 2013-09-30 22:00:00
## 336773 336774 2013 9 ... 12 10 2013-09-30 12:00:00
## 336774 336775 2013 9 ... 11 59 2013-09-30 11:00:00
## 336775 336776 2013 9 ... 8 40 2013-09-30 08:00:00
##
## [325354 rows x 20 columns]
flights.loc[flights['dep_time'].isnull()]
## Unnamed: 0 year month ... hour minute time_hour
## 838 839 2013 1 ... 16 30 2013-01-01 16:00:00
## 839 840 2013 1 ... 19 35 2013-01-01 19:00:00
## 840 841 2013 1 ... 15 0 2013-01-01 15:00:00
## 841 842 2013 1 ... 6 0 2013-01-01 06:00:00
## 1777 1778 2013 1 ... 15 40 2013-01-02 15:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 2013 9 ... 14 55 2013-09-30 14:00:00
## 336772 336773 2013 9 ... 22 0 2013-09-30 22:00:00
## 336773 336774 2013 9 ... 12 10 2013-09-30 12:00:00
## 336774 336775 2013 9 ... 11 59 2013-09-30 11:00:00
## 336775 336776 2013 9 ... 8 40 2013-09-30 08:00:00
##
## [8255 rows x 20 columns]
flights.loc[flights['distance'].between(700, 1000)]
## Unnamed: 0 year month ... hour minute time_hour
## 4 5 2013 1 ... 6 0 2013-01-01 06:00:00
## 5 6 2013 1 ... 5 58 2013-01-01 05:00:00
## 8 9 2013 1 ... 6 0 2013-01-01 06:00:00
## 9 10 2013 1 ... 6 0 2013-01-01 06:00:00
## 18 19 2013 1 ... 6 0 2013-01-01 06:00:00
## ... ... ... ... ... ... ... ...
## 336759 336760 2013 9 ... 21 40 2013-09-30 21:00:00
## 336760 336761 2013 9 ... 20 59 2013-09-30 20:00:00
## 336763 336764 2013 9 ... 20 1 2013-09-30 20:00:00
## 336770 336771 2013 9 ... 18 42 2013-09-30 18:00:00
## 336773 336774 2013 9 ... 12 10 2013-09-30 12:00:00
##
## [74683 rows x 20 columns]
flights.sort_values(by=['year', 'month', 'day'])
## Unnamed: 0 year month ... hour minute time_hour
## 0 1 2013 1 ... 5 15 2013-01-01 05:00:00
## 1 2 2013 1 ... 5 29 2013-01-01 05:00:00
## 2 3 2013 1 ... 5 40 2013-01-01 05:00:00
## 3 4 2013 1 ... 5 45 2013-01-01 05:00:00
## 4 5 2013 1 ... 6 0 2013-01-01 06:00:00
## ... ... ... ... ... ... ... ...
## 111291 111292 2013 12 ... 7 5 2013-12-31 07:00:00
## 111292 111293 2013 12 ... 8 25 2013-12-31 08:00:00
## 111293 111294 2013 12 ... 16 15 2013-12-31 16:00:00
## 111294 111295 2013 12 ... 6 0 2013-12-31 06:00:00
## 111295 111296 2013 12 ... 8 30 2013-12-31 08:00:00
##
## [336776 rows x 20 columns]
flights.sort_values(by=['dep_delay'], ascending=False)
## Unnamed: 0 year month ... hour minute time_hour
## 7072 7073 2013 1 ... 9 0 2013-01-09 09:00:00
## 235778 235779 2013 6 ... 19 35 2013-06-15 19:00:00
## 8239 8240 2013 1 ... 16 35 2013-01-10 16:00:00
## 327043 327044 2013 9 ... 18 45 2013-09-20 18:00:00
## 270376 270377 2013 7 ... 16 0 2013-07-22 16:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 2013 9 ... 14 55 2013-09-30 14:00:00
## 336772 336773 2013 9 ... 22 0 2013-09-30 22:00:00
## 336773 336774 2013 9 ... 12 10 2013-09-30 12:00:00
## 336774 336775 2013 9 ... 11 59 2013-09-30 11:00:00
## 336775 336776 2013 9 ... 8 40 2013-09-30 08:00:00
##
## [336776 rows x 20 columns]
flights.sort_values(by=['dep_delay', 'arr_delay'], ascending=[False, True])
## Unnamed: 0 year month ... hour minute time_hour
## 7072 7073 2013 1 ... 9 0 2013-01-09 09:00:00
## 235778 235779 2013 6 ... 19 35 2013-06-15 19:00:00
## 8239 8240 2013 1 ... 16 35 2013-01-10 16:00:00
## 327043 327044 2013 9 ... 18 45 2013-09-20 18:00:00
## 270376 270377 2013 7 ... 16 0 2013-07-22 16:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 2013 9 ... 14 55 2013-09-30 14:00:00
## 336772 336773 2013 9 ... 22 0 2013-09-30 22:00:00
## 336773 336774 2013 9 ... 12 10 2013-09-30 12:00:00
## 336774 336775 2013 9 ... 11 59 2013-09-30 11:00:00
## 336775 336776 2013 9 ... 8 40 2013-09-30 08:00:00
##
## [336776 rows x 20 columns]
flights.sort_values(by=['dep_time'], ascending=False, na_position='first')
## Unnamed: 0 year month ... hour minute time_hour
## 838 839 2013 1 ... 16 30 2013-01-01 16:00:00
## 839 840 2013 1 ... 19 35 2013-01-01 19:00:00
## 840 841 2013 1 ... 15 0 2013-01-01 15:00:00
## 841 842 2013 1 ... 6 0 2013-01-01 06:00:00
## 1777 1778 2013 1 ... 15 40 2013-01-02 15:00:00
## ... ... ... ... ... ... ... ...
## 151984 151985 2013 3 ... 21 28 2013-03-18 21:00:00
## 152965 152966 2013 3 ... 22 50 2013-03-19 22:00:00
## 26076 26077 2013 1 ... 21 0 2013-01-31 21:00:00
## 215892 215893 2013 5 ... 23 59 2013-05-25 23:00:00
## 131559 131560 2013 2 ... 22 45 2013-02-24 22:00:00
##
## [336776 rows x 20 columns]
flights.columns.values.tolist()
## ['Unnamed: 0', 'year', 'month', 'day', 'dep_time', 'sched_dep_time', 'dep_delay', 'arr_time', 'sched_arr_time', 'arr_delay', 'carrier', 'flight', 'tailnum', 'origin', 'dest', 'air_time', 'distance', 'hour', 'minute', 'time_hour']
flights.loc[:, ['year', 'month', 'day']]
## year month day
## 0 2013 1 1
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## ... ... ... ...
## 336771 2013 9 30
## 336772 2013 9 30
## 336773 2013 9 30
## 336774 2013 9 30
## 336775 2013 9 30
##
## [336776 rows x 3 columns]
flights.loc[:, 'year':'day']
## year month day
## 0 2013 1 1
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## ... ... ... ...
## 336771 2013 9 30
## 336772 2013 9 30
## 336773 2013 9 30
## 336774 2013 9 30
## 336775 2013 9 30
##
## [336776 rows x 3 columns]
flights.loc[:, ~flights.columns.isin(['year', 'day'])]
## Unnamed: 0 month dep_time ... hour minute time_hour
## 0 1 1 517.0 ... 5 15 2013-01-01 05:00:00
## 1 2 1 533.0 ... 5 29 2013-01-01 05:00:00
## 2 3 1 542.0 ... 5 40 2013-01-01 05:00:00
## 3 4 1 544.0 ... 5 45 2013-01-01 05:00:00
## 4 5 1 554.0 ... 6 0 2013-01-01 06:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 9 NaN ... 14 55 2013-09-30 14:00:00
## 336772 336773 9 NaN ... 22 0 2013-09-30 22:00:00
## 336773 336774 9 NaN ... 12 10 2013-09-30 12:00:00
## 336774 336775 9 NaN ... 11 59 2013-09-30 11:00:00
## 336775 336776 9 NaN ... 8 40 2013-09-30 08:00:00
##
## [336776 rows x 18 columns]
flights.iloc[:, [1,3]]
## year day
## 0 2013 1
## 1 2013 1
## 2 2013 1
## 3 2013 1
## 4 2013 1
## ... ... ...
## 336771 2013 30
## 336772 2013 30
## 336773 2013 30
## 336774 2013 30
## 336775 2013 30
##
## [336776 rows x 2 columns]
flights.iloc[:, 1:4]
## year month day
## 0 2013 1 1
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## ... ... ... ...
## 336771 2013 9 30
## 336772 2013 9 30
## 336773 2013 9 30
## 336774 2013 9 30
## 336775 2013 9 30
##
## [336776 rows x 3 columns]
flights.iloc[:, list(set(range(len(flights.columns))) - set([1, 3]))]
## Unnamed: 0 month dep_time ... hour minute time_hour
## 0 1 1 517.0 ... 5 15 2013-01-01 05:00:00
## 1 2 1 533.0 ... 5 29 2013-01-01 05:00:00
## 2 3 1 542.0 ... 5 40 2013-01-01 05:00:00
## 3 4 1 544.0 ... 5 45 2013-01-01 05:00:00
## 4 5 1 554.0 ... 6 0 2013-01-01 06:00:00
## ... ... ... ... ... ... ... ...
## 336771 336772 9 NaN ... 14 55 2013-09-30 14:00:00
## 336772 336773 9 NaN ... 22 0 2013-09-30 22:00:00
## 336773 336774 9 NaN ... 12 10 2013-09-30 12:00:00
## 336774 336775 9 NaN ... 11 59 2013-09-30 11:00:00
## 336775 336776 9 NaN ... 8 40 2013-09-30 08:00:00
##
## [336776 rows x 18 columns]
flights.iloc[:, list(range(1, 4)) + list(range(5, 8))]
## year month day sched_dep_time dep_delay arr_time
## 0 2013 1 1 515 2.0 830.0
## 1 2013 1 1 529 4.0 850.0
## 2 2013 1 1 540 2.0 923.0
## 3 2013 1 1 545 -1.0 1004.0
## 4 2013 1 1 600 -6.0 812.0
## ... ... ... ... ... ... ...
## 336771 2013 9 30 1455 NaN NaN
## 336772 2013 9 30 2200 NaN NaN
## 336773 2013 9 30 1210 NaN NaN
## 336774 2013 9 30 1159 NaN NaN
## 336775 2013 9 30 840 NaN NaN
##
## [336776 rows x 6 columns]
flights.loc[:, flights.columns.str.contains('time')]
## dep_time sched_dep_time ... air_time time_hour
## 0 517.0 515 ... 227.0 2013-01-01 05:00:00
## 1 533.0 529 ... 227.0 2013-01-01 05:00:00
## 2 542.0 540 ... 160.0 2013-01-01 05:00:00
## 3 544.0 545 ... 183.0 2013-01-01 05:00:00
## 4 554.0 600 ... 116.0 2013-01-01 06:00:00
## ... ... ... ... ... ...
## 336771 NaN 1455 ... NaN 2013-09-30 14:00:00
## 336772 NaN 2200 ... NaN 2013-09-30 22:00:00
## 336773 NaN 1210 ... NaN 2013-09-30 12:00:00
## 336774 NaN 1159 ... NaN 2013-09-30 11:00:00
## 336775 NaN 840 ... NaN 2013-09-30 08:00:00
##
## [336776 rows x 6 columns]
Some helper functions:
flights.assign(gain = flights['dep_delay'] - flights['arr_delay'],
speed = flights['dep_delay'] - flights['arr_delay'])
## Unnamed: 0 year month day ... minute time_hour gain speed
## 0 1 2013 1 1 ... 15 2013-01-01 05:00:00 -9.0 -9.0
## 1 2 2013 1 1 ... 29 2013-01-01 05:00:00 -16.0 -16.0
## 2 3 2013 1 1 ... 40 2013-01-01 05:00:00 -31.0 -31.0
## 3 4 2013 1 1 ... 45 2013-01-01 05:00:00 17.0 17.0
## 4 5 2013 1 1 ... 0 2013-01-01 06:00:00 19.0 19.0
## ... ... ... ... ... ... ... ... ... ...
## 336771 336772 2013 9 30 ... 55 2013-09-30 14:00:00 NaN NaN
## 336772 336773 2013 9 30 ... 0 2013-09-30 22:00:00 NaN NaN
## 336773 336774 2013 9 30 ... 10 2013-09-30 12:00:00 NaN NaN
## 336774 336775 2013 9 30 ... 59 2013-09-30 11:00:00 NaN NaN
## 336775 336776 2013 9 30 ... 40 2013-09-30 08:00:00 NaN NaN
##
## [336776 rows x 22 columns]
pd.DataFrame({'gain': flights['dep_delay'] - flights['arr_delay'],
'hours': flights['air_time'] / 60,
'gain_per_hour': (flights['dep_delay'] - flights['arr_delay']) / (flights['air_time'] / 60)
})
## gain hours gain_per_hour
## 0 -9.0 3.783333 -2.378855
## 1 -16.0 3.783333 -4.229075
## 2 -31.0 2.666667 -11.625000
## 3 17.0 3.050000 5.573770
## 4 19.0 1.933333 9.827586
## ... ... ... ...
## 336771 NaN NaN NaN
## 336772 NaN NaN NaN
## 336773 NaN NaN NaN
## 336774 NaN NaN NaN
## 336775 NaN NaN NaN
##
## [336776 rows x 3 columns]
Functions for creating new variables:
flights.groupby('dest')['distance'].mean()
## dest
## ABQ 1826.000000
## ACK 199.000000
## ALB 143.000000
## ANC 3370.000000
## ATL 757.108220
## ...
## TPA 1003.935575
## TUL 1215.000000
## TVC 652.386139
## TYS 638.809826
## XNA 1142.505792
## Name: distance, Length: 105, dtype: float64
flights.groupby(
['dest']
).agg(
{
'dest': 'count',
'distance': 'mean',
'arr_delay': 'mean'
})
## dest distance arr_delay
## dest
## ABQ 254 1826.000000 4.381890
## ACK 265 199.000000 4.852273
## ALB 439 143.000000 14.397129
## ANC 8 3370.000000 -2.500000
## ATL 17215 757.108220 11.300113
## ... ... ... ...
## TPA 7466 1003.935575 7.408525
## TUL 315 1215.000000 33.659864
## TVC 101 652.386139 12.968421
## TYS 631 638.809826 24.069204
## XNA 1036 1142.505792 7.465726
##
## [105 rows x 3 columns]
flights.groupby(
['year', 'month', 'day']
).agg(
{
'dep_delay': 'mean'
})
## dep_delay
## year month day
## 2013 1 1 11.548926
## 2 13.858824
## 3 10.987832
## 4 8.951595
## 5 5.732218
## ... ...
## 12 27 10.937630
## 28 7.981550
## 29 22.309551
## 30 10.698113
## 31 6.996053
##
## [365 rows x 1 columns]
Some helper functions can be used: