R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column summary statistics for the cars data frame (columns speed, dist).
summary(object = cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

##P6.E4##

library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Show the first rows and column types of the mpg tibble.
print(mpg)
## # A tibble: 234 x 11
##    manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4      3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 q~   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 q~   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 q~   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows
# Scatterplot with hwy on the x axis and cyl on the y axis.
ggplot(data = mpg, mapping = aes(x = hwy, y = cyl)) +
  geom_point()

##p12.E3##

# Same scatterplot, with drv mapped to point colour.
ggplot(mpg, aes(x = hwy, y = cyl)) +
  geom_point(aes(colour = drv))

# Same scatterplot, with drv mapped to point size; drv is discrete, so
# ggplot2 emits the warning shown below.
ggplot(mpg, aes(x = hwy, y = cyl)) +
  geom_point(aes(size = drv))
## Warning: Using size for a discrete variable is not advised.

##p20.E1##

##line##
# geom_line() is the geom for a line chart.
ggplot(data = mpg) +
  geom_line(mapping = aes(x = hwy, y = cyl))

##boxplot##
# geom_boxplot() is the geom for a boxplot.
# cyl is mapped to x as a factor so every box gets its own labelled position
# on the axis; a bare `group = cyl` splits the data but leaves x unmapped, so
# the axis cannot tell the reader which box belongs to which cylinder count.
ggplot(mpg, aes(x = factor(cyl), y = hwy)) +
  geom_boxplot()

##histogram##
# geom_histogram() is the geom for a histogram. No binwidth is supplied, so
# stat_bin() falls back to 30 bins and prints the message below.
ggplot(data = mpg) +
  geom_histogram(mapping = aes(x = hwy))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##areachart##
# geom_area() is the geom for an area chart.
ggplot(data = mpg) +
  geom_area(mapping = aes(x = hwy, y = cyl))

##p21.E6##

# Six displ-vs-hwy plots. The smooths use `se = FALSE` to drop the confidence
# ribbon. The previous `show.legend = FALSE` did nothing on smooth layers that
# carry no legend aesthetic, and hid the legend entries on the layers that do.
# NOTE(review): the target figures are assumed to be those of the R4DS
# "Geometric objects" exercise 6 — confirm against the book.

##1##
# Points plus one smooth over all cars.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# One smooth per drv group, all drawn in the same colour.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, group = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##2##
# Colour mapped globally: points and smooths are both coloured by drv.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Colour mapped on the points only: a single smooth over all cars.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

##3##
# Points coloured by drv, smooths distinguished by line type.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(aes(linetype = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Coloured points with a white halo: a larger white point is drawn first and
# the coloured point is layered on top of it.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 4, colour = "white") +
  geom_point(aes(colour = drv))

##p49.E1##

library(nycflights13)
library(dplyr)
# Show the first rows and column types of the flights tibble.
print(flights)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
##a##
# Flights whose arrival delay was two hours (120 minutes) or more.
# The comparison used to be wrapped in `!( ... )`, which selected the opposite
# set: flights with an arrival delay below 120 minutes.
filter(flights, arr_delay >= 120)
# Expected size: 10,200 rows (336,776 flights in total, minus 9,430 with a
# missing arr_delay, minus the 317,146 rows the negated filter returned).
# Printed rows are not reproduced here; re-knit to regenerate the output.
##b##
# Flights whose destination is IAH or HOU, written as two equality tests.
flights %>%
  filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      623            627        -4      933
##  4  2013     1     1      728            732        -4     1041
##  5  2013     1     1      739            739         0     1104
##  6  2013     1     1      908            908         0     1228
##  7  2013     1     1     1028           1026         2     1350
##  8  2013     1     1     1044           1045        -1     1352
##  9  2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Same selection, written as a set-membership test.
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      623            627        -4      933
##  4  2013     1     1      728            732        -4     1041
##  5  2013     1     1      739            739         0     1104
##  6  2013     1     1      908            908         0     1228
##  7  2013     1     1     1028           1026         2     1350
##  8  2013     1     1     1044           1045        -1     1352
##  9  2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
##c##
# Flights operated by carrier UA, AA or DL, written as three equality tests.
flights %>%
  filter(carrier == "UA" | carrier == "AA" | carrier == "DL")
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      554            600        -6      812
##  5  2013     1     1      554            558        -4      740
##  6  2013     1     1      558            600        -2      753
##  7  2013     1     1      558            600        -2      924
##  8  2013     1     1      558            600        -2      923
##  9  2013     1     1      559            600        -1      941
## 10  2013     1     1      559            600        -1      854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Same selection, written as a set-membership test.
flights %>%
  filter(carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      554            600        -6      812
##  5  2013     1     1      554            558        -4      740
##  6  2013     1     1      558            600        -2      753
##  7  2013     1     1      558            600        -2      924
##  8  2013     1     1      558            600        -2      923
##  9  2013     1     1      559            600        -1      941
## 10  2013     1     1      559            600        -1      854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
##d##
# Flights that departed in month 7.
flights %>%
  filter(month == 7)
## # A tibble: 29,425 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 29,415 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Flights that departed in month 8.
flights %>%
  filter(month == 8)
## # A tibble: 29,327 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     8     1       12           2130       162      257
##  2  2013     8     1       12           2359        13      349
##  3  2013     8     1       22           2146       156      255
##  4  2013     8     1       26           2051       215      333
##  5  2013     8     1       32           2359        33      420
##  6  2013     8     1       33           2231       122      412
##  7  2013     8     1       44           2245       119      342
##  8  2013     8     1       46           2359        47      424
##  9  2013     8     1       47           2255       112      154
## 10  2013     8     1       50           2305       105      154
## # ... with 29,317 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Flights that departed in month 9.
flights %>%
  filter(month == 9)
## # A tibble: 27,574 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     9     1        9           2359        10      343
##  2  2013     9     1      117           2245       152      218
##  3  2013     9     1      508            516        -8      717
##  4  2013     9     1      537            545        -8      849
##  5  2013     9     1      537            545        -8      906
##  6  2013     9     1      549            600       -11      815
##  7  2013     9     1      552            600        -8      843
##  8  2013     9     1      553            600        -7      809
##  9  2013     9     1      554            600        -6      700
## 10  2013     9     1      554            600        -6      803
## # ... with 27,564 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Months 7, 8 and 9 in a single set-membership test.
flights %>%
  filter(month %in% c(7, 8, 9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
##e##
# Flights that arrived more than two hours (120 minutes) late even though
# they did not leave late (dep_delay of zero or less).
# The earlier `!(arr_delay > 120 | dep_delay > 120)` kept flights where
# *neither* delay exceeded 120 minutes, which answers a different question.
# NOTE(review): intent taken from the R4DS filter() exercise 1(e) — confirm.
filter(flights, arr_delay > 120, dep_delay <= 0)
# Printed output omitted: the 316,050-row listing recorded earlier belonged to
# the old condition. Re-knit to regenerate the output.
##f##
# Flights that left at least an hour late but made up more than 30 minutes in
# the air, i.e. the arrival delay is over 30 minutes smaller than the
# departure delay.
# The earlier `!(arr_delay <= 60 | dep_delay >= 30)` is equivalent to
# `arr_delay > 60 & dep_delay < 30`: flights that *lost* time in flight.
# NOTE(review): intent taken from the R4DS filter() exercise 1(f) — confirm.
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
# Printed output omitted: the 1,931-row listing recorded earlier belonged to
# the old condition. Re-knit to regenerate the output.
##g##
# Flights that departed between midnight and 6am, inclusive.
# dep_time is an HHMM-style integer whose values run from 1 to 2400 (see
# summary(flights)), so 6am is 600 and midnight is recorded as 2400.
# The earlier `between(dep_time, 1800, 3600)` selected departures from 6pm
# onwards, and its upper bound of 3600 is never reached.
filter(flights, dep_time <= 600 | dep_time == 2400)
# Printed output omitted: the 76,737-row listing recorded earlier belonged to
# the old condition. Re-knit to regenerate the output.

##p50.E2##

# between() replaces the month %in% c(7, 8, 9) test with an inclusive range.
flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Departures between midnight and 6am, written without between().
# dep_time is an HHMM-style integer that tops out at 2400, so the earlier
# `dep_time == 3600` could never match and `dep_time <= 1800` kept every
# departure up to 6pm.
# NOTE(review): assumes this line restates the midnight-to-6am filter — confirm.
filter(flights, dep_time <= 600 | dep_time == 2400)
# Printed output omitted: the 252,164-row listing recorded earlier belonged to
# the old condition. Re-knit to regenerate the output.

##p50.E3##

# Per-column summaries; the NA's rows give the missing-value count per column.
flights %>%
  summary()
##       year          month             day           dep_time   
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400  
##                                                  NA's   :8255  
##  sched_dep_time   dep_delay          arr_time    sched_arr_time
##  Min.   : 106   Min.   : -43.00   Min.   :   1   Min.   :   1  
##  1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124  
##  Median :1359   Median :  -2.00   Median :1535   Median :1556  
##  Mean   :1344   Mean   :  12.64   Mean   :1502   Mean   :1536  
##  3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945  
##  Max.   :2359   Max.   :1301.00   Max.   :2400   Max.   :2359  
##                 NA's   :8255      NA's   :8713                 
##    arr_delay          carrier              flight       tailnum         
##  Min.   : -86.000   Length:336776      Min.   :   1   Length:336776     
##  1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character  
##  Median :  -5.000   Mode  :character   Median :1496   Mode  :character  
##  Mean   :   6.895                      Mean   :1972                     
##  3rd Qu.:  14.000                      3rd Qu.:3465                     
##  Max.   :1272.000                      Max.   :8500                     
##  NA's   :9430                                                           
##     origin              dest              air_time        distance   
##  Length:336776      Length:336776      Min.   : 20.0   Min.   :  17  
##  Class :character   Class :character   1st Qu.: 82.0   1st Qu.: 502  
##  Mode  :character   Mode  :character   Median :129.0   Median : 872  
##                                        Mean   :150.7   Mean   :1040  
##                                        3rd Qu.:192.0   3rd Qu.:1389  
##                                        Max.   :695.0   Max.   :4983  
##                                        NA's   :9430                  
##       hour           minute        time_hour                  
##  Min.   : 1.00   Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 9.00   1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :13.00   Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :13.18   Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:17.00   3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :23.00   Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

##p51.E1##

# Small example column containing one missing value.
tibble(x = c(5, 2, NA))
## # A tibble: 3 x 1
##       x
##   <dbl>
## 1     5
## 2     2
## 3    NA

# Sort the missing value to the top: is.na(x) is TRUE only for the NA row, and
# desc() puts TRUE ahead of FALSE. The second key orders the remaining rows by
# x, giving the row order NA, 2, 5.
# NOTE(review): assumes this section is meant to show the NA-first sort — confirm.
tibble(x = c(5, 2, NA)) %>%
  arrange(desc(is.na(x)), x)

##p51.E2##

# Largest departure delays first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     6    15     1432           1935      1137     1607
##  3  2013     1    10     1121           1635      1126     1239
##  4  2013     9    20     1139           1845      1014     1457
##  5  2013     7    22      845           1600      1005     1044
##  6  2013     4    10     1100           1900       960     1342
##  7  2013     3    17     2321            810       911      135
##  8  2013     6    27      959           1900       899     1236
##  9  2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Smallest (most negative) departure delays first.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    12     7     2040           2123       -43       40
##  2  2013     2     3     2022           2055       -33     2240
##  3  2013    11    10     1408           1440       -32     1549
##  4  2013     1    11     1900           1930       -30     2233
##  5  2013     1    29     1703           1730       -27     1947
##  6  2013     8     9      729            755       -26     1002
##  7  2013    10    23     1907           1932       -25     2143
##  8  2013     3    30     2030           2055       -25     2213
##  9  2013     3     2     1431           1455       -24     1601
## 10  2013     5     5      934            958       -24     1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

##p51.E3##

# Elapsed clock time between departure and arrival, shortest first.
# dep_time and arr_time are HHMM-style integers, so they are converted to
# minutes since midnight before subtracting. Subtracting the raw values is
# only right when both times fall in the same hour: 955 -> 1005 is 10 minutes,
# but 1005 - 955 gives 50. The outer `%% 1440` maps 2400 to 0 and wraps
# arrivals after midnight onto the next day.
# NOTE(review): both clock times are presumably local to their airport, so
# flights that cross time zones are still skewed; air_time avoids that.
flights %>%
  mutate(
    dep_min = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
    arr_min = (arr_time %/% 100 * 60 + arr_time %% 100) %% 1440,
    travel_time = (arr_min - dep_min) %% 1440
  ) %>%
  arrange(travel_time) %>%
  select(arr_time, dep_time, travel_time)
# Printed output omitted: the listing recorded earlier was computed from raw
# HHMM differences. Re-knit to regenerate the output.
# Shortest flights first, ranked by the recorded air_time column.
# Sorting on `arr_time - dep_time` compared raw HHMM integers, so overnight
# flights (for example dep_time 2400, arr_time 54) produced large negative
# differences and were listed first regardless of how long they flew.
arrange(flights, air_time)
# Printed output omitted: the listing recorded earlier came from the raw HHMM
# difference. Re-knit to regenerate the output.

##P51.E4##

# Longest distances first; keep the first five columns plus distance.
flights %>%
  arrange(desc(distance)) %>%
  select(year:sched_dep_time, distance)
## # A tibble: 336,776 x 6
##     year month   day dep_time sched_dep_time distance
##    <int> <int> <int>    <int>          <int>    <dbl>
##  1  2013     1     1      857            900     4983
##  2  2013     1     2      909            900     4983
##  3  2013     1     3      914            900     4983
##  4  2013     1     4      900            900     4983
##  5  2013     1     5      858            900     4983
##  6  2013     1     6     1019            900     4983
##  7  2013     1     7     1042            900     4983
##  8  2013     1     8      901            900     4983
##  9  2013     1     9      641            900     4983
## 10  2013     1    10      859            900     4983
## # ... with 336,766 more rows
# Shortest distances first; keep the first five columns plus distance.
flights %>%
  arrange(distance) %>%
  select(year:sched_dep_time, distance)
## # A tibble: 336,776 x 6
##     year month   day dep_time sched_dep_time distance
##    <int> <int> <int>    <int>          <int>    <dbl>
##  1  2013     7    27       NA            106       17
##  2  2013     1     3     2127           2129       80
##  3  2013     1     4     1240           1200       80
##  4  2013     1     4     1829           1615       80
##  5  2013     1     4     2128           2129       80
##  6  2013     1     5     1155           1200       80
##  7  2013     1     6     2125           2129       80
##  8  2013     1     7     2124           2129       80
##  9  2013     1     8     2127           2130       80
## 10  2013     1     9     2126           2129       80
## # ... with 336,766 more rows

##P58.E4##

# Ten largest departure delays: rank in descending order, keep ranks 1 to 10.
flights %>%
  filter(min_rank(desc(dep_delay)) <= 10)
## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     1    10     1121           1635      1126     1239
##  3  2013    12     5      756           1700       896     1058
##  4  2013     3    17     2321            810       911      135
##  5  2013     4    10     1100           1900       960     1342
##  6  2013     6    15     1432           1935      1137     1607
##  7  2013     6    27      959           1900       899     1236
##  8  2013     7    22      845           1600      1005     1044
##  9  2013     7    22     2257            759       898      121
## 10  2013     9    20     1139           1845      1014     1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# Same ten rows, selected with top_n() weighted by dep_delay.
top_n(flights, n = 10, wt = dep_delay)
## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     9      641            900      1301     1242
##  2  2013     1    10     1121           1635      1126     1239
##  3  2013    12     5      756           1700       896     1058
##  4  2013     3    17     2321            810       911      135
##  5  2013     4    10     1100           1900       960     1342
##  6  2013     6    15     1432           1935      1137     1607
##  7  2013     6    27      959           1900       899     1236
##  8  2013     7    22      845           1600      1005     1044
##  9  2013     7    22     2257            759       898      121
## 10  2013     9    20     1139           1845      1014     1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

##p73.E4##

# Per calendar day: count flights with a missing dep_delay (treated here as
# cancelled), the total number of flights, and the mean departure and arrival
# delays. Then plot the daily cancelled share against both means (black =
# departure, red = arrival).
flights %>%
  group_by(dep_date = lubridate::make_datetime(year, month, day)) %>%
  summarise(
    cancelled = sum(is.na(dep_delay)),
    n = n(),
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ggplot(mapping = aes(x = cancelled / n)) +
  geom_point(aes(y = mean_dep_delay), colour = "black", alpha = 0.5) +
  geom_point(aes(y = mean_arr_delay), colour = "red", alpha = 0.5) +
  labs(y = "mean delay (minutes)")

##P73.E5##

# Mean arrival delay per carrier, computed over late arrivals only
# (arr_delay > 0), worst carrier first.
flights %>%
  filter(arr_delay > 0) %>%
  group_by(carrier) %>%
  summarise(
    average_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(average_arr_delay))
## # A tibble: 16 x 2
##    carrier average_arr_delay
##    <chr>               <dbl>
##  1 OO                   60.6
##  2 YV                   51.1
##  3 9E                   49.3
##  4 EV                   48.3
##  5 F9                   47.6
##  6 VX                   43.8
##  7 FL                   41.1
##  8 WN                   40.7
##  9 B6                   40.0
## 10 AA                   38.3
## 11 MQ                   37.9
## 12 DL                   37.7
## 13 UA                   36.7
## 14 HA                   35.0
## 15 AS                   34.4
## 16 US                   29.0
# Number of distinct carriers, origins and destinations in the data.
summarise(
  flights,
  n_distinct(carrier),
  n_distinct(origin),
  n_distinct(dest)
)
## # A tibble: 1 x 3
##   `n_distinct(carrier)` `n_distinct(origin)` `n_distinct(dest)`
##                   <int>                <int>              <int>
## 1                    16                    3                105

##p75.E2##

# Per tail number: share of flights arriving no more than 30 minutes late
# (flights with a missing arr_delay count as not on time), mean arrival delay,
# and number of flights. Worst on-time share first; ties are ordered by the
# larger mean delay.
flights %>%
  group_by(tailnum) %>%
  summarise(
    prop_on_time = sum(!is.na(arr_delay) & arr_delay <= 30) / n(),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),
    flights = n()
  ) %>%
  arrange(prop_on_time, desc(mean_arr_delay))
## # A tibble: 4,044 x 4
##    tailnum prop_on_time mean_arr_delay flights
##    <chr>          <dbl>          <dbl>   <int>
##  1 N844MH             0            320       1
##  2 N911DA             0            294       1
##  3 N922EV             0            276       1
##  4 N587NW             0            264       1
##  5 N851NW             0            219       1
##  6 N928DN             0            201       1
##  7 N7715E             0            188       1
##  8 N654UA             0            185       1
##  9 N427SW             0            157       1
## 10 N136DL             0            146       1
## # ... with 4,034 more rows
# Tail numbers for which every flight has a missing arr_delay, with the
# number of such flights, most frequent first.
flights %>%
  group_by(tailnum) %>%
  filter(all(is.na(arr_delay))) %>%
  tally(sort = TRUE)
## # A tibble: 7 x 2
##   tailnum     n
##   <chr>   <int>
## 1 <NA>     2512
## 2 N347SW      1
## 3 N728SK      1
## 4 N768SK      1
## 5 N862DA      1
## 6 N865DA      1
## 7 N939DN      1

##p75.E3##

# Flights per scheduled departure hour; each bar is split by whether the
# flight arrived more than 5 minutes late or has a missing arr_delay.
ggplot(
  data = flights,
  mapping = aes(x = factor(hour), fill = arr_delay > 5 | is.na(arr_delay))
) +
  geom_bar()

##p75.E4##

# Build a scheduled-departure timestamp, order the flights by it, and within
# each origin pair every flight's dep_delay with that of the flight before it.
# The scatterplot shows previous-flight delay against current delay.
flights %>%
  mutate(
    new_sched_dep_time =
      lubridate::make_datetime(year, month, day, hour, minute)
  ) %>%
  group_by(origin) %>%
  arrange(new_sched_dep_time) %>%
  mutate(prev_flight_dep_delay = dplyr::lag(dep_delay)) %>%
  ggplot(mapping = aes(x = prev_flight_dep_delay, y = dep_delay)) +
  geom_point()
## Warning: Removed 14383 rows containing missing values (geom_point).