#Homework1 ##by Chantsal 108035487
##p6.E4##
library(tidyverse)
## -- Attaching packages ---------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Show the fuel-economy data set that ships with ggplot2.
ggplot2::mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 q~ 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 q~ 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 q~ 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
# Scatterplot with highway mileage on x and cylinder count on y.
mpg %>%
  ggplot(mapping = aes(x = hwy, y = cyl)) +
  geom_point()
##p12.E3##
# Same scatterplot twice, with the continuous variable displ mapped first to
# point colour and then to point size.
mpg %>%
  ggplot(aes(x = hwy, y = cyl)) +
  geom_point(aes(colour = displ))
mpg %>%
  ggplot(aes(x = hwy, y = cyl)) +
  geom_point(aes(size = displ))
##p20.E1##
##line##
# Line chart of cylinder count against highway mileage.
# aes() is called without the stray trailing comma so the call does not rely
# on ggplot2 silently discarding an empty argument.
ggplot(mpg, aes(x = hwy, y = cyl)) +
  geom_line()
##boxplot##
# One box of highway mileage per cylinder count.
# cyl is mapped to x as well as group: without an x position every box is
# drawn around the same unlabeled location and cannot be matched to its
# cylinder count.
ggplot(mpg, aes(x = cyl, y = hwy, group = cyl)) +
  geom_boxplot()
##histogram##
# Distribution of highway mileage; the default binning is kept on purpose,
# which is why stat_bin() prints its bins = 30 message.
mpg %>%
  ggplot(mapping = aes(hwy)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##areachart##
# Area chart of cylinder count over highway mileage.
mpg %>%
  ggplot(mapping = aes(hwy, cyl)) +
  geom_area()
##p21.E6##
##1##
# Points plus a single smoother fitted through all observations.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Grouping by drv in the plot-level mapping gives one smoother per drive
# type while the points stay visually uniform.
mpg %>%
  ggplot(aes(displ, hwy, group = drv)) +
  geom_point() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##2##
# Colour is mapped to drv at plot level, so both points and smoothers are
# coloured per drive type; the smooth layer is kept out of the legend.
mpg %>%
  ggplot(aes(displ, hwy, color = drv)) +
  geom_point() +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Colour is mapped only in the point layer, so the points are coloured by drv
# while a single smoother is fitted to all observations.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point(mapping = aes(colour = drv)) +
  geom_smooth(show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
##3##
# Points coloured by drv; one smoother per drv distinguished by line type.
# The smooth layer is excluded from the legend.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point(mapping = aes(colour = drv)) +
  geom_smooth(mapping = aes(linetype = drv), show.legend = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Points only, coloured by drv, drawn with fixed size 2 and stroke 2.
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point(mapping = aes(color = drv), stroke = 2, size = 2)
##p49.E1##
library(nycflights13)
library(dplyr)
# Show the flights table from nycflights13.
nycflights13::flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##a##
# Flights that arrived two hours (120 minutes) or more late.
# The test is applied directly; wrapping it in !( ) would select the
# opposite set (arrival delay under two hours).
# NOTE(review): the pasted output below was produced by the negated form of
# this filter and must be regenerated.
filter(flights, arr_delay >= 120)
## # A tibble: 317,146 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 317,136 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##b##
# Flights whose destination code is IAH or HOU, written with explicit ORs.
flights %>%
  filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same destination filter expressed as set membership.
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##c##
# Flights operated by carrier code UA, AA or DL, written with explicit ORs.
flights %>%
  filter(
    carrier == "UA" |
      carrier == "AA" |
      carrier == "DL"
  )
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 600 -6 812
## 5 2013 1 1 554 558 -4 740
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 600 -1 941
## 10 2013 1 1 559 600 -1 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Same carrier filter expressed as set membership.
flights %>%
  filter(carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 600 -6 812
## 5 2013 1 1 554 558 -4 740
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 558 600 -2 923
## 9 2013 1 1 559 600 -1 941
## 10 2013 1 1 559 600 -1 854
## # ... with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##d##
# Flights with month equal to 7.
flights %>%
  filter(month == 7)
## # A tibble: 29,425 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 29,415 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights with month equal to 8.
flights %>%
  filter(month == 8)
## # A tibble: 29,327 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 8 1 12 2130 162 257
## 2 2013 8 1 12 2359 13 349
## 3 2013 8 1 22 2146 156 255
## 4 2013 8 1 26 2051 215 333
## 5 2013 8 1 32 2359 33 420
## 6 2013 8 1 33 2231 122 412
## 7 2013 8 1 44 2245 119 342
## 8 2013 8 1 46 2359 47 424
## 9 2013 8 1 47 2255 112 154
## 10 2013 8 1 50 2305 105 154
## # ... with 29,317 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights with month equal to 9.
flights %>%
  filter(month == 9)
## # A tibble: 27,574 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 9 1 9 2359 10 343
## 2 2013 9 1 117 2245 152 218
## 3 2013 9 1 508 516 -8 717
## 4 2013 9 1 537 545 -8 849
## 5 2013 9 1 537 545 -8 906
## 6 2013 9 1 549 600 -11 815
## 7 2013 9 1 552 600 -8 843
## 8 2013 9 1 553 600 -7 809
## 9 2013 9 1 554 600 -6 700
## 10 2013 9 1 554 600 -6 803
## # ... with 27,564 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights from months 7, 8 and 9 in a single membership test.
flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##e##
# Flights that arrived more than two hours late although they did not leave
# late (departure delay of zero or less).
# NOTE(review): the pasted output below was produced by a different predicate
# (neither delay above 120 minutes) and must be regenerated.
filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 316,050 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 316,040 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##f##
# Flights that left at least 60 minutes late but made up more than 30 minutes
# en route, i.e. the arrival delay is over 30 minutes smaller than the
# departure delay.
# NOTE(review): the pasted output below was produced by a different predicate
# (arr_delay > 60 and dep_delay < 30) and must be regenerated.
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,931 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 1342 1320 22 1617
## 2 2013 1 1 1558 1534 24 1808
## 3 2013 1 1 1751 1745 6 2015
## 4 2013 1 2 841 845 -4 1134
## 5 2013 1 2 928 905 23 1331
## 6 2013 1 2 1558 1600 -2 1923
## 7 2013 1 6 654 655 -1 1025
## 8 2013 1 6 906 904 2 1313
## 9 2013 1 6 1932 1910 22 2318
## 10 2013 1 7 1348 1350 -2 1625
## # ... with 1,921 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##g##
# Flights that departed between midnight and 6am inclusive.
# dep_time is a clock reading whose largest value in this data is 2400, so
# midnight has to be matched separately from the 0-600 range.
# NOTE(review): the pasted output below was produced with the bounds
# 1800-3600 (evening departures) and must be regenerated.
filter(flights, between(dep_time, 0, 600) | dep_time == 2400)
## # A tibble: 76,737 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 1800 1800 0 1945
## 2 2013 1 1 1800 1800 0 1951
## 3 2013 1 1 1802 1805 -3 1930
## 4 2013 1 1 1802 1801 1 2125
## 5 2013 1 1 1803 1726 37 2011
## 6 2013 1 1 1803 1620 103 2008
## 7 2013 1 1 1803 1800 3 2021
## 8 2013 1 1 1805 1757 8 2117
## 9 2013 1 1 1806 1810 -4 2002
## 10 2013 1 1 1807 1738 29 2251
## # ... with 76,727 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##p50.E2##
# between() is inclusive on both ends, so this keeps months 7 through 9.
flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # ... with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Departures from midnight through 6am written with plain comparisons.
# The largest dep_time in this data is 2400, so a comparison against 3600
# could never match; 2400 is the value that represents midnight.
# NOTE(review): the pasted output below was produced with the constants 1800
# and 3600 and must be regenerated.
filter(flights, dep_time <= 600 | dep_time == 2400)
## # A tibble: 252,164 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 252,154 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##p50.E3##
# Per-column summary; the NA's rows show how many values are missing.
flights %>%
  summary()
## year month day dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907
## Median :2013 Median : 7.000 Median :16.00 Median :1401
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400
## NA's :8255
## sched_dep_time dep_delay arr_time sched_arr_time
## Min. : 106 Min. : -43.00 Min. : 1 Min. : 1
## 1st Qu.: 906 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124
## Median :1359 Median : -2.00 Median :1535 Median :1556
## Mean :1344 Mean : 12.64 Mean :1502 Mean :1536
## 3rd Qu.:1729 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945
## Max. :2359 Max. :1301.00 Max. :2400 Max. :2359
## NA's :8255 NA's :8713
## arr_delay carrier flight tailnum
## Min. : -86.000 Length:336776 Min. : 1 Length:336776
## 1st Qu.: -17.000 Class :character 1st Qu.: 553 Class :character
## Median : -5.000 Mode :character Median :1496 Mode :character
## Mean : 6.895 Mean :1972
## 3rd Qu.: 14.000 3rd Qu.:3465
## Max. :1272.000 Max. :8500
## NA's :9430
## origin dest air_time distance
## Length:336776 Length:336776 Min. : 20.0 Min. : 17
## Class :character Class :character 1st Qu.: 82.0 1st Qu.: 502
## Mode :character Mode :character Median :129.0 Median : 872
## Mean :150.7 Mean :1040
## 3rd Qu.:192.0 3rd Qu.:1389
## Max. :695.0 Max. :4983
## NA's :9430
## hour minute time_hour
## Min. : 1.00 Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 9.00 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :13.00 Median :29.00 Median :2013-07-03 10:00:00
## Mean :13.18 Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:17.00 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :23.00 Max. :59.00 Max. :2013-12-31 23:00:00
##
##p51.E1##
# arrange() puts missing values last, even under desc(). Sorting on the
# is.na() flag in descending order first moves the NA rows to the top; x is
# the second key so the remaining rows are in ascending order.
tibble(x = c(5, 2, NA)) %>%
  arrange(desc(is.na(x)), x)
## # A tibble: 3 x 1
## x
## <dbl>
## 1 NA
## 2 2
## 3 5
##p51.E2##
# Largest departure delays first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Smallest (most negative) departure delays first, i.e. flights that left
# furthest ahead of schedule.
flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43 40
## 2 2013 2 3 2022 2055 -33 2240
## 3 2013 11 10 1408 1440 -32 1549
## 4 2013 1 11 1900 1930 -30 2233
## 5 2013 1 29 1703 1730 -27 1947
## 6 2013 8 9 729 755 -26 1002
## 7 2013 10 23 1907 1932 -25 2143
## 8 2013 3 30 2030 2055 -25 2213
## 9 2013 3 2 1431 1455 -24 1601
## 10 2013 5 5 934 958 -24 1225
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##p51.E3##
# Rank flights by elapsed clock time between departure and arrival.
# dep_time and arr_time are clock readings such as 1358 (13:58), so each is
# converted to minutes since midnight before subtracting. Subtracting the raw
# readings overstates any interval that crosses an hour boundary by 40
# (1400 - 1330 = 70, not 30).
# %% 1440 maps the 2400 reading onto 0 and wraps arrivals after midnight.
# NOTE(review): presumably both times are local, so a time-zone change between
# origin and destination would distort the result; verify against the
# nycflights13 documentation.
# NOTE(review): the pasted output below was produced by raw subtraction with a
# 2400 wrap and must be regenerated.
flights %>%
  mutate(
    dep_minutes = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
    arr_minutes = (arr_time %/% 100 * 60 + arr_time %% 100) %% 1440,
    travel_time = (arr_minutes - dep_minutes) %% 1440
  ) %>%
  arrange(travel_time) %>%
  select(arr_time, dep_time, travel_time)
## # A tibble: 336,776 x 3
## arr_time dep_time travel_time
## <int> <int> <dbl>
## 1 1358 1323 35
## 2 1347 1312 35
## 3 1238 1203 35
## 4 758 722 36
## 5 758 722 36
## 6 754 718 36
## 7 1455 1418 37
## 8 53 16 37
## 9 754 717 37
## 10 1353 1315 38
## # ... with 336,766 more rows
# Shortest flights first, using the recorded air_time column.
# Sorting on arr_time - dep_time is avoided: both are clock readings, so the
# difference is not a duration and flights landing after midnight come out
# strongly negative and sort to the top.
# NOTE(review): the pasted output below was produced by sorting on
# arr_time - dep_time and must be regenerated.
arrange(flights, air_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 17 2400 2142 138 54
## 2 2013 12 9 2400 2250 70 59
## 3 2013 6 12 2338 2129 129 17
## 4 2013 12 29 2332 2155 97 14
## 5 2013 11 6 2335 2215 80 18
## 6 2013 2 25 2347 2145 122 30
## 7 2013 8 13 2351 2152 119 35
## 8 2013 10 11 2342 2030 192 27
## 9 2013 2 26 2356 2000 236 41
## 10 2013 1 24 2342 2159 103 28
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
##p51.E4##
# Longest distances first; keep the first five columns plus distance.
flights %>%
  arrange(desc(distance)) %>%
  select(1:5, distance)
## # A tibble: 336,776 x 6
## year month day dep_time sched_dep_time distance
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 857 900 4983
## 2 2013 1 2 909 900 4983
## 3 2013 1 3 914 900 4983
## 4 2013 1 4 900 900 4983
## 5 2013 1 5 858 900 4983
## 6 2013 1 6 1019 900 4983
## 7 2013 1 7 1042 900 4983
## 8 2013 1 8 901 900 4983
## 9 2013 1 9 641 900 4983
## 10 2013 1 10 859 900 4983
## # ... with 336,766 more rows
# Shortest distances first; keep the first five columns plus distance.
flights %>%
  arrange(distance) %>%
  select(1:5, distance)
## # A tibble: 336,776 x 6
## year month day dep_time sched_dep_time distance
## <int> <int> <int> <int> <int> <dbl>
## 1 2013 7 27 NA 106 17
## 2 2013 1 3 2127 2129 80
## 3 2013 1 4 1240 1200 80
## 4 2013 1 4 1829 1615 80
## 5 2013 1 4 2128 2129 80
## 6 2013 1 5 1155 1200 80
## 7 2013 1 6 2125 2129 80
## 8 2013 1 7 2124 2129 80
## 9 2013 1 8 2127 2130 80
## 10 2013 1 9 2126 2129 80
## # ... with 336,766 more rows
##p58.E4##
# Ten largest departure delays via a ranking function: rank 1 is the largest
# delay because the ranking is taken over desc(dep_delay).
flights %>%
  filter(min_rank(desc(dep_delay)) <= 10)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 1 10 1121 1635 1126 1239
## 3 2013 12 5 756 1700 896 1058
## 4 2013 3 17 2321 810 911 135
## 5 2013 4 10 1100 1900 960 1342
## 6 2013 6 15 1432 1935 1137 1607
## 7 2013 6 27 959 1900 899 1236
## 8 2013 7 22 845 1600 1005 1044
## 9 2013 7 22 2257 759 898 121
## 10 2013 9 20 1139 1845 1014 1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Same ten rows selected with top_n(), weighted by dep_delay.
top_n(flights, n = 10, wt = dep_delay)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 1 10 1121 1635 1126 1239
## 3 2013 12 5 756 1700 896 1058
## 4 2013 3 17 2321 810 911 135
## 5 2013 4 10 1100 1900 960 1342
## 6 2013 6 15 1432 1935 1137 1607
## 7 2013 6 27 959 1900 899 1236
## 8 2013 7 22 845 1600 1005 1044
## 9 2013 7 22 2257 759 898 121
## 10 2013 9 20 1139 1845 1014 1457
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
##p73.E4##
# Per calendar day: number of flights with a missing dep_delay (treated here
# as cancelled), total flights, and mean departure/arrival delay. The share
# of cancelled flights is then plotted against both mean delays
# (black = departure, yellow = arrival).
flights %>%
  mutate(dep_date = lubridate::make_datetime(year, month, day)) %>%
  group_by(dep_date) %>%
  summarise(
    cancelled = sum(is.na(dep_delay)),
    n = n(),
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ggplot(mapping = aes(x = cancelled/n)) +
  geom_point(mapping = aes(y = mean_dep_delay), alpha = 0.5, colour = "black") +
  geom_point(mapping = aes(y = mean_arr_delay), alpha = 0.5, colour = "yellow") +
  labs(y = "mean delay (minutes)")
##p73.E5##
# Mean arrival delay per carrier, computed over late arrivals only
# (arr_delay > 0), listed from worst to best.
flights %>%
  filter(arr_delay > 0) %>%
  group_by(carrier) %>%
  summarise(
    average_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  arrange(desc(average_arr_delay))
## # A tibble: 16 x 2
## carrier average_arr_delay
## <chr> <dbl>
## 1 OO 60.6
## 2 YV 51.1
## 3 9E 49.3
## 4 EV 48.3
## 5 F9 47.6
## 6 VX 43.8
## 7 FL 41.1
## 8 WN 40.7
## 9 B6 40.0
## 10 AA 38.3
## 11 MQ 37.9
## 12 DL 37.7
## 13 UA 36.7
## 14 HA 35.0
## 15 AS 34.4
## 16 US 29.0
# Number of distinct carriers, origins and destinations in the data.
summarise(
  flights,
  n_distinct(carrier),
  n_distinct(origin),
  n_distinct(dest)
)
## # A tibble: 1 x 3
## `n_distinct(carrier)` `n_distinct(origin)` `n_distinct(dest)`
## <int> <int> <int>
## 1 16 3 105
##p75.E2##
# Per tail number: share of flights that arrived no more than 30 minutes late
# (rows with a missing arr_delay count as not on time), mean arrival delay,
# and number of flights. Worst on-time share first, ties broken by the larger
# mean delay.
flights %>%
  group_by(tailnum) %>%
  summarise(
    prop_on_time = sum(!is.na(arr_delay) & arr_delay <= 30) / n(),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),
    flights = n()
  ) %>%
  arrange(prop_on_time, desc(mean_arr_delay))
## # A tibble: 4,044 x 4
## tailnum prop_on_time mean_arr_delay flights
## <chr> <dbl> <dbl> <int>
## 1 N844MH 0 320 1
## 2 N911DA 0 294 1
## 3 N922EV 0 276 1
## 4 N587NW 0 264 1
## 5 N851NW 0 219 1
## 6 N928DN 0 201 1
## 7 N7715E 0 188 1
## 8 N654UA 0 185 1
## 9 N427SW 0 157 1
## 10 N136DL 0 146 1
## # ... with 4,034 more rows
# Tail numbers for which every flight has a missing arr_delay, with the
# number of such flights, largest count first.
flights %>%
  group_by(tailnum) %>%
  filter(all(is.na(arr_delay))) %>%
  tally(sort = TRUE)
## # A tibble: 7 x 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N347SW 1
## 3 N728SK 1
## 4 N768SK 1
## 5 N862DA 1
## 6 N865DA 1
## 7 N939DN 1
##p75.E3##
# Flight counts per value of the hour column, with bars split by whether the
# flight arrived more than 5 minutes late or has no recorded arrival delay.
flights %>%
  ggplot(
    mapping = aes(
      x = factor(hour),
      fill = arr_delay > 5 | is.na(arr_delay)
    )
  ) +
  geom_bar()
##p75.E4##
# Build a timestamp from year/month/day/hour/minute, order the flights by it,
# and within each origin pair every flight's departure delay with the delay
# of the flight immediately before it. Rows without a previous delay or
# without their own delay are dropped by geom_point() with a warning.
flights %>%
  mutate(
    new_sched_dep_time = lubridate::make_datetime(
      year, month, day, hour, minute
    )
  ) %>%
  group_by(origin) %>%
  arrange(new_sched_dep_time) %>%
  mutate(prev_flight_dep_delay = lag(dep_delay)) %>%
  ggplot(mapping = aes(x = prev_flight_dep_delay, y = dep_delay)) +
  geom_point()
## Warning: Removed 14383 rows containing missing values (geom_point).