In this workshop we will do some of the exercises from Chapter 5 of R4DS.
Use a separate code block for each exercise.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
# Print a preview of the flights tibble from the attached nycflights13 package.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose arrival delay was 120 or more (delays are in minutes).
flights %>%
  filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights with carrier "UA" whose destination is "IAH".
flights %>%
  filter(carrier == "UA" & dest == "IAH")
## # A tibble: 6,924 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # … with 6,914 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Arrived more than 120 minutes late although the departure delay was below
# one minute.
flights %>%
  filter(arr_delay > 120 & dep_delay < 1)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Departures with a clock reading of 600 or less, plus the reading 2400
# (dep_time is an HHMM-style integer).
flights %>%
  filter(dep_time == 2400 | dep_time <= 600)
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Any value raised to the power zero is 1, so the missing value does not
# affect the result.
NA^0
## [1] 1
# `x | TRUE` is TRUE whatever x is, so the missing value is irrelevant.
NA | TRUE
## [1] TRUE
# `FALSE & x` is FALSE whatever x is, so the missing value is irrelevant.
FALSE & NA
## [1] FALSE
You have to be careful with NA and expressions.
How does this differ from normal practice in mathematics?
##install.packages("dplyr")
library(dplyr)
# Sort so that the largest departure delays come first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep the columns whose names end in "delay", followed by those ending in
# "time".
flights %>%
  select(ends_with("delay"), ends_with("time"))
## # A tibble: 336,776 x 7
## dep_delay arr_delay dep_time sched_dep_time arr_time sched_arr_time air_time
## <dbl> <dbl> <int> <int> <int> <int> <dbl>
## 1 2 11 517 515 830 819 227
## 2 4 20 533 529 850 830 227
## 3 2 33 542 540 923 850 160
## 4 -1 -18 544 545 1004 1022 183
## 5 -6 -25 554 600 812 837 116
## 6 -4 12 554 558 740 728 150
## 7 -5 19 555 600 913 854 158
## 8 -3 -14 557 600 709 723 53
## 9 -3 -8 557 600 838 846 140
## 10 -2 8 558 600 753 745 138
## # … with 336,766 more rows
# Convert the HHMM clock readings to minutes since midnight.
# `%/% 100` extracts the hours and `%% 100` the minutes. The trailing
# `%% 1440` maps a reading of 2400 to 0 instead of 1440, so midnight is not
# pushed to the end of the day.
flights %>%
  mutate(
    dep_time = ((dep_time %/% 100) * 60 + dep_time %% 100) %% 1440,
    sched_dep_time =
      ((sched_dep_time %/% 100) * 60 + sched_dep_time %% 100) %% 1440
  )
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <dbl> <dbl> <dbl> <int> <int>
## 1 2013 1 1 317 315 2 830 819
## 2 2013 1 1 333 329 4 850 830
## 3 2013 1 1 342 340 2 923 850
## 4 2013 1 1 344 345 -1 1004 1022
## 5 2013 1 1 354 360 -6 812 837
## 6 2013 1 1 354 358 -4 740 728
## 7 2013 1 1 355 360 -5 913 854
## 8 2013 1 1 357 360 -3 709 723
## 9 2013 1 1 357 360 -3 838 846
## 10 2013 1 1 358 360 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
9. Compare `air_time` with `arr_time - dep_time`. What do you expect to see? What do you see? What do you need to do to fix it?
# Naive comparison: subtract the raw clock columns and keep the result next
# to air_time for inspection.
fl_time <- flights %>%
  select(air_time, arr_time, dep_time) %>%
  mutate(dif_time = arr_time - dep_time)

fl_time
## # A tibble: 336,776 x 4
## air_time arr_time dep_time dif_time
## <dbl> <int> <int> <int>
## 1 227 830 517 313
## 2 227 850 533 317
## 3 160 923 542 381
## 4 183 1004 544 460
## 5 116 812 554 258
## 6 150 740 554 186
## 7 158 913 555 358
## 8 53 709 557 152
## 9 140 838 557 281
## 10 138 753 558 195
## # … with 336,766 more rows
# Convert both clock readings to minutes since midnight first, then subtract.
# Converting the raw HHMM difference is wrong whenever the subtraction borrows
# across an hour (e.g. 923 - 542 = 381, which is not "3 h 81 min").
# The trailing `%% 1440` maps 2400 to 0 and wraps arrivals that fall after
# midnight, so dif_time never goes negative.
# NOTE(review): dif_time still differs from air_time in the rows printed
# below; presumably the clock times are local and include time on the
# ground - confirm.
fl_time %>%
  mutate(
    dep_time = ((dep_time %/% 100) * 60 + dep_time %% 100) %% 1440,
    arr_time = ((arr_time %/% 100) * 60 + arr_time %% 100) %% 1440,
    dif_time = (arr_time - dep_time) %% 1440
  )
## # A tibble: 336,776 x 4
## air_time arr_time dep_time dif_time
## <dbl> <dbl> <dbl> <dbl>
## 1 227 510 317 193
## 2 227 530 333 197
## 3 160 563 342 221
## 4 183 604 344 260
## 5 116 492 354 138
## 6 150 460 354 106
## 7 158 553 355 198
## 8 53 429 357 72
## 9 140 518 357 161
## 10 138 473 358 115
## # … with 336,766 more rows
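How does the design of the tidyverse facilitate the use of pipes (`%>%`)? Tidyverse functions take the data as their first argument and return the transformed data, which allows you to chain multiple functions on one object.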
Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:
Define who you need to assess the data for. What types of trends would be important to them?
Play around with the data until you understand what’s going on. Look at the mean, median, mode, sd, and variance. Are there any outliers in the data?
Play around with graphs to see whether any of them fit the data better.
# Per flight number: the row count, and the share of non-missing arrival
# delays that were exactly 15 minutes early / exactly 15 minutes late.
flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
    fifteen_late = mean(arr_delay == 15, na.rm = TRUE)
  )
## # A tibble: 3,844 x 4
## flight n fifteen_early fifteen_late
## <int> <int> <dbl> <dbl>
## 1 1 701 0.0215 0.0100
## 2 2 51 0.0392 0.0196
## 3 3 631 0.00955 0.00637
## 4 4 393 0.0358 0.0102
## 5 5 324 0.0123 0.00617
## 6 6 210 0.0291 0.00485
## 7 7 237 0.0169 0.00424
## 8 8 236 0.0556 0.00855
## 9 9 153 0.0132 0.0132
## 10 10 61 0.0164 0.0164
## # … with 3,834 more rows
# Per flight number: the row count, and the share of non-missing arrival
# delays that were exactly 15 minutes early / exactly 15 minutes late.
fifteen_fl <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
    fifteen_late = mean(arr_delay == 15, na.rm = TRUE)
  )

# Flight numbers that are 15 minutes early half the time and 15 minutes late
# the other half.
# `filter()` needs the element-wise `&`: the scalar `&&` only looked at the
# first row on older R versions and is an error on R >= 4.3.
# `near()` compares the proportions with a tolerance instead of exact `==`.
# NOTE(review): re-knit after this change; the printed result that follows
# was produced by the scalar `&&` version.
fifteen_fl %>%
  filter(near(fifteen_early, 0.5) & near(fifteen_late, 0.5))
## # A tibble: 0 x 4
## # … with 4 variables: flight <int>, n <int>, fifteen_early <dbl>,
## # fifteen_late <dbl>
There are no flights that are 15 min late and early 50% of the time each.
# Per flight number: the row count, and the share of non-missing arrival
# delays that were exactly 30 minutes early / exactly 30 minutes late.
thirty_fl <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    thirty_early = mean(arr_delay == -30, na.rm = TRUE),
    thirty_late = mean(arr_delay == 30, na.rm = TRUE)
  )

# Flight numbers that are 30 minutes early half the time and 30 minutes late
# the other half.
# `filter()` needs the element-wise `&`: the scalar `&&` only looked at the
# first row on older R versions and is an error on R >= 4.3.
# `near()` compares the proportions with a tolerance instead of exact `==`.
# NOTE(review): re-knit after this change; the printed result that follows
# was produced by the scalar `&&` version.
thirty_fl %>%
  filter(near(thirty_early, 0.5) & near(thirty_late, 0.5))
## # A tibble: 0 x 4
## # … with 4 variables: flight <int>, n <int>, thirty_early <dbl>,
## # thirty_late <dbl>
There are no flights that are 30 min late and early 50% of the time each.
# Per flight number: the row count and the share of non-missing arrival
# delays that were exactly 10 minutes late.
ten_fl <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    ten_late = mean(arr_delay == 10, na.rm = TRUE)
  )

# Keep the flight numbers where that share is 1, i.e. every recorded arrival
# was exactly 10 minutes late.
ten_fl %>%
  filter(ten_late == 1)
## # A tibble: 5 x 3
## flight n ten_late
## <int> <int> <dbl>
## 1 2254 1 1
## 2 3656 1 1
## 3 3785 2 1
## 4 3880 1 1
## 5 5854 1 1
Five flights were always 10 minutes late, but each of them flew only once or twice.
# Per flight number: the row count, and the share of non-missing arrival
# delays that were exactly 0 minutes / exactly 120 minutes.
mix_fl <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    on_time = mean(arr_delay == 0, na.rm = TRUE),
    two_hours_late = mean(arr_delay == 120, na.rm = TRUE)
  )

# Flight numbers that are on time 99% of the time and two hours late 1% of
# the time.
# `filter()` needs the element-wise `&`: the scalar `&&` only looked at the
# first row on older R versions and is an error on R >= 4.3.
# 0.99 and 0.01 have no exact binary representation, so the proportions are
# compared with `near()` rather than `==`.
# NOTE(review): re-knit after this change; the printed result that follows
# was produced by the scalar `&&` version.
mix_fl %>%
  filter(near(on_time, 0.99) & near(two_hours_late, 0.01))
## # A tibble: 0 x 4
## # … with 4 variables: flight <int>, n <int>, on_time <dbl>,
## # two_hours_late <dbl>
No flights are on time 99% of the time and 1% of the time it’s 2 hours late.
Depends on the role of the stakeholder. Customers may believe the departure delay to be more important if it affects their travel plans, while airline companies may believe arrival delays to be more important if it has a larger impact on cost.