In this workshop we will do some of the exercises from Chapter 5 of R4DS.
Use a separate code block for each exercise.
for example: 1. Find all flights that had an arrival delay of two or more hours.
# Worked example: keep the flights that arrived at least 120 minutes late.
filter(flights, arr_delay >= 120) # note delays are in minutes
## # A tibble: 10,200 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only the flights whose destination code is "IAH".
# NOTE(review): the corresponding R4DS exercise asks for flights to Houston
# ("IAH or HOU") -- confirm whether HOU should be included here as well.
flights1 <- filter(flights, dest == "IAH")
flights1
## # A tibble: 7,198 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # … with 7,188 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that arrived more than two hours late but did not leave late.
# Delays are recorded in minutes, so "two hours" is 120, and "did not leave
# late" covers early departures too (dep_delay <= 0), not only departures
# that were exactly on time. filter() drops rows where either delay is NA.
# NOTE: re-knit the document so the printed tibble reflects this condition.
flights2 <- flights %>%
  filter(arr_delay > 120, dep_delay <= 0)
flights2
## # A tibble: 4,368 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 600 600 0 837 825
## 2 2013 1 1 635 635 0 1028 940
## 3 2013 1 1 739 739 0 1104 1038
## 4 2013 1 1 745 745 0 1135 1125
## 5 2013 1 1 800 800 0 1022 1014
## 6 2013 1 1 805 805 0 1015 1005
## 7 2013 1 1 810 810 0 1048 1037
## 8 2013 1 1 823 823 0 1151 1135
## 9 2013 1 1 830 830 0 1018 1015
## 10 2013 1 1 835 835 0 1210 1150
## # … with 4,358 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed between midnight and 6am (inclusive).
# dep_time is an HHMM clock value. Combining `>= 0` and `<= 600` with `|` is
# true for every non-missing time, so it filters nothing. Taking the value
# modulo 2400 maps a midnight stored as 2400 onto 0, so one comparison
# covers both ways midnight may be recorded.
# NOTE: re-knit the document so the printed tibble reflects this condition.
flights3 <- flights %>%
  filter(dep_time %% 2400 <= 600)
flights3
## # A tibble: 328,521 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 328,511 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
a) x ^ 0 = 1 for every number x, so NA ^ 0 = 1 no matter what the missing value is; therefore it is not missing.
b) anything OR TRUE is always TRUE, therefore NA | TRUE is TRUE (never FALSE or missing).
c) anything AND FALSE is always FALSE, therefore FALSE & NA is FALSE (not missing).
d) x * 0 is at first glance always equal to 0. However, Inf * 0 and -Inf * 0 are undefined (NaN), so the result depends on the unknown value and NA * 0 stays NA.
# Sort by ascending departure delay so the flights that left furthest ahead
# of schedule (most negative dep_delay) come first, then show the first five.
flights4 <- arrange(flights, dep_delay)
flights4 %>%
  head(5)
## # A tibble: 5 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 7 2040 2123 -43 40 2352
## 2 2013 2 3 2022 2055 -33 2240 2338
## 3 2013 11 10 1408 1440 -32 1549 1559
## 4 2013 1 11 1900 1930 -30 2233 2243
## 5 2013 1 29 1703 1730 -27 1947 1957
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep every column whose name contains the text "time".
flights5 <- select(flights, contains("time"))
flights5
## # A tibble: 336,776 × 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # … with 336,766 more rows
# Convert dep_time and sched_dep_time from HHMM clock values to minutes
# since midnight: hours (value %/% 100) * 60 plus minutes (value %% 100).
# The data frame is supplied once, through the pipe; passing `flights` again
# as an argument to mutate() is redundant.
# The final `%% 1440` maps a midnight recorded as 2400 (1440 minutes) to 0.
flights7 <- flights %>%
  mutate(
    dep_time = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
    sched_dep_time =
      (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440
  )
flights7
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <dbl> <dbl> <dbl> <int> <int>
## 1 2013 1 1 317 315 2 830 819
## 2 2013 1 1 333 329 4 850 830
## 3 2013 1 1 342 340 2 923 850
## 4 2013 1 1 344 345 -1 1004 1022
## 5 2013 1 1 354 360 -6 812 837
## 6 2013 1 1 354 358 -4 740 728
## 7 2013 1 1 355 360 -5 913 854
## 8 2013 1 1 357 360 -3 709 723
## 9 2013 1 1 357 360 -3 838 846
## 10 2013 1 1 358 360 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Re-express arr_time and dep_time as minutes since midnight (2400 wraps to
# 0 via %% 1440), then compare the reported air_time with the elapsed clock
# time: air_time_diff = air_time - (arr_time - dep_time).
flights_airtime <- flights %>%
  mutate(
    arr_time = (60 * (arr_time %/% 100) + arr_time %% 100) %% 1440,
    dep_time = (60 * (dep_time %/% 100) + dep_time %% 100) %% 1440,
    air_time_diff = air_time - (arr_time - dep_time)
  )
flights_airtime
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <dbl> <int> <dbl> <dbl> <int>
## 1 2013 1 1 317 515 2 510 819
## 2 2013 1 1 333 529 4 530 830
## 3 2013 1 1 342 540 2 563 850
## 4 2013 1 1 344 545 -1 604 1022
## 5 2013 1 1 354 600 -6 492 837
## 6 2013 1 1 354 558 -4 460 728
## 7 2013 1 1 355 600 -5 553 854
## 8 2013 1 1 357 600 -3 429 723
## 9 2013 1 1 357 600 -3 518 846
## 10 2013 1 1 358 600 -2 473 745
## # … with 336,766 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # air_time_diff <dbl>
# Count the flights whose air_time differs from arr_time - dep_time.
flights_airtime %>%
  filter(air_time_diff != 0) %>%
  nrow()
## [1] 327150
# air_time should be equal to arr_time - dep_time. But that is not the case, as there are quite a lot of flights with an air_time_diff that is not equal to 0. One reason for this may be that flights cross time zones; if so, the differences should be multiples of 60 minutes, one hour per time zone crossed.
%>%
How does the design of the tidyverse
facilitate the use of pipes? Essentially, the pipe operator, %>%, takes the output of the previous function and passes it to the next one as its first argument, which allows us to chain a series of functions together. Because the tidyverse functions all share the same design (they take a data frame as their first argument and return a data frame), they connect naturally and efficiently through pipes.
# Flight numbers that arrive at least 15 minutes early half of the time and
# at least 15 minutes late the other half. Both shares are taken over the
# full group size n(), so rows with a missing arr_delay still count in the
# denominator.
flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    early_15 = sum(arr_delay <= -15, na.rm = TRUE) / n(),
    late_15 = sum(arr_delay >= 15, na.rm = TRUE) / n()
  ) %>%
  filter(early_15 == 0.5 & late_15 == 0.5)
## # A tibble: 18 × 4
## flight n early_15 late_15
## <int> <int> <dbl> <dbl>
## 1 107 2 0.5 0.5
## 2 2072 2 0.5 0.5
## 3 2366 2 0.5 0.5
## 4 2500 2 0.5 0.5
## 5 2552 2 0.5 0.5
## 6 3495 2 0.5 0.5
## 7 3518 2 0.5 0.5
## 8 3544 2 0.5 0.5
## 9 3651 2 0.5 0.5
## 10 3705 2 0.5 0.5
## 11 3916 2 0.5 0.5
## 12 3951 2 0.5 0.5
## 13 4273 2 0.5 0.5
## 14 4313 2 0.5 0.5
## 15 5297 2 0.5 0.5
## 16 5322 2 0.5 0.5
## 17 5388 2 0.5 0.5
## 18 5505 4 0.5 0.5
# Flight numbers that are always exactly 10 minutes late.
# `late` is the share of the group's rows with arr_delay == 10; it equals 1
# only when every flight in the group arrived exactly 10 minutes late (a
# missing arr_delay counts against the group because n() includes it).
# Counting distinct delays instead would accept groups whose delays all
# differ but merely average 10 (e.g. 5 and 15).
# `always_10` (the mean delay) is kept so the summary columns are unchanged.
flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    late = sum(arr_delay == 10, na.rm = TRUE) / n(),
    always_10 = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(late == 1 & always_10 == 10)
## # A tibble: 4 × 4
## flight n late always_10
## <int> <int> <dbl> <dbl>
## 1 2254 1 1 10
## 2 3656 1 1 10
## 3 3880 1 1 10
## 4 5854 1 1 10
# Flight numbers that arrive at least 30 minutes early half of the time and
# at least 30 minutes late the other half. Shares are taken over the full
# group size n(), including rows with a missing arr_delay.
flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    early_30 = sum(arr_delay <= -30, na.rm = TRUE) / n(),
    late_30 = sum(arr_delay >= 30, na.rm = TRUE) / n()
  ) %>%
  filter(
    early_30 == 0.5,
    late_30 == 0.5
  )
## # A tibble: 3 × 4
## flight n early_30 late_30
## <int> <int> <dbl> <dbl>
## 1 3651 2 0.5 0.5
## 2 3916 2 0.5 0.5
## 3 3951 2 0.5 0.5
# Flight numbers that are on time (arr_delay <= 0) 99% of the time and at
# least two hours (120 minutes) late 1% of the time.
# 0.99 and 0.01 have no exact binary representation, so the computed shares
# are compared with near() (tolerance-based) rather than `==`.
# Column names are kept as-is so the summary output is unchanged.
flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    early.prop = sum(arr_delay <= 0, na.rm = TRUE) / n(),
    late.prop = sum(arr_delay >= 120, na.rm = TRUE) / n()
  ) %>%
  filter(near(early.prop, 0.99) & near(late.prop, 0.01))
## # A tibble: 0 × 4
## # … with 4 variables: flight <int>, n <int>, early.prop <dbl>, late.prop <dbl>
Arrival delay should be more important, as it affects scheduling and possible connecting flights. A departure delay only affects the wait time in the airport, and the flight still has a chance to arrive on time or even early.