In this workshop we will do some of the exercises from Chapter 5 of R4DS.
Use a separate code block for each exercise.
For example: 1. Find all flights that had an arrival delay of two or more hours.
# Example: histogram of arrival delays for flights that arrived two or more
# hours late. Delays are in minutes, so each bin is 5 minutes wide.
flights %>%
  filter(arr_delay >= 120) %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(binwidth = 5)
# Flights whose destination airport is IAH.
# NOTE(review): the R4DS exercise lists both IAH and HOU as Houston
# airports -- confirm whether HOU should be included here.
filter(flights, dest == 'IAH')
## # A tibble: 7,198 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # … with 7,188 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that arrived more than two hours late but did not leave late.
# The lateness test must use arr_delay (minutes late); arr_time is the clock
# time of arrival in HHMM form, so `arr_time > 120` only meant "arrived after
# 01:20" and kept almost every flight.
# (Re-knit to refresh the printed output for this statement.)
flights %>%
  filter(arr_delay > 120 & dep_delay <= 0)
## # A tibble: 198,323 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 555 600 -5 913 854
## 5 2013 1 1 557 600 -3 709 723
## 6 2013 1 1 557 600 -3 838 846
## 7 2013 1 1 558 600 -2 753 745
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 924 917
## # … with 198,313 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that departed between midnight and 6am (inclusive).
# dep_time is a clock time in HHMM form. The dataset records midnight as 2400
# rather than 0, so it has to be matched explicitly; the lower bound
# `dep_time >= 0` was always true and is dropped.
# (Re-knit to refresh the printed output for this statement.)
flights %>%
  filter(dep_time <= 600 | dep_time == 2400)
## # A tibble: 9,344 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 9,334 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
- NA^0 = 1 # regardless of the missing value, any real number raised to 0 is 1
- NA | TRUE = TRUE # regardless of the missing value, anything OR TRUE is always TRUE
- NA * 0 = NA # counterexample: one might expect anything * 0 to be 0, but the missing value could be infinite, and Inf * 0 is NaN rather than 0, so the result is not known and stays NA
# Sort flights by departure delay, largest delay first.
flights %>%
  arrange(desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# the top 5 rows of:
# The most delayed arrivals can be found the same way, by sorting on
# arrival delay, largest delay first:
flights %>%
  arrange(desc(arr_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# the top five rows of:
# Keep only the columns whose names end in "time".
select(flights, ends_with("time"))
## # A tibble: 336,776 x 5
## dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <dbl>
## 1 517 515 830 819 227
## 2 533 529 850 830 227
## 3 542 540 923 850 160
## 4 544 545 1004 1022 183
## 5 554 600 812 837 116
## 6 554 558 740 728 150
## 7 555 600 913 854 158
## 8 557 600 709 723 53
## 9 557 600 838 846 140
## 10 558 600 753 745 138
## # … with 336,766 more rows
# Convert clock times written as HHMM numbers (e.g. 517 for 05:17) into
# minutes after midnight.
#
# tim_h_min: numeric vector of HHMM clock times; NA values stay NA.
# Returns a numeric vector of the same length, with values in 0..1439.
min_af_mid <- function(tim_h_min) {
  hours <- tim_h_min %/% 100
  minutes <- tim_h_min %% 100
  # Midnight can be written as 2400, which would otherwise become 1440;
  # wrapping at one day (1440 minutes) maps it to 0.
  (hours * 60 + minutes) %% 1440
}
# Overwrite the three clock-time columns with min_af_mid() of their values.
# The columns are replaced in place, so running this a second time would
# convert the already-converted values again.
flights <- flights %>%
  mutate(
    dep_time = min_af_mid(dep_time),
    sched_dep_time = min_af_mid(sched_dep_time),
    arr_time = min_af_mid(arr_time)
  )
# It can be seen that this code works by displaying flights before and after running those lines; they convert the variables that are used in exercise 9, next.
9. Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?
# Add two columns and move them, together with air_time, to the front:
#   airtime = arr_time - dep_time
#   differ  = air_time - airtime
flights_new <- flights %>%
  mutate(
    airtime = arr_time - dep_time,
    differ = air_time - airtime
  ) %>%
  select(air_time, airtime, differ, everything())
flights_new
## # A tibble: 336,776 x 21
## air_time airtime differ year month day dep_time sched_dep_time dep_delay
## <dbl> <dbl> <dbl> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 227 193 34 2013 1 1 317 315 2
## 2 227 197 30 2013 1 1 333 329 4
## 3 160 221 -61 2013 1 1 342 340 2
## 4 183 260 -77 2013 1 1 344 345 -1
## 5 116 138 -22 2013 1 1 354 360 -6
## 6 150 106 44 2013 1 1 354 358 -4
## 7 158 198 -40 2013 1 1 355 360 -5
## 8 53 72 -19 2013 1 1 357 360 -3
## 9 140 161 -21 2013 1 1 357 360 -3
## 10 138 115 23 2013 1 1 358 360 -2
## # … with 336,766 more rows, and 12 more variables: arr_time <dbl>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# I was expecting the difference to be a multiple of 60 min because of
# time-zone differences, but that does not happen; the difference appears arbitrary.
%>%
How does the design of the tidyverse
facilitate the use of pipes? Pipes send the result of one command to the next command as its first argument, e.g., flights %>% group_by(tailnum).
This avoids having to retype flights inside the group_by() function.
# Flight numbers whose arrivals exactly 15 minutes early and exactly
# 15 minutes late cancel out: only rows with an arrival delay of -15 or 15
# are kept, and a flight number is retained when those delays sum to zero.
earlyandlate15 <- flights %>%
  filter(abs(arr_delay) == 15) %>%
  group_by(flight) %>%
  summarize(n(), sumdelay = sum(arr_delay)) %>%
  filter(sumdelay == 0)
earlyandlate15
## # A tibble: 205 x 3
## flight `n()` sumdelay
## <int> <int> <dbl>
## 1 9 4 0
## 2 10 2 0
## 3 12 2 0
## 4 17 10 0
## 5 45 6 0
## 6 87 8 0
## 7 133 10 0
## 8 141 8 0
## 9 148 2 0
## 10 153 8 0
## # … with 195 more rows
#this code shows that there are 205 flights (identified by flight number) that have equal number of 15-min delays as 15-min early arrivals.
# Flight numbers that are exactly on time 99% of the time and exactly
# 120 minutes late 1% of the time, considering only arrivals with a delay
# of 0 or 120 minutes.
#   nflts = number of such arrivals (on-time and late together)
#   n120  = number of those that were 120 minutes late
# Because nflts includes the late arrivals, a 99% / 1% split means the late
# ones are 1 in 100 of the total, i.e. nflts == 100 * n120 (not 99 * n120).
ontime99 <- flights %>%
  filter(arr_delay == 0 | arr_delay == 120) %>%
  group_by(flight) %>%
  summarize(nflts = n(), n120 = sum(arr_delay) / 120) %>%
  filter(nflts == 100 * n120)
ontime99
## # A tibble: 0 x 3
## # … with 3 variables: flight <int>, nflts <int>, n120 <dbl>
#this code shows that no flight is on time 99% of the time and 120 min late 1% of the time