Sample codes from R4DS.
Load the required libraries
library(tidyverse)
library(nycflights13)
Peek into data
head(flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 533 529 4. 850
## 3 2013 1 1 542 540 2. 923
## 4 2013 1 1 544 545 -1. 1004
## 5 2013 1 1 554 600 -6. 812
## 6 2013 1 1 554 558 -4. 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
Lets filter just the not-cancelled flights i.e remove rows with missing dep_delay or arry_delay
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
Whats the mean arrival delay for flights not cancelled grouped by year, month & day ?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
avg_delay1=mean(arr_delay),
avg_delay2=mean(arr_delay[arr_delay > 0])
)
## # A tibble: 365 x 5
## # Groups: year, month [?]
## year month day avg_delay1 avg_delay2
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 12.7 32.5
## 2 2013 1 2 12.7 32.0
## 3 2013 1 3 5.73 27.7
## 4 2013 1 4 -1.93 28.3
## 5 2013 1 5 -1.53 22.6
## 6 2013 1 6 4.24 24.4
## 7 2013 1 7 -4.95 27.8
## 8 2013 1 8 -3.23 20.8
## 9 2013 1 9 -0.264 25.6
## 10 2013 1 10 -5.90 27.3
## # ... with 355 more rows
Whats the departure time for first and last flight? grouped by Y/M/D
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
## # A tibble: 365 x 5
## # Groups: year, month [?]
## year month day first last
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 517. 2356.
## 2 2013 1 2 42. 2354.
## 3 2013 1 3 32. 2349.
## 4 2013 1 4 25. 2358.
## 5 2013 1 5 14. 2357.
## 6 2013 1 6 16. 2355.
## 7 2013 1 7 49. 2359.
## 8 2013 1 8 454. 2351.
## 9 2013 1 9 2. 2252.
## 10 2013 1 10 3. 2320.
## # ... with 355 more rows
Or the same can be achieved using ‘first’ and ‘last’ instead of min and max
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
## # A tibble: 365 x 5
## # Groups: year, month [?]
## year month day first_dep last_dep
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # ... with 355 more rows
Or using the min_rank() and range() utility functions
not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
## # A tibble: 770 x 20
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 2356 2359 -3. 425
## 3 2013 1 2 42 2359 43. 518
## 4 2013 1 2 2354 2359 -5. 413
## 5 2013 1 3 32 2359 33. 504
## 6 2013 1 3 2349 2359 -10. 434
## 7 2013 1 4 25 2359 26. 505
## 8 2013 1 4 2358 2359 -1. 429
## 9 2013 1 4 2358 2359 -1. 436
## 10 2013 1 5 14 2359 15. 503
## # ... with 760 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, r <int>
head(with(not_cancelled, min_rank(desc(dep_time))))
## [1] 325665 325407 324801 324665 322407 322407
nc <- not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(desc(dep_time))) %>%
filter(r %in% range(r))
head(nc)
## # A tibble: 6 x 20
## # Groups: year, month, day [3]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 2356 2359 -3. 425
## 3 2013 1 2 42 2359 43. 518
## 4 2013 1 2 2354 2359 -5. 413
## 5 2013 1 3 32 2359 33. 504
## 6 2013 1 3 2349 2359 -10. 434
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, r <int>
As range() picks the min and max from min_rank() output, min_rank(desc(dep_time)) is not required, just min_rank(dep_time) will do
nc2 <- not_cancelled %>%
group_by(year, month, day) %>%
mutate(r = min_rank(dep_time)) %>%
filter(r %in% range(r))
head(nc2)
## # A tibble: 6 x 20
## # Groups: year, month, day [3]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2. 830
## 2 2013 1 1 2356 2359 -3. 425
## 3 2013 1 2 42 2359 43. 518
## 4 2013 1 2 2354 2359 -5. 413
## 5 2013 1 3 32 2359 33. 504
## 6 2013 1 3 2349 2359 -10. 434
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, r <int>
Bring the cols: year, month, day, dep_time, r to front and display the remaining afterwards
select(nc, year, month, day, dep_time, r, everything())
## # A tibble: 770 x 20
## # Groups: year, month, day [365]
## year month day dep_time r sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 831 515 2. 830
## 2 2013 1 1 2356 1 2359 -3. 425
## 3 2013 1 2 42 928 2359 43. 518
## 4 2013 1 2 2354 1 2359 -5. 413
## 5 2013 1 3 32 900 2359 33. 504
## 6 2013 1 3 2349 1 2359 -10. 434
## 7 2013 1 4 25 908 2359 26. 505
## 8 2013 1 4 2358 1 2359 -1. 429
## 9 2013 1 4 2358 1 2359 -1. 436
## 10 2013 1 5 14 717 2359 15. 503
## # ... with 760 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
select(nc2, year, month, day, dep_time, r, everything())
## # A tibble: 770 x 20
## # Groups: year, month, day [365]
## year month day dep_time r sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 1 515 2. 830
## 2 2013 1 1 2356 831 2359 -3. 425
## 3 2013 1 2 42 1 2359 43. 518
## 4 2013 1 2 2354 928 2359 -5. 413
## 5 2013 1 3 32 1 2359 33. 504
## 6 2013 1 3 2349 900 2359 -10. 434
## 7 2013 1 4 25 1 2359 26. 505
## 8 2013 1 4 2358 907 2359 -1. 429
## 9 2013 1 4 2358 907 2359 -1. 436
## 10 2013 1 5 14 1 2359 15. 503
## # ... with 760 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
How many distinct carriers are available for a given dest?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first_dep = first(dep_time),
last_dep = last(dep_time)
)
## # A tibble: 365 x 5
## # Groups: year, month [?]
## year month day first_dep last_dep
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # ... with 355 more rows
not_cancelled %>%
group_by(dest) %>%
summarise(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
## # A tibble: 104 x 2
## dest carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ... with 94 more rows
How many destinations in total?
not_cancelled %>%
count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
not_cancelled %>%
count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
Or this way
not_cancelled %>%
group_by(tailnum) %>%
summarise(dist= sum(distance))
## # A tibble: 4,037 x 2
## tailnum dist
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
How many flights left before 5am? (these usually indicate delayed flights from the previous day) grouped by Y/M/D
not_cancelled %>%
group_by(year, month, day) %>%
summarise(n_early = sum(dep_time < 500))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day n_early
## <int> <int> <int> <int>
## 1 2013 1 1 0
## 2 2013 1 2 3
## 3 2013 1 3 4
## 4 2013 1 4 3
## 5 2013 1 5 3
## 6 2013 1 6 2
## 7 2013 1 7 2
## 8 2013 1 8 1
## 9 2013 1 9 3
## 10 2013 1 10 3
## # ... with 355 more rows
Or if you want just the total count
not_cancelled %>%
summarise(n_early=sum(dep_time < 500))
## # A tibble: 1 x 1
## n_early
## <int>
## 1 1476
What proportion of flights are delayed by more than an hour?
not_cancelled %>%
group_by(year, month, day) %>%
summarise(hour_perc = mean(arr_delay > 60))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day hour_perc
## <int> <int> <int> <dbl>
## 1 2013 1 1 0.0722
## 2 2013 1 2 0.0851
## 3 2013 1 3 0.0567
## 4 2013 1 4 0.0396
## 5 2013 1 5 0.0349
## 6 2013 1 6 0.0470
## 7 2013 1 7 0.0333
## 8 2013 1 8 0.0213
## 9 2013 1 9 0.0202
## 10 2013 1 10 0.0183
## # ... with 355 more rows
by year, month
not_cancelled %>%
group_by(year, month) %>%
summarise(hr_perc= mean(arr_delay > 60))
## # A tibble: 12 x 3
## # Groups: year [?]
## year month hr_perc
## <int> <int> <dbl>
## 1 2013 1 0.0705
## 2 2013 2 0.0689
## 3 2013 3 0.0837
## 4 2013 4 0.102
## 5 2013 5 0.0795
## 6 2013 6 0.142
## 7 2013 7 0.145
## 8 2013 8 0.0835
## 9 2013 9 0.0476
## 10 2013 10 0.0451
## 11 2013 11 0.0415
## 12 2013 12 0.107
Total flight count daily
daily <- group_by(flights, year, month, day)
flight count for each day, month , year
(per_day <- summarise(daily, flights = n()))
## # A tibble: 365 x 4
## # Groups: year, month [?]
## year month day flights
## <int> <int> <int> <int>
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## # ... with 355 more rows
(per_month <- summarise(per_day, flights = sum(flights)))
## # A tibble: 12 x 3
## # Groups: year [?]
## year month flights
## <int> <int> <int>
## 1 2013 1 27004
## 2 2013 2 24951
## 3 2013 3 28834
## 4 2013 4 28330
## 5 2013 5 28796
## 6 2013 6 28243
## 7 2013 7 29425
## 8 2013 8 29327
## 9 2013 9 27574
## 10 2013 10 28889
## 11 2013 11 27268
## 12 2013 12 28135
(per_year <- summarise(per_month, flights = sum(flights)))
## # A tibble: 1 x 2
## year flights
## <int> <int>
## 1 2013 336776
Flight count for each destinatio
not_cancelled %>% count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
or
not_cancelled %>%
group_by(dest) %>%
summarise(N=n())
## # A tibble: 104 x 2
## dest N
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
Total distance covered for each tailnum using count() and weight ‘wt’ parameter
not_cancelled %>% count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
or
not_cancelled %>%
group_by(tailnum) %>%
summarise(sum(distance))
## # A tibble: 4,037 x 2
## tailnum `sum(distance)`
## <chr> <dbl>
## 1 D942DN 3418.
## 2 N0EGMQ 239143.
## 3 N10156 109664.
## 4 N102UW 25722.
## 5 N103US 24619.
## 6 N104UW 24616.
## 7 N10575 139903.
## 8 N105UW 23618.
## 9 N107US 21677.
## 10 N108UW 32070.
## # ... with 4,027 more rows
Count tailnum
not_cancelled %>%
count(tailnum)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <int>
## 1 D942DN 4
## 2 N0EGMQ 352
## 3 N10156 145
## 4 N102UW 48
## 5 N103US 46
## 6 N104UW 46
## 7 N10575 269
## 8 N105UW 45
## 9 N107US 41
## 10 N108UW 60
## # ... with 4,027 more rows
Total number of fights by carrier for each destination
flights %>%
group_by(carrier, dest) %>%
summarise(N=n())
## # A tibble: 314 x 3
## # Groups: carrier [?]
## carrier dest N
## <chr> <chr> <int>
## 1 9E ATL 59
## 2 9E AUS 2
## 3 9E AVL 10
## 4 9E BGR 1
## 5 9E BNA 474
## 6 9E BOS 914
## 7 9E BTV 2
## 8 9E BUF 833
## 9 9E BWI 856
## 10 9E CAE 3
## # ... with 304 more rows