# Load the nycflights13 package through pacman; the `flights` table used
# throughout the rest of this script is expected to come from it.
pacman::p_load(nycflights13)
# Print per-column summary statistics (quartiles, mean, NA counts) for `flights`.
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
# Identify the flight(s) with the largest departure delay.
# dep_delay contains missing values, so they must be removed before taking
# the maximum (otherwise max() would return NA).
maxdep <- max(flights$dep_delay, na.rm = TRUE)
# which() discards the NA comparisons and keeps every row tied for the maximum.
maxdep_id <- which(flights$dep_delay == maxdep)
# Select the identifying columns by name instead of by position (10:12), so the
# result cannot silently change if the column order of `flights` differs.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the flights that have a recorded arrival delay; rows whose
# arr_delay is NA are dropped.
not_cancelled <- filter(flights, !is.na(arr_delay))
# Tail number(s) with the lowest mean arrival delay among the flights in
# `not_cancelled`.
# slice_min() keeps every tail number tied for the minimum, whereas
# arrange() followed by slice(1) would silently report only the first one.
lowest_tailnum <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay)) %>%
  slice_min(avg_arr_delay, n = 1)
# Print the result.
lowest_tailnum
## # A tibble: 1 × 2
## tailnum avg_arr_delay
## <chr> <dbl>
## 1 N560AS -53
# Smallest and largest dep_time recorded on each calendar day, showing the ten
# days with the largest `last` value.
# NOTE(review): dep_time looks like an HHMM clock value (range 1-2400 in the
# summary), so 2400 would be midnight and very small `first` values may be
# delayed flights that actually left after midnight - confirm before
# interpreting these as the true first/last departures of a day.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time, na.rm = TRUE),
    last = max(dep_time, na.rm = TRUE),
    # Drop the leftover (year, month) grouping: it is not needed downstream and
    # dropping it also silences summarise()'s "grouped output" message.
    .groups = "drop"
  ) %>%
  arrange(desc(last)) %>%
  head(10)
## # A tibble: 10 × 5
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 2 7 27 2400
## 2 2013 2 11 1 2400
## 3 2013 3 15 11 2400
## 4 2013 3 22 37 2400
## 5 2013 3 25 13 2400
## 6 2013 4 2 9 2400
## 7 2013 4 4 14 2400
## 8 2013 4 20 7 2400
## 9 2013 5 21 110 2400
## 10 2013 6 17 2 2400
library(dplyr)
# For each month, the fraction of flights in `not_cancelled` whose dep_delay
# exceeds 60, listed from the highest fraction to the lowest.
# mean() of a logical vector gives the share of TRUE values; NAs are ignored.
not_cancelled %>%
  group_by(month) %>%
  summarise(proportion = mean(dep_delay > 60, na.rm = TRUE)) %>%
  arrange(desc(proportion))
## # A tibble: 12 × 2
## month proportion
## <int> <dbl>
## 1 7 0.133
## 2 6 0.128
## 3 12 0.0936
## 4 4 0.0909
## 5 3 0.0832
## 6 5 0.0812
## 7 8 0.0794
## 8 2 0.0694
## 9 1 0.0685
## 10 9 0.0486
## 11 10 0.0469
## 12 11 0.0401
# Number of different carriers flying to each destination, destinations with
# the most carriers first.
# Reduce to unique (dest, carrier) pairs, then count the pairs per destination.
flights %>%
  distinct(dest, carrier) %>%
  count(dest, name = "num_carriers") %>%
  arrange(desc(num_carriers))
## # A tibble: 105 × 2
## dest num_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows