Justin Kaplan
Workshop 6
Load appropriate packages and data-sets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
pacman::p_load(nycflights13)
Question 2
# Option A: not the desired outcome - it only keeps the columns whose names
# start with "dep".
flights %>%
  select(starts_with("dep"))
## # A tibble: 336,776 × 2
## dep_time dep_delay
## <int> <dbl>
## 1 517 2
## 2 533 4
## 3 542 2
## 4 544 -1
## 5 554 -6
## 6 554 -4
## 7 555 -5
## 8 557 -3
## 9 557 -3
## 10 558 -2
## # ℹ 336,766 more rows
# Option B: not the desired outcome - it sorts every flight by departure delay
# (largest first) and moves carrier, flight and tailnum to the front, but still
# prints the whole table.
sortf <- flights %>%
  arrange(desc(dep_delay))
sortf %>%
  select(carrier, flight, tailnum, everything())
## # A tibble: 336,776 × 19
## carrier flight tailnum year month day dep_time sched_dep_time dep_delay
## <chr> <int> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 N384HA 2013 1 9 641 900 1301
## 2 MQ 3535 N504MQ 2013 6 15 1432 1935 1137
## 3 MQ 3695 N517MQ 2013 1 10 1121 1635 1126
## 4 AA 177 N338AA 2013 9 20 1139 1845 1014
## 5 MQ 3075 N665MQ 2013 7 22 845 1600 1005
## 6 DL 2391 N959DL 2013 4 10 1100 1900 960
## 7 DL 2119 N927DA 2013 3 17 2321 810 911
## 8 DL 2007 N3762Y 2013 6 27 959 1900 899
## 9 DL 2047 N6716C 2013 7 22 2257 759 898
## 10 AA 172 N5DMAA 2013 12 5 756 1700 896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Option C: correct - find the largest departure delay (ignoring NAs), locate
# the row(s) holding it, and show the carrier, flight and tail number there.
maxdep <- max(flights[["dep_delay"]], na.rm = TRUE)
maxdep_id <- which(flights[["dep_delay"]] == maxdep)
# Columns 10 to 12 of flights, selected by name instead of by position.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
# Option D: not the desired outcome - it collapses the table to a single
# number, the mean departure delay over all flights (NAs ignored).
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.6
Question 3
# Option A: raised an error (its code is not reproduced here).
# Option B: returns the data we are looking for - the mean departure delay for
# each calendar day, with missing delays ignored via na.rm.
summarise(
  group_by(flights, year, month, day),
  mean = mean(dep_delay, na.rm = TRUE)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
# Option C: does not return the data - it only stores the flights that have
# both a departure delay and an arrival delay recorded; nothing is printed.
not_cancelled <- filter(
  flights,
  !is.na(dep_delay),
  !is.na(arr_delay)
)
# Option D: returns the data we are looking for - the mean departure delay per
# day, computed on flights with no missing delays (so na.rm is not needed).
not_cancelled2 <- filter(flights, !is.na(dep_delay))
# NOTE(review): not_cancelled2 is assigned above but never read; the summary
# below uses not_cancelled (which also drops missing arr_delay). Confirm which
# of the two tables was intended.
summarise(
  group_by(not_cancelled, year, month, day),
  mean = mean(dep_delay)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.4
## 2 2013 1 2 13.7
## 3 2013 1 3 10.9
## 4 2013 1 4 8.97
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.56
## 9 2013 1 9 2.30
## 10 2013 1 10 2.84
## # ℹ 355 more rows
Question 4
# Part one: average arrival delay for each tail number (missing delays ignored).
Four <- flights %>%
  group_by(tailnum) %>%
  summarise(mean = mean(arr_delay, na.rm = TRUE))
# Part two: the tail number(s) with the lowest average arrival delay.
# Tail numbers without any recorded arrival delay get a NaN mean, so they are
# dropped first; slice_min() keeps ties, so more than one row may be returned.
lowest_avg_delay <- Four %>%
  filter(!is.na(mean)) %>%
  slice_min(mean, n = 1)
lowest_avg_delay
Question 5
# Earliest and latest departure time for each calendar day, using only the
# flights kept in not_cancelled.
summarise(
  group_by(not_cancelled, year, month, day),
  first = min(dep_time),
  last = max(dep_time)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # ℹ 355 more rows
# Distribution of dep_time over all flights (quartiles, mean and NA count).
summary(flights[["dep_time"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 907 1401 1349 1744 2400 8255
Question 6
# Step one: count the flights in each month (every row, including those with a
# missing dep_delay).
monthly_flights <- flights %>%
  group_by(month) %>%
  summarize(flight_count = n())
# Step two: count the flights whose departure was delayed by more than 60
# minutes (filter() drops the rows where dep_delay is missing).
delayed_flights <- flights %>%
  filter(dep_delay > 60) %>%
  group_by(month) %>%
  summarize(delayed_flight_count = n())
# Step three: total flights per delayed flight, month by month. The two tables
# are joined on month instead of being divided position by position, so a month
# with no delayed flights gives NA rather than misaligning or recycling values.
Ratio <- with(
  left_join(monthly_flights, delayed_flights, by = "month"),
  flight_count / delayed_flight_count
)
print(Ratio)
## [1] 14.829215 15.085248 12.322222 11.175542 12.471200 8.083286 7.702880
## [8] 12.778649 20.732331 21.494792 25.108656 11.020368
# Ratio is total flights / delayed flights, so a SMALLER value means a LARGER
# share of flights delayed by over an hour. The share of long delays is
# therefore highest in July (7.70) and June (8.08). September, October and
# November have the largest ratios, i.e. the smallest share of long delays.
Question 7
# Number of distinct carriers that fly to each destination.
Seven <- summarise(
  group_by(flights, dest),
  unique_carriers = n_distinct(carrier)
)
Question 9
# Per-destination summary: number of flights, mean distance and mean arrival
# delay (NAs ignored). Destinations with 20 or fewer flights, and "HNL", are
# then removed.
delays <- filter(
  summarise(
    group_by(flights, dest),
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ),
  count > 20,
  dest != "HNL"
)