Justin Kaplan

Workshop 6

Load appropriate packages and data-sets

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
pacman::p_load(nycflights13)

Question 2

# A did not return the desired outcome
select(flights, starts_with("dep"))
## # A tibble: 336,776 × 2
##    dep_time dep_delay
##       <int>     <dbl>
##  1      517         2
##  2      533         4
##  3      542         2
##  4      544        -1
##  5      554        -6
##  6      554        -4
##  7      555        -5
##  8      557        -3
##  9      557        -3
## 10      558        -2
## # ℹ 336,766 more rows
# B did not return the outcome
sortf <- arrange(flights,desc(dep_delay)) 
select(sortf, carrier, flight, tailnum, everything())
## # A tibble: 336,776 × 19
##    carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##    <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
##  1 HA          51 N384HA   2013     1     9      641            900      1301
##  2 MQ        3535 N504MQ   2013     6    15     1432           1935      1137
##  3 MQ        3695 N517MQ   2013     1    10     1121           1635      1126
##  4 AA         177 N338AA   2013     9    20     1139           1845      1014
##  5 MQ        3075 N665MQ   2013     7    22      845           1600      1005
##  6 DL        2391 N959DL   2013     4    10     1100           1900       960
##  7 DL        2119 N927DA   2013     3    17     2321            810       911
##  8 DL        2007 N3762Y   2013     6    27      959           1900       899
##  9 DL        2047 N6716C   2013     7    22     2257            759       898
## 10 AA         172 N5DMAA   2013    12     5      756           1700       896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# C did return the correct answer
maxdep <- max(flights$dep_delay, na.rm=TRUE)
maxdep_id <- which(flights$dep_delay==maxdep)
flights[maxdep_id, 10:12]
## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA
# D did not return the desired outcome
summarise(flights, delay=mean(dep_delay,na.rm=TRUE))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  12.6

Question 3

# A returned an error
# B returned the data that we're looking for
flights %>% 
 group_by(year, month, day) %>% 
 summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
# C did not return the data
not_cancelled <- flights %>% 
 filter(!is.na(dep_delay), !is.na(arr_delay))
# D returned the data we are looking for
not_cancelled2 <- flights %>% 
 filter(!is.na(dep_delay))
not_cancelled %>% 
 group_by(year, month, day) %>% 
 summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ℹ 355 more rows

Question 4

# Part One, obtaining the average arrival delay by tailnumber
Four <- flights %>% 
group_by(tailnum) %>% 
summarise(mean = mean(arr_delay, na.rm = TRUE))
# Part two, finding the lowest average arrival delay

Question 5

not_cancelled %>% 
 group_by(year, month, day) %>% 
 summarise(
  first = min(dep_time),
  last = max(dep_time)
)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # ℹ 355 more rows
summary(flights$dep_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##       1     907    1401    1349    1744    2400    8255

Question 6

# Step one, find out the number of flights that are going each month
monthly_flights <- flights %>%
  group_by(month) %>%
  summarize(flight_count = n())
# Step two, find the amount of flights that were delayed by over 60 minutes
delayed_flights <- flights %>%
  filter(dep_delay > 60) %>% 
  group_by(month) %>%         
  summarize(delayed_flight_count = n())  
# Step Three, create a ratio to see what months have the highest ratio of delays
Ratio <- monthly_flights$flight_count / delayed_flights$delayed_flight_count
print(Ratio)
##  [1] 14.829215 15.085248 12.322222 11.175542 12.471200  8.083286  7.702880
##  [8] 12.778649 20.732331 21.494792 25.108656 11.020368
# The ratio indicates that they are the highest during September, November and October

Question 7

Seven <- flights %>%
  group_by(dest) %>%
  summarize(unique_carriers = n_distinct(carrier)) 

Question 9

delays <- flights %>% 
 group_by(dest) %>% 
 summarise(
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
 ) %>% 
 filter(count > 20, dest != "HNL")