Midterm.

1. Summarize the planes

# put your answer here. 
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(dplyr)
library(tidyr)
library(ggplot2)
# Show the aircraft metadata table (one row per tail number).
print(planes)
## # A tibble: 3,322 x 9
##    tailnum  year type          manufacturer   model  engines seats speed engine 
##    <chr>   <int> <chr>         <chr>          <chr>    <int> <int> <int> <chr>  
##  1 N10156   2004 Fixed wing m~ EMBRAER        EMB-1~       2    55    NA Turbo-~
##  2 N102UW   1998 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  3 N103US   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  4 N104UW   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  5 N10575   2002 Fixed wing m~ EMBRAER        EMB-1~       2    55    NA Turbo-~
##  6 N105UW   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  7 N107US   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  8 N108UW   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
##  9 N109UW   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 10 N110UW   1999 Fixed wing m~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## # ... with 3,312 more rows
# Mean arrival delay for each tail number; flights whose arr_delay is missing
# are left out of the average.
flight1 <- summarise(
  group_by(flights, tailnum),
  mean_delay = mean(arr_delay, na.rm = TRUE)
)

# Mean flight distance for each tail number.
flight2 <- summarise(
  group_by(flights, tailnum),
  mean_distance = mean(distance, na.rm = TRUE)
)
print(flight2)
## # A tibble: 4,044 x 2
##    tailnum mean_distance
##    <chr>           <dbl>
##  1 D942DN           854.
##  2 N0EGMQ           676.
##  3 N10156           758.
##  4 N102UW           536.
##  5 N103US           535.
##  6 N104UW           535.
##  7 N10575           520.
##  8 N105UW           525.
##  9 N107US           529.
## 10 N108UW           534.
## # ... with 4,034 more rows
# Number of flight records for each tail number (column `n`).
flight3 <- count(flights, tailnum)
print(flight3)
## # A tibble: 4,044 x 2
##    tailnum     n
##    <chr>   <int>
##  1 D942DN      4
##  2 N0EGMQ    371
##  3 N10156    153
##  4 N102UW     48
##  5 N103US     46
##  6 N104UW     47
##  7 N10575    289
##  8 N105UW     45
##  9 N107US     41
## 10 N108UW     60
## # ... with 4,034 more rows
# Add each plane's mean arrival delay. Every row of `planes` is kept; a plane
# with no matching tail number in flight1 gets NA for mean_delay.
p1 <- left_join(planes, flight1, by = "tailnum")
print(p1)
## # A tibble: 3,322 x 10
##    tailnum  year type   manufacturer model engines seats speed engine mean_delay
##    <chr>   <int> <chr>  <chr>        <chr>   <int> <int> <int> <chr>       <dbl>
##  1 N10156   2004 Fixed~ EMBRAER      EMB-~       2    55    NA Turbo~     12.7  
##  2 N102UW   1998 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      2.94 
##  3 N103US   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -6.93 
##  4 N104UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      1.80 
##  5 N10575   2002 Fixed~ EMBRAER      EMB-~       2    55    NA Turbo~     20.7  
##  6 N105UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -0.267
##  7 N107US   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -5.73 
##  8 N108UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -1.25 
##  9 N109UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -2.52 
## 10 N110UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      2.8  
## # ... with 3,312 more rows
# Add each plane's mean flight distance alongside its mean delay.
p2 <- left_join(p1, flight2, by = "tailnum")
print(p2)
## # A tibble: 3,322 x 11
##    tailnum  year type   manufacturer model engines seats speed engine mean_delay
##    <chr>   <int> <chr>  <chr>        <chr>   <int> <int> <int> <chr>       <dbl>
##  1 N10156   2004 Fixed~ EMBRAER      EMB-~       2    55    NA Turbo~     12.7  
##  2 N102UW   1998 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      2.94 
##  3 N103US   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -6.93 
##  4 N104UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      1.80 
##  5 N10575   2002 Fixed~ EMBRAER      EMB-~       2    55    NA Turbo~     20.7  
##  6 N105UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -0.267
##  7 N107US   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -5.73 
##  8 N108UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -1.25 
##  9 N109UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~     -2.52 
## 10 N110UW   1999 Fixed~ AIRBUS INDU~ A320~       2   182    NA Turbo~      2.8  
## # ... with 3,312 more rows, and 1 more variable: mean_distance <dbl>
# One point per plane: mean distance flown (x) against mean arrival delay (y).
ggplot(p2, aes(x = mean_distance, y = mean_delay)) +
  geom_point()
## Warning: Removed 6 rows containing missing values (geom_point).

# Add the per-plane flight count, then plot that count (x) against the plane's
# mean arrival delay (y).
p3 <- left_join(p2, flight3, by = "tailnum")

ggplot(p3, aes(x = n, y = mean_delay)) +
  geom_point()
## Warning: Removed 6 rows containing missing values (geom_point).

# One point per plane: the plane's `year` value (x) against mean arrival delay (y).
ggplot(p3, aes(x = year, y = mean_delay)) +
  geom_point()
## Warning: Removed 76 rows containing missing values (geom_point).

2. Distance vs flight time

2a) Let’s only study flights departing from EWR. For each destination, compute the average flight time (air_time) and the number of miles to that destination. Make a plot with miles to destination on the horizontal axis and average flight time on the vertical axis.

# For flights departing EWR, average the distance and the air time per
# destination (missing values are ignored in both averages).
flight4 <- flights %>%
  filter(origin == "EWR") %>%
  group_by(dest) %>%
  summarise(
    mean_distance = mean(distance, na.rm = TRUE),
    flight_time = mean(air_time, na.rm = TRUE)
  )

# Distance to the destination (x) against average flight time (y).
ggplot(flight4, aes(x = mean_distance, y = flight_time)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

2b) Using this data, compute the distance divided by the average air_time. Call this the typical_speed. Plot the typical_speed on the vertical axis and distance on the horizontal axis.

# Per-destination summary for EWR departures, used below to derive typical_speed.
print(flight4)
## # A tibble: 86 x 3
##    dest  mean_distance flight_time
##    <chr>         <dbl>       <dbl>
##  1 ALB             143        31.8
##  2 ANC            3370       413. 
##  3 ATL             746       112. 
##  4 AUS            1504       211. 
##  5 AVL             583        89.8
##  6 BDL             116        25.5
##  7 BNA             748       115. 
##  8 BOS             200        40.3
##  9 BQN            1585       196. 
## 10 BTV             266        46.3
## # ... with 76 more rows
# typical_speed = mean distance / mean air time for each EWR destination.
# flight4 already holds exactly one row per dest, so this is a plain row-wise
# calculation: mutate() expresses that directly, with no need to regroup and
# summarise. Only dest and the new column are kept.
flight5 <- flight4 %>%
  mutate(typical_speed = mean_distance / flight_time) %>%
  select(dest, typical_speed)

flight5
## # A tibble: 86 x 2
##    dest  typical_speed
##    <chr>         <dbl>
##  1 ALB            4.50
##  2 ANC            8.16
##  3 ATL            6.66
##  4 AUS            7.12
##  5 AVL            6.49
##  6 BDL            4.56
##  7 BNA            6.53
##  8 BOS            4.96
##  9 BQN            8.08
## 10 BTV            5.75
## # ... with 76 more rows
# Put typical_speed next to the distance and flight-time columns, matching
# rows on destination.
flight6 <- left_join(flight4, flight5, by = "dest")

print(flight6)
## # A tibble: 86 x 4
##    dest  mean_distance flight_time typical_speed
##    <chr>         <dbl>       <dbl>         <dbl>
##  1 ALB             143        31.8          4.50
##  2 ANC            3370       413.           8.16
##  3 ATL             746       112.           6.66
##  4 AUS            1504       211.           7.12
##  5 AVL             583        89.8          6.49
##  6 BDL             116        25.5          4.56
##  7 BNA             748       115.           6.53
##  8 BOS             200        40.3          4.96
##  9 BQN            1585       196.           8.08
## 10 BTV             266        46.3          5.75
## # ... with 76 more rows
# Distance to the destination (x) against typical_speed (y).
ggplot(flight6, aes(x = mean_distance, y = typical_speed)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

3. Bad weather metric

Let’s fit a single linear model in which each observation is one origin × day combination. The outcome is the proportion of flights canceled at that origin on that day. The features will be simple summaries of that day’s weather.

I’ve done a bunch of work below to get you started… there is one line that you need to finish…

library(ISLR)
library(ggplot2)
# Outcome variable Y: for every origin and calendar day, the fraction of
# flight records whose dep_time is missing.
flights_canceled <- summarise(
  group_by(flights, origin, month, day),
  Y = mean(is.na(dep_time))
)
## `summarise()` has grouped output by 'origin', 'month'. You can override using the `.groups` argument.
# Daily weather features: for every origin and calendar day, the mean of each
# weather measurement with missing readings ignored. across() applies the same
# summary to every listed column and keeps the original column names and
# order. na.rm is spelled TRUE because T is an ordinary variable that can be
# reassigned.
bad_weather <-
  weather %>%
  group_by(origin, month, day) %>%
  summarise(
    across(
      c(temp, dewp, humid, wind_speed, wind_gust, precip, visib),
      ~ mean(.x, na.rm = TRUE)
    )
  )
## `summarise()` has grouped output by 'origin', 'month'. You can override using the `.groups` argument.
# Inspect the daily weather summaries.
print(bad_weather)
## # A tibble: 1,092 x 10
## # Groups:   origin, month [36]
##    origin month   day  temp  dewp humid wind_speed wind_gust precip visib
##    <chr>  <int> <int> <dbl> <dbl> <dbl>      <dbl>     <dbl>  <dbl> <dbl>
##  1 EWR        1     1  36.8  22.7  56.7      13.2       24.5      0 10   
##  2 EWR        1     2  28.7  11.7  48.8      10.9       22.8      0 10   
##  3 EWR        1     3  29.6  15.2  55.0       8.58     NaN        0 10   
##  4 EWR        1     4  34.3  19.9  55.7      14.0       25.8      0 10   
##  5 EWR        1     5  36.6  19.8  51.2       9.40      21.3      0 10   
##  6 EWR        1     6  39.9  27.2  62.8       9.11      19.0      0  9.21
##  7 EWR        1     7  40.3  23.8  53.4       7.34      22.1      0 10   
##  8 EWR        1     8  38.6  27.7  66.1       7.19      25.3      0  9.75
##  9 EWR        1     9  42.1  33.0  70.6       5.99      20.7      0  9.04
## 10 EWR        1    10  43.6  24.4  47.8       8.92      20.9      0 10   
## # ... with 1,082 more rows
# Inspect the daily cancellation proportions.
print(flights_canceled)
## # A tibble: 1,095 x 4
## # Groups:   origin, month [36]
##    origin month   day       Y
##    <chr>  <int> <int>   <dbl>
##  1 EWR        1     1 0.00328
##  2 EWR        1     2 0.0171 
##  3 EWR        1     3 0.00893
##  4 EWR        1     4 0.00590
##  5 EWR        1     5 0.00420
##  6 EWR        1     6 0.00332
##  7 EWR        1     7 0      
##  8 EWR        1     8 0.00599
##  9 EWR        1     9 0.00298
## 10 EWR        1    10 0.00291
## # ... with 1,085 more rows
# Modelling table: one row per origin/day from flights_canceled, with that
# day's weather summaries attached where available.
X <- left_join(
  flights_canceled,
  bad_weather,
  by = c("origin", "month", "day")
)

# Single linear regression of the cancellation proportion on all seven
# weather summaries.
fit <- lm(
  Y ~ temp + dewp + humid + wind_speed + wind_gust + precip + visib,
  data = X
)
summary(fit)
## 
## Call:
## lm(formula = Y ~ temp + dewp + humid + wind_speed + wind_gust + 
##     precip + visib, data = X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12203 -0.01826 -0.00674  0.00479  0.60254 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.179e-01  1.097e-01   1.074 0.283048    
## temp        -8.413e-04  2.216e-03  -0.380 0.704247    
## dewp         9.274e-04  2.390e-03   0.388 0.698128    
## humid        1.991e-04  1.155e-03   0.172 0.863187    
## wind_speed   2.360e-03  6.867e-04   3.437 0.000619 ***
## wind_gust    4.673e-05  6.157e-04   0.076 0.939521    
## precip       2.024e-01  1.689e-01   1.198 0.231149    
## visib       -1.321e-02  2.607e-03  -5.067 5.04e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05386 on 789 degrees of freedom
##   (298 observations deleted due to missingness)
## Multiple R-squared:  0.1976, Adjusted R-squared:  0.1905 
## F-statistic: 27.76 on 7 and 789 DF,  p-value: < 2.2e-16

4. Planes with four engines

Find all of the planes with 4 engines:

# Planes with exactly four engines. `engines` is an integer column, so compare
# against an integer literal rather than relying on coercion to the string "4".
planes %>%
  filter(engines == 4L)
## # A tibble: 4 x 9
##   tailnum  year type          manufacturer   model  engines seats speed engine  
##   <chr>   <int> <chr>         <chr>          <chr>    <int> <int> <int> <chr>   
## 1 N281AT     NA Fixed wing m~ AIRBUS INDUST~ A340-~       4   375    NA Turbo-j~
## 2 N381AA   1956 Fixed wing m~ DOUGLAS        DC-7BF       4   102   232 Recipro~
## 3 N670US   1990 Fixed wing m~ BOEING         747-4~       4   450    NA Turbo-j~
## 4 N840MQ   1974 Fixed wing m~ CANADAIR LTD   CF-5D        4     2    NA Turbo-j~