# put your answer here.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(dplyr)
library(tidyr)
library(ggplot2)
# Print the planes table to inspect its columns before joining on tailnum.
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing m~ EMBRAER EMB-1~ 2 55 NA Turbo-~
## 2 N102UW 1998 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 3 N103US 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 4 N104UW 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 5 N10575 2002 Fixed wing m~ EMBRAER EMB-1~ 2 55 NA Turbo-~
## 6 N105UW 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 7 N107US 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 8 N108UW 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 9 N109UW 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## 10 N110UW 1999 Fixed wing m~ AIRBUS INDUST~ A320-~ 2 182 NA Turbo-~
## # ... with 3,312 more rows
# Mean arrival delay per tail number; missing arr_delay values are ignored.
flight1 <- summarise(
  group_by(flights, tailnum),
  mean_delay = mean(arr_delay, na.rm = TRUE)
)
# Mean flight distance per tail number; missing distances are ignored.
flight2 <- summarise(
  group_by(flights, tailnum),
  mean_distance = mean(distance, na.rm = TRUE)
)
flight2
## # A tibble: 4,044 x 2
## tailnum mean_distance
## <chr> <dbl>
## 1 D942DN 854.
## 2 N0EGMQ 676.
## 3 N10156 758.
## 4 N102UW 536.
## 5 N103US 535.
## 6 N104UW 535.
## 7 N10575 520.
## 8 N105UW 525.
## 9 N107US 529.
## 10 N108UW 534.
## # ... with 4,034 more rows
# Number of flights recorded for each tail number (column `n`).
flight3 <- flights %>%
  count(tailnum)
flight3
## # A tibble: 4,044 x 2
## tailnum n
## <chr> <int>
## 1 D942DN 4
## 2 N0EGMQ 371
## 3 N10156 153
## 4 N102UW 48
## 5 N103US 46
## 6 N104UW 47
## 7 N10575 289
## 8 N105UW 45
## 9 N107US 41
## 10 N108UW 60
## # ... with 4,034 more rows
# Attach each plane's mean arrival delay; planes without a match get NA.
p1 <- left_join(planes, flight1, by = "tailnum")
p1
## # A tibble: 3,322 x 10
## tailnum year type manufacturer model engines seats speed engine mean_delay
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 N10156 2004 Fixed~ EMBRAER EMB-~ 2 55 NA Turbo~ 12.7
## 2 N102UW 1998 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 2.94
## 3 N103US 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -6.93
## 4 N104UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 1.80
## 5 N10575 2002 Fixed~ EMBRAER EMB-~ 2 55 NA Turbo~ 20.7
## 6 N105UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -0.267
## 7 N107US 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -5.73
## 8 N108UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -1.25
## 9 N109UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -2.52
## 10 N110UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 2.8
## # ... with 3,312 more rows
# Attach each plane's mean flight distance alongside its mean delay.
p2 <- left_join(p1, flight2, by = "tailnum")
p2
## # A tibble: 3,322 x 11
## tailnum year type manufacturer model engines seats speed engine mean_delay
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr> <dbl>
## 1 N10156 2004 Fixed~ EMBRAER EMB-~ 2 55 NA Turbo~ 12.7
## 2 N102UW 1998 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 2.94
## 3 N103US 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -6.93
## 4 N104UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 1.80
## 5 N10575 2002 Fixed~ EMBRAER EMB-~ 2 55 NA Turbo~ 20.7
## 6 N105UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -0.267
## 7 N107US 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -5.73
## 8 N108UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -1.25
## 9 N109UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ -2.52
## 10 N110UW 1999 Fixed~ AIRBUS INDU~ A320~ 2 182 NA Turbo~ 2.8
## # ... with 3,312 more rows, and 1 more variable: mean_distance <dbl>
# Scatter plot: mean arrival delay (y) against mean flight distance (x),
# one point per plane.
ggplot(p2, aes(x = mean_distance, y = mean_delay)) +
  geom_point()
## Warning: Removed 6 rows containing missing values (geom_point).
# Attach the per-plane flight count, then plot mean arrival delay (y)
# against the number of flights flown (x).
p3 <- left_join(p2, flight3, by = "tailnum")
ggplot(p3, aes(x = n, y = mean_delay)) +
  geom_point()
## Warning: Removed 6 rows containing missing values (geom_point).
# Scatter plot: mean arrival delay (y) against the plane's `year` column (x).
ggplot(p3, aes(x = year, y = mean_delay)) +
  geom_point()
## Warning: Removed 76 rows containing missing values (geom_point).
2a) Let’s only study flights departing from EWR. For each destination, compute the average flight time (air_time) and the number of miles to that destination. Make a plot with miles to destination on the horizontal axis and average flight time on the vertical axis.
# For flights leaving EWR, summarise each destination by its mean distance
# and mean air time (missing values ignored), then plot air time against
# distance.
flight4 <- flights %>%
  filter(origin == "EWR") %>%
  group_by(dest) %>%
  summarise(
    mean_distance = mean(distance, na.rm = TRUE),
    flight_time = mean(air_time, na.rm = TRUE)
  )
ggplot(flight4, aes(x = mean_distance, y = flight_time)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
2b) Using this data, compute the distance divided by the average air_time. Call this the typical_speed. Plot the typical_speed on the vertical axis and distance on the horizontal axis.
# Print the per-destination summary (dest, mean_distance, flight_time).
flight4
## # A tibble: 86 x 3
## dest mean_distance flight_time
## <chr> <dbl> <dbl>
## 1 ALB 143 31.8
## 2 ANC 3370 413.
## 3 ATL 746 112.
## 4 AUS 1504 211.
## 5 AVL 583 89.8
## 6 BDL 116 25.5
## 7 BNA 748 115.
## 8 BOS 200 40.3
## 9 BQN 1585 196.
## 10 BTV 266 46.3
## # ... with 76 more rows
# typical_speed = mean distance / mean air time for each destination.
# flight4 already holds one row per dest, so a row-wise mutate is enough.
flight5 <- flight4 %>%
  mutate(typical_speed = mean_distance / flight_time) %>%
  select(dest, typical_speed)
flight5
## # A tibble: 86 x 2
## dest typical_speed
## <chr> <dbl>
## 1 ALB 4.50
## 2 ANC 8.16
## 3 ATL 6.66
## 4 AUS 7.12
## 5 AVL 6.49
## 6 BDL 4.56
## 7 BNA 6.53
## 8 BOS 4.96
## 9 BQN 8.08
## 10 BTV 5.75
## # ... with 76 more rows
# Put typical_speed next to the distance and air-time columns, keyed by dest.
flight6 <- left_join(flight4, flight5, by = "dest")
flight6
## # A tibble: 86 x 4
## dest mean_distance flight_time typical_speed
## <chr> <dbl> <dbl> <dbl>
## 1 ALB 143 31.8 4.50
## 2 ANC 3370 413. 8.16
## 3 ATL 746 112. 6.66
## 4 AUS 1504 211. 7.12
## 5 AVL 583 89.8 6.49
## 6 BDL 116 25.5 4.56
## 7 BNA 748 115. 6.53
## 8 BOS 200 40.3 4.96
## 9 BQN 1585 196. 8.08
## 10 BTV 266 46.3 5.75
## # ... with 76 more rows
# Scatter plot: typical speed (y) against mean distance (x), one point per
# destination.
ggplot(flight6, aes(x = mean_distance, y = typical_speed)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
Let’s fit a linear model in which each observation is one origin × day combination. The outcome is the proportion of flights canceled on that day at that origin. The features will be simple daily summaries of the weather.
I’ve done a bunch of work below to get you started… there is one line that you need to finish…
library(ISLR)
library(ggplot2)
# Outcome variable: for every origin/month/day, Y is the share of flights
# whose dep_time is missing (treated here as canceled).
flights_canceled <- summarise(
  group_by(flights, origin, month, day),
  Y = mean(is.na(dep_time))
)
## `summarise()` has grouped output by 'origin', 'month'. You can override using the `.groups` argument.
# Daily weather features: for every origin/month/day, the mean of each weather
# measurement with missing readings ignored. A measurement with no non-missing
# readings in a group comes out as NaN.
# `na.rm = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, which would silently change the result.
bad_weather <-
  weather %>%
  group_by(origin, month, day) %>%
  summarize(
    across(
      c(temp, dewp, humid, wind_speed, wind_gust, precip, visib),
      ~ mean(.x, na.rm = TRUE)
    )
  )
## `summarise()` has grouped output by 'origin', 'month'. You can override using the `.groups` argument.
# Print the daily weather summary (one row per origin/month/day).
bad_weather
## # A tibble: 1,092 x 10
## # Groups: origin, month [36]
## origin month day temp dewp humid wind_speed wind_gust precip visib
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 1 1 36.8 22.7 56.7 13.2 24.5 0 10
## 2 EWR 1 2 28.7 11.7 48.8 10.9 22.8 0 10
## 3 EWR 1 3 29.6 15.2 55.0 8.58 NaN 0 10
## 4 EWR 1 4 34.3 19.9 55.7 14.0 25.8 0 10
## 5 EWR 1 5 36.6 19.8 51.2 9.40 21.3 0 10
## 6 EWR 1 6 39.9 27.2 62.8 9.11 19.0 0 9.21
## 7 EWR 1 7 40.3 23.8 53.4 7.34 22.1 0 10
## 8 EWR 1 8 38.6 27.7 66.1 7.19 25.3 0 9.75
## 9 EWR 1 9 42.1 33.0 70.6 5.99 20.7 0 9.04
## 10 EWR 1 10 43.6 24.4 47.8 8.92 20.9 0 10
## # ... with 1,082 more rows
# Print the daily cancellation proportions (one row per origin/month/day).
flights_canceled
## # A tibble: 1,095 x 4
## # Groups: origin, month [36]
## origin month day Y
## <chr> <int> <int> <dbl>
## 1 EWR 1 1 0.00328
## 2 EWR 1 2 0.0171
## 3 EWR 1 3 0.00893
## 4 EWR 1 4 0.00590
## 5 EWR 1 5 0.00420
## 6 EWR 1 6 0.00332
## 7 EWR 1 7 0
## 8 EWR 1 8 0.00599
## 9 EWR 1 9 0.00298
## 10 EWR 1 10 0.00291
## # ... with 1,085 more rows
# Design table: cancellation proportion joined to that day's weather summary.
X <- left_join(
  flights_canceled,
  bad_weather,
  by = c("origin", "month", "day")
)

# Linear model of the cancellation proportion on the weather summaries.
# lm() drops rows that have a missing value in any model variable.
fit <- lm(
  Y ~ temp + dewp + humid + wind_speed + wind_gust + precip + visib,
  data = X
)
summary(fit)
##
## Call:
## lm(formula = Y ~ temp + dewp + humid + wind_speed + wind_gust +
## precip + visib, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12203 -0.01826 -0.00674 0.00479 0.60254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.179e-01 1.097e-01 1.074 0.283048
## temp -8.413e-04 2.216e-03 -0.380 0.704247
## dewp 9.274e-04 2.390e-03 0.388 0.698128
## humid 1.991e-04 1.155e-03 0.172 0.863187
## wind_speed 2.360e-03 6.867e-04 3.437 0.000619 ***
## wind_gust 4.673e-05 6.157e-04 0.076 0.939521
## precip 2.024e-01 1.689e-01 1.198 0.231149
## visib -1.321e-02 2.607e-03 -5.067 5.04e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05386 on 789 degrees of freedom
## (298 observations deleted due to missingness)
## Multiple R-squared: 0.1976, Adjusted R-squared: 0.1905
## F-statistic: 27.76 on 7 and 789 DF, p-value: < 2.2e-16
Find all of the planes with 4 engines:
# Planes with exactly four engines. `engines` is an integer column, so it is
# compared with the integer 4L; comparing with the string "4" only worked
# through implicit coercion of the column to character.
planes %>%
  filter(engines == 4L)
## # A tibble: 4 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N281AT NA Fixed wing m~ AIRBUS INDUST~ A340-~ 4 375 NA Turbo-j~
## 2 N381AA 1956 Fixed wing m~ DOUGLAS DC-7BF 4 102 232 Recipro~
## 3 N670US 1990 Fixed wing m~ BOEING 747-4~ 4 450 NA Turbo-j~
## 4 N840MQ 1974 Fixed wing m~ CANADAIR LTD CF-5D 4 2 NA Turbo-j~