library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)
MichiganFlights <- readRDS("C:/Users/HP/Downloads/fullMiFlights2021.rds")

unnesting the data so it is easier to call

# Create tibbles in environment from list
list2env(MichiganFlights, envir = .GlobalEnv)
## <environment: R_GlobalEnv>
glimpse(airlines)
## Rows: 15
## Columns: 2
## $ carrier <chr> "9E", "AA", "AS", "B6", "DL", "F9", "G4", "MQ", "NK", "OH", "O…
## $ name    <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…
glimpse(flights)
## Rows: 149,445
## Columns: 19
## $ year           <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 536, 557, 558, 600, 606, 610, 611, 611, 624, 624, 627, …
## $ sched_dep_time <int> 539, 600, 600, 607, 600, 615, 615, 616, 630, 615, 600, …
## $ dep_delay      <dbl> -3, -3, -2, -7, 6, -5, -4, -5, -6, 9, 27, -5, -5, -1, 0…
## $ arr_time       <int> 738, 758, 700, 820, 905, 809, 809, 804, 711, 806, 808, …
## $ sched_arr_time <int> 825, 748, 730, 831, 920, 832, 822, 826, 723, 800, 834, …
## $ arr_delay      <dbl> -47, 10, -30, -11, -15, -23, -13, -22, -12, 6, -26, -7,…
## $ carrier        <chr> "AA", "DL", "NK", "OH", "NK", "OH", "DL", "YX", "DL", "…
## $ flight         <int> 90, 174, 5, 512, 21, 507, 120, 491, 173, 284, 140, 157,…
## $ tailnum        <chr> "N750UW", "N354NB", "N653NK", "N507AE", "N675NK", "N600…
## $ origin         <chr> "DTW", "GRR", "DTW", "FNT", "DTW", "GRR", "DTW", "DTW",…
## $ dest           <chr> "PHX", "ATL", "LAS", "CLT", "FLL", "CLT", "ATL", "CLT",…
## $ air_time       <dbl> 227, 104, 220, 95, 162, 91, 95, 87, 28, 61, 207, 66, 15…
## $ distance       <dbl> 1671, 640, 1749, 555, 1127, 583, 594, 500, 120, 409, 15…
## $ hour           <dbl> 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 6, 7, 7, 7…
## $ minute         <dbl> 39, 0, 0, 7, 0, 15, 15, 16, 30, 15, 0, 0, 0, 0, 0, 50, …
## $ time_hour      <dttm> 2021-01-01 05:00:00, 2021-01-01 06:00:00, 2021-01-01 0…
glimpse(airlines)
## Rows: 15
## Columns: 2
## $ carrier <chr> "9E", "AA", "AS", "B6", "DL", "F9", "G4", "MQ", "NK", "OH", "O…
## $ name    <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…
skim(weather)
Data summary
Name weather
Number of rows 34897
Number of columns 15
_______________________
Column type frequency:
character 1
numeric 13
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
origin 0 1 3 3 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2021.00 0.00 2021.00 2021.00 2021.00 2021.00 2021.00 ▁▁▇▁▁
month 0 1.00 6.51 3.44 1.00 4.00 7.00 9.00 12.00 ▇▆▆▆▇
day 0 1.00 15.67 8.77 1.00 8.00 16.00 23.00 31.00 ▇▇▇▇▆
hour 0 1.00 11.50 6.92 0.00 6.00 12.00 18.00 23.00 ▇▇▆▇▇
temp 34395 0.01 45.75 18.05 10.90 30.90 37.90 64.00 82.90 ▂▇▃▃▃
dewp 34396 0.01 36.89 18.27 5.00 21.90 30.00 51.10 73.90 ▅▇▂▅▃
humid 34397 0.01 72.73 15.76 31.52 61.59 72.07 87.06 100.00 ▂▃▇▆▇
wind_dir 952 0.97 181.80 107.41 0.00 90.00 200.00 270.00 360.00 ▇▃▇▇▆
wind_speed 465 0.99 8.09 5.36 0.00 4.60 8.06 11.51 36.82 ▇▆▂▁▁
wind_gust 465 0.99 9.31 6.17 0.00 5.30 9.27 13.24 42.38 ▇▆▂▁▁
precip 33636 0.04 0.01 0.03 0.00 0.00 0.01 0.01 0.44 ▇▁▁▁▁
pressure 34629 0.01 1011.31 7.07 1000.10 1004.38 1011.15 1018.50 1024.00 ▇▅▅▅▆
visib 99 1.00 8.80 2.29 0.06 9.00 10.00 10.00 10.00 ▁▁▁▁▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
time_hour 0 1 2021-01-01 2021-12-30 23:00:00 2021-07-01 23:00:00 8735
skim(planes)
Data summary
Name planes
Number of rows 3962
Number of columns 9
_______________________
Column type frequency:
character 5
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
tailnum 0 1 5 6 0 3962 0
type 0 1 23 23 0 1 0
manufacturer 0 1 6 29 0 11 0
model 0 1 5 15 0 78 0
engine 0 1 9 9 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 112 0.97 2008.85 7.24 1990 2003 2009 2015 2021 ▁▆▇▅▇
engines 0 1.00 2.00 0.05 2 2 2 2 3 ▇▁▁▁▁
seats 0 1.00 154.73 74.05 20 95 149 190 451 ▆▇▂▁▁
speed 0 1.00 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
flightsCarriers <- flights %>% left_join(airlines,
                                         by = c("carrier" = "carrier"))

flightsCarriers %>% group_by(name) %>% summarize(AveDistance = mean(distance, na.rm = TRUE)) %>% arrange(AveDistance)
## # A tibble: 15 × 2
##    name                   AveDistance
##    <chr>                        <dbl>
##  1 Endeavor Air Inc.             328.
##  2 Envoy Air                     362.
##  3 Republic Airline              384.
##  4 SkyWest Airlines Inc.         428.
##  5 PSA Airlines Inc.             510.
##  6 JetBlue Airways               584.
##  7 Southwest Airlines Co.        600.
##  8 United Air Lines Inc.         679.
##  9 Mesa Airlines Inc.            764.
## 10 American Airlines Inc.        912.
## 11 Delta Air Lines Inc.          960.
## 12 Spirit Air Lines             1113.
## 13 Allegiant Air                1117.
## 14 Frontier Airlines Inc.       1177.
## 15 Alaska Airlines Inc.         1927
weatherFlights <- weather %>% left_join(flights, by = c( "origin", "year", "month", "day", "hour"))

nrow(weatherFlights)
## [1] 168159
FlightWeather <- flights %>% left_join(weather, by = c( "origin", "year", "month", "day", "hour"))

weatherFlightsFull <- flights %>% full_join(weather, by = c( "origin", "year", "month", "day", "hour"))

nrow(weatherFlightsFull)
## [1] 168504

How many flights are missing wind speed?

FlightWeather %>% pull(wind_speed) %>% is.na() %>% sum()
## [1] 1526
#preportion of missing wind speed of flights
FlightWeather %>% pull(wind_speed) %>% is.na() %>% mean()
## [1] 0.01021111

Inner join weather and flights

innerweatherflights <-flights %>% inner_join(weather, by = c( "origin", "year", "month", "day", "hour"))

nrow(innerweatherflights)
## [1] 149100
nrow(weatherFlights)
## [1] 168159
nrow(weatherFlights)-nrow(innerweatherflights)
## [1] 19059
completeWeatherFlights <- complete(innerweatherflights)
#weatherflights is the same as compleWeatherFlights

completeWeatherFlights %>% mutate(precip = replace_na(precip, 0))
## # A tibble: 149,100 × 29
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2021     1     1      536        539      -3     738     825     -47 AA     
##  2  2021     1     1      557        600      -3     758     748      10 DL     
##  3  2021     1     1      558        600      -2     700     730     -30 NK     
##  4  2021     1     1      600        607      -7     820     831     -11 OH     
##  5  2021     1     1      606        600       6     905     920     -15 NK     
##  6  2021     1     1      610        615      -5     809     832     -23 OH     
##  7  2021     1     1      611        615      -4     809     822     -13 DL     
##  8  2021     1     1      611        616      -5     804     826     -22 YX     
##  9  2021     1     1      624        630      -6     711     723     -12 DL     
## 10  2021     1     1      624        615       9     806     800       6 WN     
## # … with 149,090 more rows, 19 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour.x <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>,
## #   wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour.y <dttm>, and abbreviated variable
## #   names ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay