library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)
MichiganFlights <- readRDS("C:/Users/HP/Downloads/fullMiFlights2021.rds")
unnesting the data so it is easier to call
# Create tibbles in environment from list
list2env(MichiganFlights, envir = .GlobalEnv)
## <environment: R_GlobalEnv>
glimpse(airlines)
## Rows: 15
## Columns: 2
## $ carrier <chr> "9E", "AA", "AS", "B6", "DL", "F9", "G4", "MQ", "NK", "OH", "O…
## $ name <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…
glimpse(flights)
## Rows: 149,445
## Columns: 19
## $ year <int> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 536, 557, 558, 600, 606, 610, 611, 611, 624, 624, 627, …
## $ sched_dep_time <int> 539, 600, 600, 607, 600, 615, 615, 616, 630, 615, 600, …
## $ dep_delay <dbl> -3, -3, -2, -7, 6, -5, -4, -5, -6, 9, 27, -5, -5, -1, 0…
## $ arr_time <int> 738, 758, 700, 820, 905, 809, 809, 804, 711, 806, 808, …
## $ sched_arr_time <int> 825, 748, 730, 831, 920, 832, 822, 826, 723, 800, 834, …
## $ arr_delay <dbl> -47, 10, -30, -11, -15, -23, -13, -22, -12, 6, -26, -7,…
## $ carrier <chr> "AA", "DL", "NK", "OH", "NK", "OH", "DL", "YX", "DL", "…
## $ flight <int> 90, 174, 5, 512, 21, 507, 120, 491, 173, 284, 140, 157,…
## $ tailnum <chr> "N750UW", "N354NB", "N653NK", "N507AE", "N675NK", "N600…
## $ origin <chr> "DTW", "GRR", "DTW", "FNT", "DTW", "GRR", "DTW", "DTW",…
## $ dest <chr> "PHX", "ATL", "LAS", "CLT", "FLL", "CLT", "ATL", "CLT",…
## $ air_time <dbl> 227, 104, 220, 95, 162, 91, 95, 87, 28, 61, 207, 66, 15…
## $ distance <dbl> 1671, 640, 1749, 555, 1127, 583, 594, 500, 120, 409, 15…
## $ hour <dbl> 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 6, 7, 7, 7…
## $ minute <dbl> 39, 0, 0, 7, 0, 15, 15, 16, 30, 15, 0, 0, 0, 0, 0, 50, …
## $ time_hour <dttm> 2021-01-01 05:00:00, 2021-01-01 06:00:00, 2021-01-01 0…
glimpse(airlines)
## Rows: 15
## Columns: 2
## $ carrier <chr> "9E", "AA", "AS", "B6", "DL", "F9", "G4", "MQ", "NK", "OH", "O…
## $ name <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…
skim(weather)
| Name | weather |
| Number of rows | 34897 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 13 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| origin | 0 | 1 | 3 | 3 | 0 | 4 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2021.00 | 0.00 | 2021.00 | 2021.00 | 2021.00 | 2021.00 | 2021.00 | ▁▁▇▁▁ |
| month | 0 | 1.00 | 6.51 | 3.44 | 1.00 | 4.00 | 7.00 | 9.00 | 12.00 | ▇▆▆▆▇ |
| day | 0 | 1.00 | 15.67 | 8.77 | 1.00 | 8.00 | 16.00 | 23.00 | 31.00 | ▇▇▇▇▆ |
| hour | 0 | 1.00 | 11.50 | 6.92 | 0.00 | 6.00 | 12.00 | 18.00 | 23.00 | ▇▇▆▇▇ |
| temp | 34395 | 0.01 | 45.75 | 18.05 | 10.90 | 30.90 | 37.90 | 64.00 | 82.90 | ▂▇▃▃▃ |
| dewp | 34396 | 0.01 | 36.89 | 18.27 | 5.00 | 21.90 | 30.00 | 51.10 | 73.90 | ▅▇▂▅▃ |
| humid | 34397 | 0.01 | 72.73 | 15.76 | 31.52 | 61.59 | 72.07 | 87.06 | 100.00 | ▂▃▇▆▇ |
| wind_dir | 952 | 0.97 | 181.80 | 107.41 | 0.00 | 90.00 | 200.00 | 270.00 | 360.00 | ▇▃▇▇▆ |
| wind_speed | 465 | 0.99 | 8.09 | 5.36 | 0.00 | 4.60 | 8.06 | 11.51 | 36.82 | ▇▆▂▁▁ |
| wind_gust | 465 | 0.99 | 9.31 | 6.17 | 0.00 | 5.30 | 9.27 | 13.24 | 42.38 | ▇▆▂▁▁ |
| precip | 33636 | 0.04 | 0.01 | 0.03 | 0.00 | 0.00 | 0.01 | 0.01 | 0.44 | ▇▁▁▁▁ |
| pressure | 34629 | 0.01 | 1011.31 | 7.07 | 1000.10 | 1004.38 | 1011.15 | 1018.50 | 1024.00 | ▇▅▅▅▆ |
| visib | 99 | 1.00 | 8.80 | 2.29 | 0.06 | 9.00 | 10.00 | 10.00 | 10.00 | ▁▁▁▁▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| time_hour | 0 | 1 | 2021-01-01 | 2021-12-30 23:00:00 | 2021-07-01 23:00:00 | 8735 |
skim(planes)
| Name | planes |
| Number of rows | 3962 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| tailnum | 0 | 1 | 5 | 6 | 0 | 3962 | 0 |
| type | 0 | 1 | 23 | 23 | 0 | 1 | 0 |
| manufacturer | 0 | 1 | 6 | 29 | 0 | 11 | 0 |
| model | 0 | 1 | 5 | 15 | 0 | 78 | 0 |
| engine | 0 | 1 | 9 | 9 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 112 | 0.97 | 2008.85 | 7.24 | 1990 | 2003 | 2009 | 2015 | 2021 | ▁▆▇▅▇ |
| engines | 0 | 1.00 | 2.00 | 0.05 | 2 | 2 | 2 | 2 | 3 | ▇▁▁▁▁ |
| seats | 0 | 1.00 | 154.73 | 74.05 | 20 | 95 | 149 | 190 | 451 | ▆▇▂▁▁ |
| speed | 0 | 1.00 | 0.00 | 0.00 | 0 | 0 | 0 | 0 | 0 | ▁▁▇▁▁ |
flightsCarriers <- flights %>% left_join(airlines,
by = c("carrier" = "carrier"))
flightsCarriers %>% group_by(name) %>% summarize(AveDistance = mean(distance, na.rm = TRUE)) %>% arrange(AveDistance)
## # A tibble: 15 × 2
## name AveDistance
## <chr> <dbl>
## 1 Endeavor Air Inc. 328.
## 2 Envoy Air 362.
## 3 Republic Airline 384.
## 4 SkyWest Airlines Inc. 428.
## 5 PSA Airlines Inc. 510.
## 6 JetBlue Airways 584.
## 7 Southwest Airlines Co. 600.
## 8 United Air Lines Inc. 679.
## 9 Mesa Airlines Inc. 764.
## 10 American Airlines Inc. 912.
## 11 Delta Air Lines Inc. 960.
## 12 Spirit Air Lines 1113.
## 13 Allegiant Air 1117.
## 14 Frontier Airlines Inc. 1177.
## 15 Alaska Airlines Inc. 1927
weatherFlights <- weather %>% left_join(flights, by = c( "origin", "year", "month", "day", "hour"))
nrow(weatherFlights)
## [1] 168159
FlightWeather <- flights %>% left_join(weather, by = c( "origin", "year", "month", "day", "hour"))
weatherFlightsFull <- flights %>% full_join(weather, by = c( "origin", "year", "month", "day", "hour"))
nrow(weatherFlightsFull)
## [1] 168504
How many flights are missing wind speed?
FlightWeather %>% pull(wind_speed) %>% is.na() %>% sum()
## [1] 1526
#preportion of missing wind speed of flights
FlightWeather %>% pull(wind_speed) %>% is.na() %>% mean()
## [1] 0.01021111
Inner join weather and flights
innerweatherflights <-flights %>% inner_join(weather, by = c( "origin", "year", "month", "day", "hour"))
nrow(innerweatherflights)
## [1] 149100
nrow(weatherFlights)
## [1] 168159
nrow(weatherFlights)-nrow(innerweatherflights)
## [1] 19059
completeWeatherFlights <- complete(innerweatherflights)
#weatherflights is the same as compleWeatherFlights
completeWeatherFlights %>% mutate(precip = replace_na(precip, 0))
## # A tibble: 149,100 × 29
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2021 1 1 536 539 -3 738 825 -47 AA
## 2 2021 1 1 557 600 -3 758 748 10 DL
## 3 2021 1 1 558 600 -2 700 730 -30 NK
## 4 2021 1 1 600 607 -7 820 831 -11 OH
## 5 2021 1 1 606 600 6 905 920 -15 NK
## 6 2021 1 1 610 615 -5 809 832 -23 OH
## 7 2021 1 1 611 615 -4 809 822 -13 DL
## 8 2021 1 1 611 616 -5 804 826 -22 YX
## 9 2021 1 1 624 630 -6 711 723 -12 DL
## 10 2021 1 1 624 615 9 806 800 6 WN
## # … with 149,090 more rows, 19 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour.x <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>,
## # wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour.y <dttm>, and abbreviated variable
## # names ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay