library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)
miFlights <- read_csv("miFlights2019-2021.csv")
skim(miFlights)
| Name | miFlights |
| Number of rows | 463818 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 27 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| carrier | 0 | 1.00 | 2 | 2 | 0 | 16 | 0 |
| tailnum | 2189 | 1.00 | 3 | 6 | 0 | 5250 | 0 |
| origin | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| dest | 0 | 1.00 | 3 | 3 | 0 | 130 | 0 |
| carrier_name | 0 | 1.00 | 9 | 34 | 0 | 16 | 0 |
| plane_type | 11140 | 0.98 | 23 | 23 | 0 | 1 | 0 |
| plane_manufacturer | 11140 | 0.98 | 6 | 29 | 0 | 16 | 0 |
| plane_model | 11140 | 0.98 | 5 | 15 | 0 | 93 | 0 |
| plane_engine | 11140 | 0.98 | 9 | 9 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2019.92 | 0.85 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | 2021.00 | ▇▁▆▁▆ |
| month | 0 | 1.00 | 6.53 | 3.48 | 1.00 | 3.00 | 7.00 | 10.00 | 12.00 | ▇▅▅▅▇ |
| day | 0 | 1.00 | 15.74 | 8.76 | 1.00 | 8.00 | 16.00 | 23.00 | 31.00 | ▇▇▇▇▆ |
| dep_time | 9060 | 0.98 | 1372.26 | 490.52 | 1.00 | 950.00 | 1355.00 | 1754.00 | 2400.00 | ▁▇▇▇▆ |
| sched_dep_time | 0 | 1.00 | 1368.40 | 481.75 | 49.00 | 948.00 | 1355.00 | 1750.00 | 2336.00 | ▁▇▇▇▆ |
| dep_delay | 9063 | 0.98 | 7.11 | 44.97 | -54.00 | -5.00 | -3.00 | 0.00 | 2672.00 | ▇▁▁▁▁ |
| arr_time | 9324 | 0.98 | 1481.01 | 506.74 | 1.00 | 1053.00 | 1502.00 | 1905.00 | 2400.00 | ▁▅▇▇▆ |
| sched_arr_time | 0 | 1.00 | 1496.79 | 495.16 | 1.00 | 1103.00 | 1510.00 | 1910.00 | 2359.00 | ▁▃▇▇▇ |
| arr_delay | 10239 | 0.98 | 0.16 | 47.21 | -85.00 | -17.00 | -9.00 | 2.00 | 2649.00 | ▇▁▁▁▁ |
| flight | 0 | 1.00 | 413.37 | 269.57 | 1.00 | 189.00 | 387.00 | 600.00 | 1322.00 | ▇▇▆▂▁ |
| air_time | 10239 | 0.98 | 94.59 | 63.12 | 15.00 | 50.00 | 74.00 | 130.00 | 581.00 | ▇▂▁▁▁ |
| distance | 0 | 1.00 | 641.00 | 488.23 | 74.00 | 296.00 | 500.00 | 957.00 | 4475.00 | ▇▂▁▁▁ |
| hour | 0 | 1.00 | 13.41 | 4.79 | 0.00 | 9.00 | 13.00 | 17.00 | 23.00 | ▁▇▇▇▆ |
| minute | 0 | 1.00 | 27.49 | 17.94 | 0.00 | 11.00 | 27.00 | 44.00 | 59.00 | ▇▆▇▆▆ |
| temp | 441760 | 0.05 | 42.16 | 15.40 | -4.00 | 32.00 | 37.90 | 48.90 | 90.00 | ▁▆▇▂▁ |
| dewp | 441762 | 0.05 | 31.91 | 13.42 | -9.00 | 23.00 | 28.90 | 39.90 | 75.90 | ▁▆▇▃▁ |
| humid | 441773 | 0.05 | 68.87 | 15.10 | 25.87 | 57.93 | 71.82 | 80.66 | 100.00 | ▁▃▅▇▃ |
| wind_dir | 9205 | 0.98 | 181.02 | 109.46 | 0.00 | 80.00 | 200.00 | 270.00 | 360.00 | ▇▃▆▇▆ |
| wind_speed | 4367 | 0.99 | 8.59 | 5.64 | 0.00 | 4.60 | 8.06 | 11.51 | 42.58 | ▇▆▁▁▁ |
| wind_gust | 4367 | 0.99 | 9.88 | 6.50 | 0.00 | 5.30 | 9.27 | 13.24 | 49.00 | ▇▆▁▁▁ |
| precip | 430846 | 0.07 | 0.01 | 0.02 | 0.00 | 0.00 | 0.00 | 0.01 | 0.44 | ▇▁▁▁▁ |
| pressure | 447131 | 0.04 | 1018.83 | 7.60 | 990.40 | 1014.10 | 1019.00 | 1023.40 | 1038.50 | ▁▂▇▇▂ |
| visib | 1934 | 1.00 | 8.18 | 2.56 | 0.06 | 7.00 | 10.00 | 10.00 | 10.00 | ▁▁▁▂▇ |
| plane_year | 21647 | 0.95 | 2008.12 | 7.15 | 1987.00 | 2003.00 | 2007.00 | 2015.00 | 2021.00 | ▁▂▇▃▅ |
| plane_engines | 11140 | 0.98 | 2.00 | 0.02 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | ▇▁▁▁▁ |
| plane_seats | 11140 | 0.98 | 127.86 | 66.68 | 20.00 | 80.00 | 95.00 | 182.00 | 451.00 | ▇▃▂▁▁ |
| plane_speed | 11140 | 0.98 | 0.01 | 1.72 | 0.00 | 0.00 | 0.00 | 0.00 | 438.00 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| time_hour | 0 | 1 | 2019-01-01 05:00:00 | 2021-12-31 22:00:00 | 2020-03-26 06:00:00 | 19059 |
glimpse(miFlights)
## Rows: 463,818
## Columns: 37
## $ year <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 201…
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time <dbl> 55, 455, 506, 531, 534, 550, 555, 555, 555, 600, 60…
## $ sched_dep_time <dbl> 2115, 500, 511, 535, 545, 600, 600, 555, 600, 600, …
## $ dep_delay <dbl> 220, -5, -5, -4, -11, -10, -5, 0, -5, 0, -2, 3, 4, …
## $ arr_time <dbl> 426, 830, 710, 647, 750, 712, 822, 709, 755, 559, 9…
## $ sched_arr_time <dbl> 2323, 834, 730, 710, 742, 748, 834, 715, 817, 615, …
## $ arr_delay <dbl> 303, -4, -20, -23, 8, -36, -12, -6, -22, -16, 3, -7…
## $ carrier <chr> "OH", "YX", "AA", "WN", "B6", "YX", "OO", "WN", "DL…
## $ flight <dbl> 1019, 954, 185, 203, 310, 790, 803, 295, 348, 218, …
## $ tailnum <chr> "N567NN", "N433YX", "N853NN", "N227WN", "N203JB", "…
## $ origin <chr> "DTW", "GRR", "DTW", "DTW", "DTW", "DTW", "FNT", "D…
## $ dest <chr> "CLT", "MIA", "DFW", "BWI", "BOS", "EWR", "ATL", "D…
## $ air_time <dbl> 88, 187, 162, 64, 71, 61, 125, 174, 105, 45, 179, 1…
## $ distance <dbl> 500, 1214, 986, 409, 632, 488, 645, 1123, 640, 228,…
## $ hour <dbl> 21, 5, 5, 5, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ minute <dbl> 15, 0, 11, 35, 45, 0, 0, 55, 0, 0, 3, 0, 0, 5, 0, 1…
## $ time_hour <dttm> 2019-01-01 21:00:00, 2019-01-01 05:00:00, 2019-01-…
## $ temp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ dewp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ humid <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ wind_dir <dbl> 10, 320, 290, 290, 290, 290, 330, 290, 290, 290, 29…
## $ wind_speed <dbl> 8.05546, 9.20624, 19.56326, 19.56326, 19.56326, 13.…
## $ wind_gust <dbl> 9.270062, 10.594357, 22.513008, 22.513008, 22.51300…
## $ precip <dbl> NA, 1e-04, NA, NA, NA, NA, 1e-04, NA, NA, NA, NA, N…
## $ pressure <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ visib <dbl> 1.25, 9.00, 7.00, 7.00, 7.00, 4.00, 9.00, 7.00, 10.…
## $ carrier_name <chr> "PSA Airlines Inc.", "Republic Airline", "American …
## $ plane_year <dbl> 2015, 2014, 2010, 2005, 2006, 2016, 2006, 1999, 200…
## $ plane_type <chr> "Fixed wing multi engine", "Fixed wing multi engine…
## $ plane_manufacturer <chr> "BOMBARDIER INC", "EMBRAER S A", "BOEING", "BOEING"…
## $ plane_model <chr> "CL-600-2D24", "ERJ 170-200 LR", "737-823", "737-7H…
## $ plane_engines <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ plane_seats <dbl> 95, 88, 162, 140, 20, 88, 95, 149, 100, 140, 162, 1…
## $ plane_speed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ plane_engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan",…
miFlights %>% count(origin) %>% ggplot(aes(x = fct_reorder(origin, - n), y = n)) + geom_col(fill = "dodgerblue", color="black") + labs(
title ="Michigan departing flights, 2019-2021",
x = "Airports",
y = "Number of flights",
caption ="Datasource: anyflights R package") + theme_bw()
janFlights <- miFlights %>% filter(month == 1, day == 1)
janFlights %>% slice_head(n = 5)
dec25 <- miFlights %>% filter(month == 12, day == 25)
dec25 %>% slice_head(n = 5)
novDec <- miFlights %>% filter(month == 11 | month == 12)
novDec %>% slice_head(n = 5)
novDec <- miFlights %>% filter(month %in% c(11,12))
novDec %>% slice_head(n = 5)
notNovDec <- miFlights %>% filter(!(month == 11 | month == 12))
notNovDec %>% slice_head(n = 5)
#It will include flights with arrival and departure delays that are 120 minutes or less.
dplyr::filter(miFlights, !(arr_delay > 120 | dep_delay > 120))
#It will include flights where both the arrival and departure delays are 120 minutes or less. It is equivalent to the previous code
dplyr::filter(miFlights, arr_delay <= 120, dep_delay <= 120)
miFlightsMiss <- miFlights %>% filter(is.na(dep_time))
miFlightsMiss %>% slice_head(n = 5)
miFlightsComplete <- miFlights %>% filter(!(is.na(dep_time)))
miFlightsComplete %>% slice_head(n = 5)
miFlights %>% arrange(day) %>% select(1:4) %>% slice_head(n=5)
miFlights %>% arrange(desc(day)) %>% select(1:4) %>% slice_head(n=5)
miFlightsorted <- miFlights %>% arrange(year, month, day)
miFlightsorted %>% slice_head(n = 5)
miFlights %>% arrange(desc(arr_delay)) %>% slice_head(n=3)
miFlights %>% arrange(dep_delay) %>% slice_head(n=3)
miFlights %>% arrange( desc( distance / air_time)) %>% slice_head(n=3)
miFlights %>% filter(origin == "GRR") %>% arrange(desc(distance), arr_time) %>% slice_head(n=3)
miDropped <- miFlights %>% select(-month, -year)
miDropped %>% slice_head(n=3)
miDropped2 <- miFlights %>% select(-(year:month))
miDropped2 %>% slice_head(n=3)
timeFlights <- miFlights %>% select(ends_with("time"))
timeFlights %>% slice_head(n=3)
departureInfo <- miFlights %>% select(starts_with("dep"))
departureInfo %>% slice_head(n=3)
newFlights <- miFlights %>% select(flight, origin, dest, everything(), -tailnum)
newFlights %>% slice_head(n=3)
flights_sml <- miFlights %>% select(ends_with("delay"), distance, air_time)
flightSpeed <- flights_sml %>% mutate(gain = dep_delay - arr_delay, speed = 60*distance / air_time)
flightSpeed %>% slice_head(n=3)
flightSpeed %>% ggplot(aes(x =speed,
y = gain)) +
geom_point() +
labs(title = "Michigan flights gains by speed 2019-2021",
x = "Speed (mph)",
y = "Gain (min)",
color = "Distance (miles)",
caption = "data source: anyFlights R package" )
flightSpeed %>% ggplot(aes(x =speed,
y = gain,
color = distance)) +
geom_point(alpha = .2) +
labs(title = "Michigan flights gains by speed 2019-2021",
x = "speed (mph)",
y = "Gain (min)",
color = "Distance (miles)",
caption = "data source: anyFlights R package" ) +
theme(legend.position = "bottom")
#There is a noticeable pattern i.e It's a U shaped plot, it's like having two distribution
miFlights %>% group_by(year, month, day) %>%
summarize(delay = mean(dep_delay, na.rm = TRUE)) %>%
slice_head(n = 5)
delaySummary <- miFlights %>% group_by(carrier_name) %>%
summarize(Delay = mean(arr_delay, na.rm = T))
delaySummary %>% ggplot(aes(x = fct_reorder(carrier_name, Delay),
y = Delay,
fill = Delay > 0)) +
geom_col() +
labs(title = "Average delay flights by Carrier \n Michigan flights, 2019-2021",
x = "Carrier",
y = "Delay (minutes)") +
scale_fill_manual(values = c("red", "blue")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 60, vjust = .7), legend.position = "none")
# You Try Section
monthlyAirportDelays <- miFlights %>% group_by(month, origin) %>%
summarize(Delay = median(arr_delay, na.rm = T))
# Month which has worst Delays: 2
# Month flights are most early/on-time: 4
# To display the typical delay each month based on the median flight delay statistics, a line plot as a line plot can show the trend of delays over time.
monthlyAirportDelays %>% ggplot(aes(x = month,
y = Delay,
color = origin)) +
geom_line() +
labs(title = "Monthly Arrival delays",
subtitle = "Michigan flights, 2019-2021",
x = "Month",
y = "Median Delay (minutes)",
caption = "data source: anyFlights R package" ) +
scale_x_continuous(breaks = 1:12) +
theme_bw()
monthlyAirportDelays %>% ggplot(aes(x = month,
y = Delay,
color = origin)) +
geom_line() +
labs(title = "Monthly Arrival delays",
subtitle = "Michigan flights, 2019-2021",
x = "Month",
y = "Median Delay (minutes)",
caption = "data source: anyFlights R package" ) + facet_wrap(~ origin)+
scale_x_continuous(breaks = 1:12) +
theme_bw()
monthlyAirportDelaysMean <- miFlights %>% group_by(month, origin) %>%
summarize(Delay = mean(arr_delay, na.rm = T))
monthlyAirportDelaysMean %>% ggplot(aes(x = month,
y = Delay,
color = origin)) +
geom_line() +
labs(title = "Monthly Arrival delays",
subtitle = "Michigan flights, 2019-2021",
x = "Month",
y = "Mean Delay (minutes)",
caption = "data source: anyFlights R package" ) +
scale_x_continuous(breaks = 1:12) +
theme_bw()