library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)
library(readr)
# Read the 2019-2021 Michigan flights CSV into a tibble.
miFlights <- read_csv(file = "miFlights2019-2021.csv")
# Per-column summary: missing counts, completeness, quantiles, histograms.
skim(miFlights)
| Name | miFlights |
| Number of rows | 463818 |
| Number of columns | 37 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 27 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| carrier | 0 | 1.00 | 2 | 2 | 0 | 16 | 0 |
| tailnum | 2189 | 1.00 | 3 | 6 | 0 | 5250 | 0 |
| origin | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| dest | 0 | 1.00 | 3 | 3 | 0 | 130 | 0 |
| carrier_name | 0 | 1.00 | 9 | 34 | 0 | 16 | 0 |
| plane_type | 11140 | 0.98 | 23 | 23 | 0 | 1 | 0 |
| plane_manufacturer | 11140 | 0.98 | 6 | 29 | 0 | 16 | 0 |
| plane_model | 11140 | 0.98 | 5 | 15 | 0 | 93 | 0 |
| plane_engine | 11140 | 0.98 | 9 | 9 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1.00 | 2019.92 | 0.85 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | 2021.00 | ▇▁▆▁▆ |
| month | 0 | 1.00 | 6.53 | 3.48 | 1.00 | 3.00 | 7.00 | 10.00 | 12.00 | ▇▅▅▅▇ |
| day | 0 | 1.00 | 15.74 | 8.76 | 1.00 | 8.00 | 16.00 | 23.00 | 31.00 | ▇▇▇▇▆ |
| dep_time | 9060 | 0.98 | 1372.26 | 490.52 | 1.00 | 950.00 | 1355.00 | 1754.00 | 2400.00 | ▁▇▇▇▆ |
| sched_dep_time | 0 | 1.00 | 1368.40 | 481.75 | 49.00 | 948.00 | 1355.00 | 1750.00 | 2336.00 | ▁▇▇▇▆ |
| dep_delay | 9063 | 0.98 | 7.11 | 44.97 | -54.00 | -5.00 | -3.00 | 0.00 | 2672.00 | ▇▁▁▁▁ |
| arr_time | 9324 | 0.98 | 1481.01 | 506.74 | 1.00 | 1053.00 | 1502.00 | 1905.00 | 2400.00 | ▁▅▇▇▆ |
| sched_arr_time | 0 | 1.00 | 1496.79 | 495.16 | 1.00 | 1103.00 | 1510.00 | 1910.00 | 2359.00 | ▁▃▇▇▇ |
| arr_delay | 10239 | 0.98 | 0.16 | 47.21 | -85.00 | -17.00 | -9.00 | 2.00 | 2649.00 | ▇▁▁▁▁ |
| flight | 0 | 1.00 | 413.37 | 269.57 | 1.00 | 189.00 | 387.00 | 600.00 | 1322.00 | ▇▇▆▂▁ |
| air_time | 10239 | 0.98 | 94.59 | 63.12 | 15.00 | 50.00 | 74.00 | 130.00 | 581.00 | ▇▂▁▁▁ |
| distance | 0 | 1.00 | 641.00 | 488.23 | 74.00 | 296.00 | 500.00 | 957.00 | 4475.00 | ▇▂▁▁▁ |
| hour | 0 | 1.00 | 13.41 | 4.79 | 0.00 | 9.00 | 13.00 | 17.00 | 23.00 | ▁▇▇▇▆ |
| minute | 0 | 1.00 | 27.49 | 17.94 | 0.00 | 11.00 | 27.00 | 44.00 | 59.00 | ▇▆▇▆▆ |
| temp | 441760 | 0.05 | 42.16 | 15.40 | -4.00 | 32.00 | 37.90 | 48.90 | 90.00 | ▁▆▇▂▁ |
| dewp | 441762 | 0.05 | 31.91 | 13.42 | -9.00 | 23.00 | 28.90 | 39.90 | 75.90 | ▁▆▇▃▁ |
| humid | 441773 | 0.05 | 68.87 | 15.10 | 25.87 | 57.93 | 71.82 | 80.66 | 100.00 | ▁▃▅▇▃ |
| wind_dir | 9205 | 0.98 | 181.02 | 109.46 | 0.00 | 80.00 | 200.00 | 270.00 | 360.00 | ▇▃▆▇▆ |
| wind_speed | 4367 | 0.99 | 8.59 | 5.64 | 0.00 | 4.60 | 8.06 | 11.51 | 42.58 | ▇▆▁▁▁ |
| wind_gust | 4367 | 0.99 | 9.88 | 6.50 | 0.00 | 5.30 | 9.27 | 13.24 | 49.00 | ▇▆▁▁▁ |
| precip | 430846 | 0.07 | 0.01 | 0.02 | 0.00 | 0.00 | 0.00 | 0.01 | 0.44 | ▇▁▁▁▁ |
| pressure | 447131 | 0.04 | 1018.83 | 7.60 | 990.40 | 1014.10 | 1019.00 | 1023.40 | 1038.50 | ▁▂▇▇▂ |
| visib | 1934 | 1.00 | 8.18 | 2.56 | 0.06 | 7.00 | 10.00 | 10.00 | 10.00 | ▁▁▁▂▇ |
| plane_year | 21647 | 0.95 | 2008.12 | 7.15 | 1987.00 | 2003.00 | 2007.00 | 2015.00 | 2021.00 | ▁▂▇▃▅ |
| plane_engines | 11140 | 0.98 | 2.00 | 0.02 | 2.00 | 2.00 | 2.00 | 2.00 | 3.00 | ▇▁▁▁▁ |
| plane_seats | 11140 | 0.98 | 127.86 | 66.68 | 20.00 | 80.00 | 95.00 | 182.00 | 451.00 | ▇▃▂▁▁ |
| plane_speed | 11140 | 0.98 | 0.01 | 1.72 | 0.00 | 0.00 | 0.00 | 0.00 | 438.00 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| time_hour | 0 | 1 | 2019-01-01 05:00:00 | 2021-12-31 22:00:00 | 2020-03-26 06:00:00 | 19059 |
# Compact, column-by-column preview of the data.
# (The call was previously misspelled `glipse`, which fails with
# "could not find function".)
glimpse(miFlights)
# Bar chart of flights per origin airport; geom_bar() counts the rows itself.
ggplot(data = miFlights, mapping = aes(x = origin)) +
  geom_bar()
# Named vector of airport labels for fct_recode(): each name is the display
# label shown on plots, each value is the airport code it replaces.
airport <- c(
  "Detroit Metro Airport" = "DTW",
  "Gerald R Ford International Airport" = "GRR",
  "Flint Bishop International Airport" = "FNT",
  "Capital Region International Airport" = "LAN"
)
# Column chart of flights per origin, tallest bar first. The counts are
# computed up front with count(), so geom_col() draws them as given.
miFlights %>%
  mutate(origin = fct_recode(origin, !!!airport)) %>%
  count(origin) %>%
  mutate(origin = fct_reorder(origin, -n)) %>%
  ggplot(aes(x = origin, y = n)) +
  geom_col() +
  labs(
    title = "Michigan flights 2019-2021",
    x = "Origin",
    y = "Number of flights",
    caption = "datasource miFlights20192021"
  ) +
  # Zero expansion on the y axis: no padding below or above the bars.
  scale_y_continuous(expand = expansion(mult = c(0, 0))) +
  theme_bw()
# Flights on January 1 (every year in the data); preview the first five rows.
janFlights <- filter(miFlights, month == 1 & day == 1)
slice_head(janFlights, n = 5)
## # A tibble: 5 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2019 1 1 55 2115 220 426 2323 303 OH
## 2 2019 1 1 455 500 -5 830 834 -4 YX
## 3 2019 1 1 506 511 -5 710 730 -20 AA
## 4 2019 1 1 531 535 -4 647 710 -23 WN
## 5 2019 1 1 534 545 -11 750 742 8 B6
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Flights on December 25 (every year in the data); preview the first five rows.
dec25 <- miFlights %>%
  filter(month == 12 & day == 25)
dec25 %>%
  slice_head(n = 5)
## # A tibble: 5 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2019 12 25 522 535 -13 709 732 -23 B6
## 2 2019 12 25 540 545 -5 736 803 -27 DL
## 3 2019 12 25 552 557 -5 827 812 15 OH
## 4 2019 12 25 557 600 -3 904 922 -18 F9
## 5 2019 12 25 557 600 -3 723 734 -11 NK
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# All flights in November or December, written two ways; the second
# assignment overwrites the first.
novDec <- filter(miFlights, month == 11 | month == 12)
novDec <- filter(miFlights, month %in% c(11, 12))
# All flights outside November and December.
notnovDec <- filter(miFlights, month != 11 & month != 12)
slice_head(notnovDec, n = 5)
## # A tibble: 5 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2019 1 1 55 2115 220 426 2323 303 OH
## 2 2019 1 1 455 500 -5 830 834 -4 YX
## 3 2019 1 1 506 511 -5 710 730 -20 AA
## 4 2019 1 1 531 535 -4 647 710 -23 WN
## 5 2019 1 1 534 545 -11 750 742 8 B6
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
## Missing values
# Rows where the departure time is missing.
miFlightsMiss <- filter(miFlights, is.na(dep_time))
# Rows where the departure time is present.
miFlightsComplete <- filter(miFlights, !is.na(dep_time))
## Ordering rows with arrange()
# Ascending by day: take the first five rows, then keep the first four columns.
miFlights %>%
  arrange(day) %>%
  slice_head(n = 5) %>%
  select(1:4)
## # A tibble: 5 × 4
## year month day dep_time
## <dbl> <dbl> <dbl> <dbl>
## 1 2019 1 1 55
## 2 2019 1 1 455
## 3 2019 1 1 506
## 4 2019 1 1 531
## 5 2019 1 1 534
# Descending by day: take the first five rows, then keep the first four columns.
miFlights %>%
  arrange(desc(day)) %>%
  slice_head(n = 5) %>%
  select(1:4)
## # A tibble: 5 × 4
## year month day dep_time
## <dbl> <dbl> <dbl> <dbl>
## 1 2019 1 31 59
## 2 2019 1 31 535
## 3 2019 1 31 540
## 4 2019 1 31 548
## 5 2019 1 31 549
# Chronological order: year first, then month, then day.
miFlightsorted <- arrange(miFlights, year, month, day)
# The three flights with the largest arrival delay.
miFlights %>%
  arrange(desc(arr_delay)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2019 2 17 1400 1728 2672 1531 1922 2649 MQ
## 2 2021 7 12 1643 815 1948 1754 913 1961 G4
## 3 2019 12 19 1402 722 1840 1556 1004 1792 AA
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# The three flights with the smallest (most negative) departure delay.
miFlights %>%
  arrange(dep_delay) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2020 3 29 2019 2113 -54 2301 2344 -43 G4
## 2 2020 3 31 1521 1610 -49 1750 1842 -52 G4
## 3 2019 1 11 2059 2145 -46 2250 2348 -58 NK
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# The three flights with the highest distance-to-air-time ratio.
miFlights %>%
  arrange(desc(distance / air_time)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2019 7 14 1230 1215 15 1339 1326 13 OO
## 2 2020 8 21 1645 1605 40 1621 1626 -5 OO
## 3 2021 11 21 1000 1000 0 1119 1132 -13 OO
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Flights out of GRR, longest distance first; ties are ordered by arr_time.
miFlights %>%
  filter(origin == "GRR") %>%
  arrange(desc(distance), arr_time) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
## year month day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2021 3 28 555 600 -5 714 758 -44 G4
## 2 2021 4 11 552 600 -8 716 740 -24 G4
## 3 2021 3 18 558 600 -2 720 758 -38 G4
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## # dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## # plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## # plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Drop columns by name.
miDropped <- select(miFlights, -month, -year)
# Drop the contiguous range of columns from year through month.
miDropped2 <- select(miFlights, -(year:month))
# Keep columns whose names end with "time".
timeFlights <- select(miFlights, ends_with("time"))
# Keep columns whose names start with "dep".
departureInfor <- select(miFlights, starts_with("dep"))
# Put flight, origin and dest first, keep everything else, and drop tailnum.
newFlights <- select(miFlights, flight, origin, dest, everything(), -tailnum)
# Keep only the delay columns plus distance and air time.
flights_sml <- select(miFlights, ends_with("delay"), distance, air_time)
# gain:  departure delay minus arrival delay.
# speed: 60 * distance / air_time (labelled mph on the plot below).
flightSpeed <- flights_sml %>%
  mutate(
    gain = dep_delay - arr_delay,
    speed = 60 * distance / air_time
  )
# Scatter plot of gain against speed, colored by distance; points are
# semi-transparent to cope with overplotting.
ggplot(flightSpeed, aes(x = speed, y = gain, color = distance)) +
  geom_point(alpha = .2) +
  labs(
    title = "Michigan flights gains by speed 2019-2021",
    x = "speed (mph)",
    y = "Gain (min)",
    color = "Distance (miles)",
    caption = "data source: anyFlights R package"
  ) +
  theme(legend.position = "bottom")
# Using group_by(): mean departure delay for each year/month/day combination,
# ignoring missing delays; show the first five days.
miFlights %>%
  group_by(year, month, day) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  slice_head(n = 5)
## # A tibble: 5 × 4
## year month day delay
## <dbl> <dbl> <dbl> <dbl>
## 1 2019 1 1 8.58
## 2 2019 1 2 12.5
## 3 2019 1 3 0.721
## 4 2019 1 4 -0.477
## 5 2019 1 5 2.17
# Mean arrival delay per carrier, ignoring flights with a missing delay.
delaySummary <- miFlights %>%
  group_by(carrier_name) %>%
  summarize(Delay = mean(arr_delay, na.rm = TRUE))
# Column chart of the mean delay per carrier, ordered from lowest to highest.
delaySummary %>%
  ggplot(aes(
    x = fct_reorder(carrier_name, Delay),
    y = Delay,
    fill = Delay > 0
  )) +
  geom_col() +
  labs(
    title = "average delay flights by Carrier \n Michigan flights, 2019-2021",
    x = "Carrier",
    y = "Delay (minutes)"
  ) +
  # Named values pin each color to its logical level, so the mapping stays
  # correct even if every carrier falls on the same side of zero.
  scale_fill_manual(values = c("FALSE" = "red", "TRUE" = "blue")) +
  theme_bw() +
  # The fill only encodes the sign of the bar, so its legend is omitted.
  theme(
    axis.text.x = element_text(angle = 60, vjust = .7),
    legend.position = "none"
  )
# Median arrival delay for each month and origin airport, ignoring flights
# with a missing delay. `.groups = "drop"` returns an ungrouped tibble.
delaySummary <- miFlights %>%
  group_by(month, origin) %>%
  summarize(Delay = median(arr_delay, na.rm = TRUE), .groups = "drop")
# Line chart of the median delay across months, one line per origin.
delaySummary %>%
  ggplot(aes(x = month, y = Delay, color = origin)) +
  geom_line() +
  labs(
    title = "Median arrival delay by month and origin \n Michigan flights, 2019-2021",
    x = "month",
    y = "Delay (minutes)",
    color = "Origin"
  ) +
  # One tick per month. `breaks` must be named: the first positional argument
  # of scale_x_continuous() is the axis name, not the breaks.
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  # Keep the legend: it is the only key to which line is which origin.
  theme(axis.text.x = element_text(vjust = .7), legend.position = "bottom")