library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)

Importing the data set and exploring it

# readr provides read_csv(); it is attached explicitly here.
library(readr)
# Read the Michigan flights CSV from the working directory.
miFlights <- read_csv(file = "miFlights2019-2021.csv")

# Print a per-column overview: types, missing counts and summary statistics.
skim(miFlights)
Data summary
Name miFlights
Number of rows 463818
Number of columns 37
_______________________
Column type frequency:
character 9
numeric 27
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
carrier 0 1.00 2 2 0 16 0
tailnum 2189 1.00 3 6 0 5250 0
origin 0 1.00 3 3 0 4 0
dest 0 1.00 3 3 0 130 0
carrier_name 0 1.00 9 34 0 16 0
plane_type 11140 0.98 23 23 0 1 0
plane_manufacturer 11140 0.98 6 29 0 16 0
plane_model 11140 0.98 5 15 0 93 0
plane_engine 11140 0.98 9 9 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2019.92 0.85 2019.00 2019.00 2020.00 2021.00 2021.00 ▇▁▆▁▆
month 0 1.00 6.53 3.48 1.00 3.00 7.00 10.00 12.00 ▇▅▅▅▇
day 0 1.00 15.74 8.76 1.00 8.00 16.00 23.00 31.00 ▇▇▇▇▆
dep_time 9060 0.98 1372.26 490.52 1.00 950.00 1355.00 1754.00 2400.00 ▁▇▇▇▆
sched_dep_time 0 1.00 1368.40 481.75 49.00 948.00 1355.00 1750.00 2336.00 ▁▇▇▇▆
dep_delay 9063 0.98 7.11 44.97 -54.00 -5.00 -3.00 0.00 2672.00 ▇▁▁▁▁
arr_time 9324 0.98 1481.01 506.74 1.00 1053.00 1502.00 1905.00 2400.00 ▁▅▇▇▆
sched_arr_time 0 1.00 1496.79 495.16 1.00 1103.00 1510.00 1910.00 2359.00 ▁▃▇▇▇
arr_delay 10239 0.98 0.16 47.21 -85.00 -17.00 -9.00 2.00 2649.00 ▇▁▁▁▁
flight 0 1.00 413.37 269.57 1.00 189.00 387.00 600.00 1322.00 ▇▇▆▂▁
air_time 10239 0.98 94.59 63.12 15.00 50.00 74.00 130.00 581.00 ▇▂▁▁▁
distance 0 1.00 641.00 488.23 74.00 296.00 500.00 957.00 4475.00 ▇▂▁▁▁
hour 0 1.00 13.41 4.79 0.00 9.00 13.00 17.00 23.00 ▁▇▇▇▆
minute 0 1.00 27.49 17.94 0.00 11.00 27.00 44.00 59.00 ▇▆▇▆▆
temp 441760 0.05 42.16 15.40 -4.00 32.00 37.90 48.90 90.00 ▁▆▇▂▁
dewp 441762 0.05 31.91 13.42 -9.00 23.00 28.90 39.90 75.90 ▁▆▇▃▁
humid 441773 0.05 68.87 15.10 25.87 57.93 71.82 80.66 100.00 ▁▃▅▇▃
wind_dir 9205 0.98 181.02 109.46 0.00 80.00 200.00 270.00 360.00 ▇▃▆▇▆
wind_speed 4367 0.99 8.59 5.64 0.00 4.60 8.06 11.51 42.58 ▇▆▁▁▁
wind_gust 4367 0.99 9.88 6.50 0.00 5.30 9.27 13.24 49.00 ▇▆▁▁▁
precip 430846 0.07 0.01 0.02 0.00 0.00 0.00 0.01 0.44 ▇▁▁▁▁
pressure 447131 0.04 1018.83 7.60 990.40 1014.10 1019.00 1023.40 1038.50 ▁▂▇▇▂
visib 1934 1.00 8.18 2.56 0.06 7.00 10.00 10.00 10.00 ▁▁▁▂▇
plane_year 21647 0.95 2008.12 7.15 1987.00 2003.00 2007.00 2015.00 2021.00 ▁▂▇▃▅
plane_engines 11140 0.98 2.00 0.02 2.00 2.00 2.00 2.00 3.00 ▇▁▁▁▁
plane_seats 11140 0.98 127.86 66.68 20.00 80.00 95.00 182.00 451.00 ▇▃▂▁▁
plane_speed 11140 0.98 0.01 1.72 0.00 0.00 0.00 0.00 438.00 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
time_hour 0 1 2019-01-01 05:00:00 2021-12-31 22:00:00 2020-03-26 06:00:00 19059
# Compact column-by-column preview of the data (name, type, first values).
glimpse(miFlights)

Bar Chart

# Bar chart of flights per origin airport; geom_bar() counts the rows itself.
ggplot(data = miFlights, mapping = aes(x = origin)) +
  geom_bar()

# Lookup vector for fct_recode(): each name is the label to display and each
# value is the airport code as it appears in `origin`.
airport <- c(
  "Detroit Metro Airport" = "DTW",
  "Gerald R. Ford International Airport" = "GRR",
  "Flint Bishop International Airport" = "FNT",
  "Capital Region International Airport" = "LAN"
)

# Count flights per origin first, then draw the counts with geom_col(),
# ordering the bars from the busiest airport to the least busy.
miFlights %>%
  mutate(origin = fct_recode(origin, !!!airport)) %>%
  count(origin) %>%
  ggplot(aes(x = fct_reorder(origin, -n), y = n)) +
  geom_col() +
  labs(
    title = "Michigan flights 2019-2021",
    x = "Origin",
    y = "Number of flights",
    caption = "datasource miFlights20192021"
  ) +
  # Zero expansion so the bars sit directly on the x axis.
  scale_y_continuous(expand = expansion(mult = c(0, 0))) +
  theme_bw()

Using the filter() function

# Flights that departed on January 1 (any year present in the data).
janFlights <- filter(miFlights, month == 1 & day == 1)

# Preview the first five rows.
slice_head(janFlights, n = 5)
## # A tibble: 5 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2019     1     1       55        2115     220     426    2323     303 OH     
## 2  2019     1     1      455         500      -5     830     834      -4 YX     
## 3  2019     1     1      506         511      -5     710     730     -20 AA     
## 4  2019     1     1      531         535      -4     647     710     -23 WN     
## 5  2019     1     1      534         545     -11     750     742       8 B6     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Flights that departed on December 25 (any year present in the data).
dec25 <- filter(miFlights, month == 12 & day == 25)

# Preview the first five rows.
slice_head(dec25, n = 5)
## # A tibble: 5 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2019    12    25      522         535     -13     709     732     -23 B6     
## 2  2019    12    25      540         545      -5     736     803     -27 DL     
## 3  2019    12    25      552         557      -5     827     812      15 OH     
## 4  2019    12    25      557         600      -3     904     922     -18 F9     
## 5  2019    12    25      557         600      -3     723     734     -11 NK     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# All flights in November or December, written as an explicit OR ...
novDec <- miFlights %>%
  filter(month == 11 | month == 12)

# ... and the same selection expressed with set membership.
novDec <- miFlights %>%
  filter(month %in% c(11, 12))

# All flights outside November and December
# (De Morgan form of "not (November or December)").
notnovDec <- miFlights %>%
  filter(month != 11 & month != 12)

slice_head(notnovDec, n = 5)
## # A tibble: 5 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2019     1     1       55        2115     220     426    2323     303 OH     
## 2  2019     1     1      455         500      -5     830     834      -4 YX     
## 3  2019     1     1      506         511      -5     710     730     -20 AA     
## 4  2019     1     1      531         535      -4     647     710     -23 WN     
## 5  2019     1     1      534         545     -11     750     742       8 B6     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …

## Missing values

# Rows whose departure time is missing.
miFlightsMiss <- filter(miFlights, is.na(dep_time))
# Rows whose departure time is present.
miFlightsComplete <- filter(miFlights, !is.na(dep_time))

## Arrange rows with arrange()

# Sort ascending by day, then show the first four columns of the top five rows.
miFlights %>%
  arrange(day) %>%
  select(1:4) %>%
  slice_head(n = 5)
## # A tibble: 5 × 4
##    year month   day dep_time
##   <dbl> <dbl> <dbl>    <dbl>
## 1  2019     1     1       55
## 2  2019     1     1      455
## 3  2019     1     1      506
## 4  2019     1     1      531
## 5  2019     1     1      534
# Sort descending by day, then show the first four columns of the top five rows.
miFlights %>%
  arrange(desc(day)) %>%
  select(1:4) %>%
  slice_head(n = 5)
## # A tibble: 5 × 4
##    year month   day dep_time
##   <dbl> <dbl> <dbl>    <dbl>
## 1  2019     1    31       59
## 2  2019     1    31      535
## 3  2019     1    31      540
## 4  2019     1    31      548
## 5  2019     1    31      549
# Chronological order: sort by year, then month within year, then day.
miFlightsorted <- arrange(miFlights, year, month, day)

You-try section

# The three flights with the largest arrival delay.
miFlights %>%
  arrange(desc(arr_delay)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2019     2    17     1400        1728    2672    1531    1922    2649 MQ     
## 2  2021     7    12     1643         815    1948    1754     913    1961 G4     
## 3  2019    12    19     1402         722    1840    1556    1004    1792 AA     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# The three flights with the smallest (most negative) departure delay.
miFlights %>%
  arrange(dep_delay) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2020     3    29     2019        2113     -54    2301    2344     -43 G4     
## 2  2020     3    31     1521        1610     -49    1750    1842     -52 G4     
## 3  2019     1    11     2059        2145     -46    2250    2348     -58 NK     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# The three flights with the highest distance-to-air-time ratio.
miFlights %>%
  arrange(desc(distance / air_time)) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2019     7    14     1230        1215      15    1339    1326      13 OO     
## 2  2020     8    21     1645        1605      40    1621    1626      -5 OO     
## 3  2021    11    21     1000        1000       0    1119    1132     -13 OO     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Flights out of GRR, longest distance first; ties are ordered by arrival
# time ascending. Show the top three.
miFlights %>%
  filter(origin == "GRR") %>%
  arrange(desc(distance), arr_time) %>%
  slice_head(n = 3)
## # A tibble: 3 × 37
##    year month   day dep_time sched_dep…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##   <dbl> <dbl> <dbl>    <dbl>       <dbl>   <dbl>   <dbl>   <dbl>   <dbl> <chr>  
## 1  2021     3    28      555         600      -5     714     758     -44 G4     
## 2  2021     4    11      552         600      -8     716     740     -24 G4     
## 3  2021     3    18      558         600      -2     720     758     -38 G4     
## # … with 27 more variables: flight <dbl>, tailnum <chr>, origin <chr>,
## #   dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, carrier_name <chr>, plane_year <dbl>, plane_type <chr>,
## #   plane_manufacturer <chr>, plane_model <chr>, plane_engines <dbl>,
## #   plane_seats <dbl>, plane_speed <dbl>, plane_engine <chr>, and abbreviated …
# Drop month and year by naming each column ...
miDropped <- select(miFlights, -c(month, year))

# ... or by negating the year:month column range.
miDropped2 <- select(miFlights, -(year:month))
# Keep only the columns whose names end in "time".
timeFlights <- select(miFlights, ends_with("time"))

# Keep only the columns whose names start with "dep".
departureInfor <- select(miFlights, starts_with("dep"))
# Move flight, origin and dest to the front, keep the rest, drop tailnum.
newFlights <- select(
  miFlights,
  flight, origin, dest, everything(), -tailnum
)
# Keep the two delay columns plus distance and air time.
flights_sml <- select(miFlights, ends_with("delay"), distance, air_time)

# gain:  departure delay minus arrival delay.
# speed: distance over air_time scaled by 60 (labelled mph on the plot below).
flightSpeed <- mutate(
  flights_sml,
  gain = dep_delay - arr_delay,
  speed = 60 * distance / air_time
)

# Scatter plot of gain against speed, coloured by distance; points are
# drawn semi-transparent.
ggplot(
  data = flightSpeed,
  mapping = aes(x = speed, y = gain, color = distance)
) +
  geom_point(alpha = .2) +
  labs(
    title = "Michigan flights gains by speed 2019-2021",
    x = "speed (mph)",
    y = "Gain (min)",
    color = "Distance (miles)",
    caption = "data source: anyFlights R package"
  ) +
  theme(legend.position = "bottom")

# Using group_by(): mean departure delay for each calendar day
# (missing delays are ignored), previewing the first five days.
miFlights %>%
  group_by(year, month, day) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  slice_head(n = 5)
## # A tibble: 5 × 4
##    year month   day  delay
##   <dbl> <dbl> <dbl>  <dbl>
## 1  2019     1     1  8.58 
## 2  2019     1     2 12.5  
## 3  2019     1     3  0.721
## 4  2019     1     4 -0.477
## 5  2019     1     5  2.17
# Mean arrival delay per carrier; missing delays are ignored.
delaySummary <- miFlights %>%
  group_by(carrier_name) %>%
  summarize(Delay = mean(arr_delay, na.rm = TRUE))

# Bar chart of the mean delay per carrier, bars ordered by delay. The fill
# separates carriers with a positive mean delay from the rest.
delaySummary %>%
  ggplot(aes(
    x = fct_reorder(carrier_name, Delay),
    y = Delay,
    fill = Delay > 0
  )) +
  geom_col() +
  labs(
    title = "average delay flights by Carrier \n Michigan flights, 2019-2021",
    x = "Carrier",
    y = "Delay (minutes)"
  ) +
  # Named values pin each colour to its logical level, so the mapping stays
  # correct even if only one of TRUE/FALSE occurs in the data.
  scale_fill_manual(values = c("FALSE" = "red", "TRUE" = "blue")) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 60, vjust = .7),
    legend.position = "none"
  )

# Median arrival delay for each month and origin airport; missing delays are
# ignored. The result is returned ungrouped.
delaySummary <- miFlights %>%
  group_by(month, origin) %>%
  summarize(Delay = median(arr_delay, na.rm = TRUE), .groups = "drop")

# Line chart: one line per origin airport across the calendar months.
delaySummary %>%
  ggplot(aes(x = month, y = Delay, color = origin)) +
  geom_line() +
  labs(
    title = "Median arrival delay by month and origin \n Michigan flights, 2019-2021",
    x = "month",
    y = "Delay (minutes)",
    color = "Origin"
  ) +
  # `breaks` must be named: the first positional argument of
  # scale_x_continuous() is the axis name, not the tick positions.
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  # The legend is kept because colour is the only key to the origin airports.
  theme(axis.text.x = element_text(vjust = .7), legend.position = "bottom")