Loading Packages

library(tidyverse)
library(lubridate)
library(knitr)
library(skimr)

Loading the dataset

miFlights <- read_csv("miFlights2019-2021.csv")

skim(miFlights)
Data summary
Name miFlights
Number of rows 463818
Number of columns 37
_______________________
Column type frequency:
character 9
numeric 27
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
carrier 0 1.00 2 2 0 16 0
tailnum 2189 1.00 3 6 0 5250 0
origin 0 1.00 3 3 0 4 0
dest 0 1.00 3 3 0 130 0
carrier_name 0 1.00 9 34 0 16 0
plane_type 11140 0.98 23 23 0 1 0
plane_manufacturer 11140 0.98 6 29 0 16 0
plane_model 11140 0.98 5 15 0 93 0
plane_engine 11140 0.98 9 9 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2019.92 0.85 2019.00 2019.00 2020.00 2021.00 2021.00 ▇▁▆▁▆
month 0 1.00 6.53 3.48 1.00 3.00 7.00 10.00 12.00 ▇▅▅▅▇
day 0 1.00 15.74 8.76 1.00 8.00 16.00 23.00 31.00 ▇▇▇▇▆
dep_time 9060 0.98 1372.26 490.52 1.00 950.00 1355.00 1754.00 2400.00 ▁▇▇▇▆
sched_dep_time 0 1.00 1368.40 481.75 49.00 948.00 1355.00 1750.00 2336.00 ▁▇▇▇▆
dep_delay 9063 0.98 7.11 44.97 -54.00 -5.00 -3.00 0.00 2672.00 ▇▁▁▁▁
arr_time 9324 0.98 1481.01 506.74 1.00 1053.00 1502.00 1905.00 2400.00 ▁▅▇▇▆
sched_arr_time 0 1.00 1496.79 495.16 1.00 1103.00 1510.00 1910.00 2359.00 ▁▃▇▇▇
arr_delay 10239 0.98 0.16 47.21 -85.00 -17.00 -9.00 2.00 2649.00 ▇▁▁▁▁
flight 0 1.00 413.37 269.57 1.00 189.00 387.00 600.00 1322.00 ▇▇▆▂▁
air_time 10239 0.98 94.59 63.12 15.00 50.00 74.00 130.00 581.00 ▇▂▁▁▁
distance 0 1.00 641.00 488.23 74.00 296.00 500.00 957.00 4475.00 ▇▂▁▁▁
hour 0 1.00 13.41 4.79 0.00 9.00 13.00 17.00 23.00 ▁▇▇▇▆
minute 0 1.00 27.49 17.94 0.00 11.00 27.00 44.00 59.00 ▇▆▇▆▆
temp 441760 0.05 42.16 15.40 -4.00 32.00 37.90 48.90 90.00 ▁▆▇▂▁
dewp 441762 0.05 31.91 13.42 -9.00 23.00 28.90 39.90 75.90 ▁▆▇▃▁
humid 441773 0.05 68.87 15.10 25.87 57.93 71.82 80.66 100.00 ▁▃▅▇▃
wind_dir 9205 0.98 181.02 109.46 0.00 80.00 200.00 270.00 360.00 ▇▃▆▇▆
wind_speed 4367 0.99 8.59 5.64 0.00 4.60 8.06 11.51 42.58 ▇▆▁▁▁
wind_gust 4367 0.99 9.88 6.50 0.00 5.30 9.27 13.24 49.00 ▇▆▁▁▁
precip 430846 0.07 0.01 0.02 0.00 0.00 0.00 0.01 0.44 ▇▁▁▁▁
pressure 447131 0.04 1018.83 7.60 990.40 1014.10 1019.00 1023.40 1038.50 ▁▂▇▇▂
visib 1934 1.00 8.18 2.56 0.06 7.00 10.00 10.00 10.00 ▁▁▁▂▇
plane_year 21647 0.95 2008.12 7.15 1987.00 2003.00 2007.00 2015.00 2021.00 ▁▂▇▃▅
plane_engines 11140 0.98 2.00 0.02 2.00 2.00 2.00 2.00 3.00 ▇▁▁▁▁
plane_seats 11140 0.98 127.86 66.68 20.00 80.00 95.00 182.00 451.00 ▇▃▂▁▁
plane_speed 11140 0.98 0.01 1.72 0.00 0.00 0.00 0.00 438.00 ▇▁▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
time_hour 0 1 2019-01-01 05:00:00 2021-12-31 22:00:00 2020-03-26 06:00:00 19059
glimpse(miFlights)
## Rows: 463,818
## Columns: 37
## $ year               <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 201…
## $ month              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ day                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ dep_time           <dbl> 55, 455, 506, 531, 534, 550, 555, 555, 555, 600, 60…
## $ sched_dep_time     <dbl> 2115, 500, 511, 535, 545, 600, 600, 555, 600, 600, …
## $ dep_delay          <dbl> 220, -5, -5, -4, -11, -10, -5, 0, -5, 0, -2, 3, 4, …
## $ arr_time           <dbl> 426, 830, 710, 647, 750, 712, 822, 709, 755, 559, 9…
## $ sched_arr_time     <dbl> 2323, 834, 730, 710, 742, 748, 834, 715, 817, 615, …
## $ arr_delay          <dbl> 303, -4, -20, -23, 8, -36, -12, -6, -22, -16, 3, -7…
## $ carrier            <chr> "OH", "YX", "AA", "WN", "B6", "YX", "OO", "WN", "DL…
## $ flight             <dbl> 1019, 954, 185, 203, 310, 790, 803, 295, 348, 218, …
## $ tailnum            <chr> "N567NN", "N433YX", "N853NN", "N227WN", "N203JB", "…
## $ origin             <chr> "DTW", "GRR", "DTW", "DTW", "DTW", "DTW", "FNT", "D…
## $ dest               <chr> "CLT", "MIA", "DFW", "BWI", "BOS", "EWR", "ATL", "D…
## $ air_time           <dbl> 88, 187, 162, 64, 71, 61, 125, 174, 105, 45, 179, 1…
## $ distance           <dbl> 500, 1214, 986, 409, 632, 488, 645, 1123, 640, 228,…
## $ hour               <dbl> 21, 5, 5, 5, 5, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ minute             <dbl> 15, 0, 11, 35, 45, 0, 0, 55, 0, 0, 3, 0, 0, 5, 0, 1…
## $ time_hour          <dttm> 2019-01-01 21:00:00, 2019-01-01 05:00:00, 2019-01-…
## $ temp               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ dewp               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ humid              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ wind_dir           <dbl> 10, 320, 290, 290, 290, 290, 330, 290, 290, 290, 29…
## $ wind_speed         <dbl> 8.05546, 9.20624, 19.56326, 19.56326, 19.56326, 13.…
## $ wind_gust          <dbl> 9.270062, 10.594357, 22.513008, 22.513008, 22.51300…
## $ precip             <dbl> NA, 1e-04, NA, NA, NA, NA, 1e-04, NA, NA, NA, NA, N…
## $ pressure           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ visib              <dbl> 1.25, 9.00, 7.00, 7.00, 7.00, 4.00, 9.00, 7.00, 10.…
## $ carrier_name       <chr> "PSA Airlines Inc.", "Republic Airline", "American …
## $ plane_year         <dbl> 2015, 2014, 2010, 2005, 2006, 2016, 2006, 1999, 200…
## $ plane_type         <chr> "Fixed wing multi engine", "Fixed wing multi engine…
## $ plane_manufacturer <chr> "BOMBARDIER INC", "EMBRAER S A", "BOEING", "BOEING"…
## $ plane_model        <chr> "CL-600-2D24", "ERJ 170-200 LR", "737-823", "737-7H…
## $ plane_engines      <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ plane_seats        <dbl> 95, 88, 162, 140, 20, 88, 95, 149, 100, 140, 162, 1…
## $ plane_speed        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ plane_engine       <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan",…

How many variables and how many observations are in this data set?

Answer: There are 37 variables and 463818 observations

Are there any notable patterns of missing values?

Answer: It’s a big dataset, It appears that weather variables i.e. “temp” “dewp” “humid” “precip” “pressure” there’s a lot of missing values in these columns.

Create a bar chart showing how many flights departed out of each airport (origin) using the count() and geom_col() functions. Also sort the bars by descending height using the fct_reorder() function.

miFlights %>% count(origin) %>% ggplot(aes(x = fct_reorder(origin, - n), y = n)) + geom_col(fill = "dodgerblue", color="black") + labs( 
  title ="Michigan departing flights, 2019-2021",
  x = "Airports",
  y = "Number of flights",
  caption ="Datasource: anyflights R package") + theme_bw()

Selecting all flights on January 1st in the data set, create a new object called janFlights

janFlights <- miFlights %>% filter(month == 1, day == 1)
janFlights %>% slice_head(n = 5)

Suppose we want to create a data set called dec25 that contains flight data from December 25th. What code would we need using the filter() function to create dec25?

dec25 <- miFlights %>% filter(month == 12, day == 25)
dec25 %>% slice_head(n = 5)

Find all flights that departed in November or December, creating an object called novDec.

novDec <- miFlights %>% filter(month == 11 | month == 12)
novDec %>% slice_head(n = 5)

Find all flights that departed in November or December using the %in% operator, creating an object called novDec.

novDec <- miFlights %>% filter(month %in% c(11,12))
novDec %>% slice_head(n = 5)

Select all flights except those in the months of November and December using !.

notNovDec <- miFlights %>% filter(!(month == 11 | month == 12))
notNovDec %>% slice_head(n = 5)

Knowing that arr_delay and dep_delay represent the arrival and departure delays in minutes respectively, what data set is produced using the code below?

#It will include flights with arrival and departure delays that are 120 minutes or less.
dplyr::filter(miFlights, !(arr_delay > 120 | dep_delay > 120))
#It will include flights where both the arrival and departure delays are 120 minutes or less. It is equivalent to the previous code
dplyr::filter(miFlights, arr_delay <= 120, dep_delay <= 120)

What does running the code below produce?

Answer: NA > 5 #NA ; 10 == NA #NA ; NA + 10 #NA ; NA / 2 #NA

How about the following code: NA == NA

Answer: NA

Create a new object called miFlightsComplete where all departure times are non-missing, and miFlightsMiss where all departure times are missing

miFlightsMiss <- miFlights %>% filter(is.na(dep_time))
miFlightsMiss %>% slice_head(n = 5)
miFlightsComplete <-   miFlights %>% filter(!(is.na(dep_time)))
miFlightsComplete %>% slice_head(n = 5)

Sort miFlights by the day of the flight (smallest to largest), and print the first 4 columns and 5 rows of the resulting data set using the slice_head() function.

miFlights %>% arrange(day) %>% select(1:4) %>% slice_head(n=5)

Sort miFlights by the day of the flight (largest to smallest), and print the first 4 columns and 5 rows of the resulting data set using the slice_head() function.

miFlights %>% arrange(desc(day)) %>% select(1:4) %>% slice_head(n=5)

Sort miFlights by the year, month, and day of the flight.

miFlightsorted <- miFlights %>% arrange(year, month, day)
miFlightsorted %>% slice_head(n = 5)

You Try Section

Sort miFlights to find the 3 most delayed flights (arr_delay), and the 3 that left the earliest relative to their scheduled departure (dep_delay).

miFlights %>% arrange(desc(arr_delay)) %>% slice_head(n=3)
miFlights %>% arrange(dep_delay) %>% slice_head(n=3)

Sort miFlights to find the 3 fastest (highest speed) flights.

miFlights %>% arrange( desc( distance / air_time)) %>% slice_head(n=3)

For flights coming out of GRR, find the 3 flights that traveled the farthest (distance) and that arrived the earliest in the morning (arr_time) simultaneously.

miFlights %>% filter(origin == "GRR") %>% arrange(desc(distance), arr_time) %>% slice_head(n=3)

Drop the year and month columns from miFlights creating a new data set called miDropped.

miDropped <- miFlights %>% select(-month, -year)
miDropped %>% slice_head(n=3)

Drop all variables between year and day columns (inclusive) from miFlights creating a new data set called miDropped2.

miDropped2 <- miFlights %>% select(-(year:month))
miDropped2 %>% slice_head(n=3)

You Try Section

Create a subset of the miFlights data set called timeFlights that only contains variables that end with the word “time”.

timeFlights <- miFlights %>% select(ends_with("time"))
timeFlights %>% slice_head(n=3)

Create a new data frame called departureInfo that only has variables that start with “dep”

departureInfo <- miFlights %>% select(starts_with("dep"))
departureInfo %>% slice_head(n=3)

Create a new data frame call newFlights by rearranging the columns of the full miFlights data set so that flight number (flight), origin (origin), and destination (dest) are provided first, then all other columns except the tail number (tailnum).

newFlights <- miFlights %>% select(flight, origin, dest, everything(), -tailnum)
newFlights %>% slice_head(n=3)

Extending the code provided with a single call to mutate(), create a new variable, speed, that is equal to distance divided by air_time, producing a new data set called flightSpeeds.

flights_sml <- miFlights %>% select(ends_with("delay"), distance, air_time)

flightSpeed <- flights_sml %>% mutate(gain = dep_delay - arr_delay, speed = 60*distance / air_time)

flightSpeed %>% slice_head(n=3)

Create a plot showing the relationship between the speed and time gain of each flight, adding appropriate axis and title labels.

flightSpeed %>% ggplot(aes(x =speed,
                            y = gain)) +
  geom_point() +
  labs(title = "Michigan flights gains by speed 2019-2021",
       x = "Speed (mph)", 
       y = "Gain (min)",
       color = "Distance (miles)",
       caption = "data source: anyFlights R package" )

Add color to the plot to display the distance the flight traveled as well. Is there a noticeable pattern?

flightSpeed %>% ggplot(aes(x =speed,
                            y = gain,
                           color = distance)) +
  geom_point(alpha = .2) +
  labs(title = "Michigan flights gains by speed 2019-2021",
       x = "speed (mph)", 
       y = "Gain (min)",
       color = "Distance (miles)",
       caption = "data source: anyFlights R package" ) +
  theme(legend.position = "bottom")

#There is a noticeable pattern i.e It's a U shaped plot, it's like having two distribution 

Group-wise operations and statistics with group_by() & summarize()

miFlights %>% group_by(year, month, day) %>% 
summarize(delay = mean(dep_delay, na.rm = TRUE)) %>% 
  slice_head(n = 5)
delaySummary <- miFlights %>% group_by(carrier_name) %>% 
  summarize(Delay = mean(arr_delay, na.rm = T))

Reproduce the waterfall plot below using this summary table.

delaySummary %>% ggplot(aes(x = fct_reorder(carrier_name, Delay),
                            y = Delay,
                            fill = Delay > 0)) +
  geom_col() +
  labs(title = "Average delay flights by Carrier \n Michigan flights, 2019-2021",
       x = "Carrier",
       y = "Delay (minutes)") +
  scale_fill_manual(values = c("red", "blue")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 60, vjust = .7), legend.position = "none")

# You Try Section

Create a data frame summarizing the median flight delay (arr_delay) by month. Which month has the worst delays? In which month are flights most early / on-time?

monthlyAirportDelays <- miFlights %>% group_by(month, origin) %>% 
  summarize(Delay = median(arr_delay, na.rm = T))


# Month which has worst Delays: 2
# Month flights are most early/on-time: 4

Extend the plot in 2. by faceting by where the flight departed from (origin). You will need to use group_by() again to do this. What are your observations?

monthlyAirportDelays %>% ggplot(aes(x = month,
                            y = Delay,
                            color = origin)) +
  geom_line() +
  labs(title = "Monthly Arrival delays",
      subtitle = "Michigan flights, 2019-2021",
       x = "Month",
       y = "Median Delay (minutes)",
       caption = "data source: anyFlights R package" ) + facet_wrap(~ origin)+
  scale_x_continuous(breaks = 1:12) +
  theme_bw()

Bonus (optional): Create a line chart showing the average daily flight delay across time for each of the major airports

monthlyAirportDelaysMean <- miFlights %>% group_by(month, origin) %>% 
  summarize(Delay = mean(arr_delay, na.rm = T))
monthlyAirportDelaysMean %>% ggplot(aes(x = month,
                            y = Delay,
                            color = origin)) +
  geom_line() +
  labs(title = "Monthly Arrival delays",
      subtitle = "Michigan flights, 2019-2021",
       x = "Month",
       y = "Mean Delay (minutes)",
       caption = "data source: anyFlights R package" ) + 
  scale_x_continuous(breaks = 1:12) +
  theme_bw()