This analysis was created as a self-selected case study for the Google Data Analytics Certificate. The data from this case study was found on Kaggle, was posted by Akul Bahl, and is originally from the US Department of Transportation https://www.kaggle.com/akulbahl/covid19-airline-flight-delays-and-cancellations/metadata. The purpose of this script is to reduce the data size (to make it compatible with the free version of rstudio), clean the data, and then answer the question: “Which airline has the most reliable flights to and from Austin?”
Step 4: Clean Data and Prepare for Analysis
#Looks for total number of null values (NA)
sum(is.na(flight_data_v2))
[1] 5196
#Displays rows with nulls. Looks like most are in column DEP_DELAY when a flight was also marked as cancelled.
flight_data_v2[rowSums(is.na(flight_data_v2)) > 0, ]
#This shows that there are no null values outside of DEP_DELAY.
flight_data_v2[rowSums(is.na(flight_data_v2)) > 0 & !is.na(flight_data_v2$DEP_DELAY), ]
#This shows that the only null values in DEP_DELAY occur when the flight was cancelled.
flight_data_v2[sum(is.na(flight_data_v2$DEP_DELAY)) > 0 & isTRUE(flight_data_v2$CANCELLED == 1), ]
#Change FL_DATE from chr (character) to date in format: YYYY-MM-DD
flight_data_v2$FL_DATE = as.Date(flight_data_v2$FL_DATE, "%m/%d/%Y")
head(flight_data_v2)
#Check that all dates are within expected bounds (January, 1st to October 7th, 2020)
if (flight_data_v2$FL_DATE < "2020-01-01" | flight_data_v2$FL_DATE > "2020-10-07") {
print("There is at least one date out of range.")
} else {print("No dates are out of range.")}
[1] "No dates are out of range."
Step 6: Graphical Representation
AA: American Airlines
AS: Alaska Airlines
B6: JetBlue
DL: Delta Air Lines
F9: Frontier Airlines
G4: Allegiant Air
HA: Hawaiian Airlines
NK: Spirit Airlines
UA: United Airlines
WN: Southwest Airlines
To determine which airlines were the least reliable, I first looked at the breakdown of flight delays for each airline. For the purposes of analysis, I categorized delays based on their impact to connecting flights/chance of missing planned meetings.
On Time: Flight arrived early or exactly at scheduled time
Mild Delay: Flight is less than 15 minutes late
Moderate Delay: Flight is between 15 and 60 minutes late (inclusive)
Extreme Delay: Flight is more than 60 minutes late
Cancelled: Flight is cancelled
# graph of percent cancelled flights
grouped_data_by_airline$MKT_UNIQUE_CARRIER <- as.factor(grouped_data_by_airline$MKT_UNIQUE_CARRIER)
library(RColorBrewer)
chart_data <- pivot_longer(select(grouped_data_by_airline, c('MKT_UNIQUE_CARRIER', 'cancelled', 'extreme_delay', 'mild_delay', 'moderate_delay', 'on_time', 'total_flights')), cols = c(cancelled, extreme_delay, moderate_delay, mild_delay, on_time), names_to = "Flight_Status", values_to = "Percent")
chart_data$Flight_Status <- as.factor(chart_data$Flight_Status)
chart_data$Flight_Status <- factor(chart_data$Flight_Status, levels = c("cancelled", "extreme_delay", "moderate_delay", "mild_delay", "on_time"))
chart_data$MKT_UNIQUE_CARRIER <- as.factor(chart_data$MKT_UNIQUE_CARRIER)
head(chart_data)
ggplot(chart_data, aes(fill = Flight_Status, y = Percent, x = MKT_UNIQUE_CARRIER)) +
geom_bar(position = "dodge", stat = "identity") +
theme(plot.title = element_text(size = 10)) +
ggtitle("Percent of Cancelled, Delayed, On-Time Flights By Airline in Austin") +
xlab("Airline")

# This graph shows the breakdown of flights to/from Austin by flight delay status for each airline.
grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = cancelled, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights Cancelled Per Airline") +
xlab("Airline") +
ylab("Percent of Flights Cancelled") +
geom_text(aes(label = round(cancelled, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = extreme_delay, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights Extremely Delayed Per Airline") +
xlab("Airline") +
ylab("Percent of Flights Extremely Delayed") +
geom_text(aes(label = round(extreme_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

# This graph shows that the airline G4, which is Alliegent Air, is the least reliable.
grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = extreme_delay + cancelled, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights Cancelled or Extremely Delayed Per Airline") +
xlab("Airline") +
ylab("Percent of Flights Cancelled or Extremely Delayed") +
geom_text(aes(label = round(extreme_delay + cancelled, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = moderate_delay, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights Moderately Delayed Per Airline") +
xlab("Airline") +
ylab("Percent of Flights Moderately Delayed (between 15 min. and 1 hour") +
geom_text(aes(label = round(moderate_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = mild_delay, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights Mildly Delayed (15 min or less) Per Airline") +
xlab("Airline") +
ylab("Percent of Flights Mildly Delayed") +
geom_text(aes(label = round(mild_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = on_time, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights On Time Per Airline") +
xlab("Airline") +
ylab("Percent of Flights On Time") +
geom_text(aes(label = round(on_time, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

# This boxplot of delay times per airline shows that airlines typically had about 75% of flights on time. I excluded this graph from my actual analysis because I had to exclude extreme delays for the graph to be readable.
flight_data_v2 %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = DEP_DELAY, fill = MKT_UNIQUE_CARRIER)) +
geom_boxplot(outlier.shape = NA) +
ylim(-20, 15) +
scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
ggtitle("Airline Delay Times") +
xlab("Airline") +
ylab("Delay Time")

# This graph looks at the distribution of delays less than or equal to 1 hour.
delayed_mild_mod <- flight_data_v2[flight_data_v2$DEP_DELAY > 0 & flight_data_v2$DEP_DELAY <= 60, ]
delayed_mild_mod %>%
ggplot( aes(x = DEP_DELAY, group = MKT_UNIQUE_CARRIER, fill = MKT_UNIQUE_CARRIER)) +
geom_density(adjust = 1.5) +
facet_wrap(~MKT_UNIQUE_CARRIER) +
ggtitle("Airline Delay Times") +
xlab("Airline") +
ylab("Delay Time")

# This graph shows the distribution of delays and has quartile lines to better show the distribution of flight delays. Based on this graph, of the mild-moderate delays, WN, which is Southwest Airlines, has the shortest mild-moderate delays.
delayed_mild_mod %>%
ggplot( aes(x = DEP_DELAY, y = MKT_UNIQUE_CARRIER, fill = factor(stat(quantile)))) +
stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE, quantiles = 4, quantile_lines = TRUE) +
scale_fill_viridis_d(name = "Quartiles") +
theme_ridges() +
theme(legend.position = "none")
Picking joint bandwidth of 3.78

# Overall, the most reliable airline is NK, which is Spirit Airlines.
grouped_data_by_airline %>%
ggplot( aes(x = MKT_UNIQUE_CARRIER, y = on_time + mild_delay, fill = MKT_UNIQUE_CARRIER)) +
geom_bar(stat = "identity") +
ggtitle("Percent of Flights On Time or Delayed Less Than 15 Min") +
xlab("Airline") +
ylab("Percent of Flights On Time or Mildly Delayed") +
geom_text(aes(label = round(on_time + mild_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
theme(legend.position = "none")

---
title: Flights_From_Austin
output: html_notebook
date: 2/7/2022
author: Malory Wodka
---

This analysis was created as a self-selected case study for the Google Data Analytics Certificate. The data from this case study was found on Kaggle, was posted by Akul Bahl, and is originally from the US Department of Transportation <https://www.kaggle.com/akulbahl/covid19-airline-flight-delays-and-cancellations/metadata>. The purpose of this script is to reduce the data size (to make it compatible with the free version of rstudio), clean the data, and then answer the question: "Which airline has the most reliable flights to and from Austin?" 

# Step 1: Load required packages
* tidyverse for data wrangling
* readr for loading csv file
* lubridate for time functions
* dplyr for data transformation
* ggplot2, viridis, and ggridges for graphing

```{r Step 1, message=FALSE, warning=FALSE}
library(tidyverse)
library(readr)
library(lubridate)
library(dplyr)
library(ggplot2)
library(viridis)
library(ggridges)
```

# Step 2: Select potentially useful columns from original dataset
* The original dataset was too large for the free version of rstudio to handle without crashing, so I selected the columns (using Kaggle) to export the smaller dataset 'flight_data_v2'.

The following was completed in a Kaggle notebook.
-----
This code creates a subset of the original dataset from <https://www.kaggle.com/akulbahl/covid19-airline-flight-delays-and-cancellations/metadata> of the columns I might likely use for my data analysis.

1. library(tidyverse) 

 // metapackage of all tidyverse packages

2. flight_data <- read_csv("../input/covid19-airline-flight-delays-and-cancellations/jantojun2020.csv")

3. library(readr)

4. colnames(flight_data)

5. flight_data_v2 <- select(flight_data, FL_DATE, MKT_UNIQUE_CARRIER,  ORIGIN_CITY_NAME, DEST_STATE_ABR, DEST_CITY_NAME, DEP_DELAY, CANCELLED)

6. head(flight_data_v2)

//Checked to ensure correct columns with corresponding values were selected

7. colnames(flight_data_v2)

// The columns selected to export and double-checked with colnames() were FL_DATE, MKT_UNIQUE_CARRIER, ORIGIN_CITY_NAME, DEST_STATE_ABR, DEST_CITY_NAME, DEP_DELAY, CANCELLED

8. flight_data_v2 <- filter(flight_data_v2, ORIGIN_CITY_NAME == "Austin, TX" | DEST_CITY_NAME == "Austin, TX")

//I was only interested in flights leaving Austin or returning to Austin for this case study.

9. write.csv(flight_data_v2, "flight_data_v2.csv", row.names = F)

// This creates the new csv file that I downloaded to my computer and uploaded to RStudio.

# Step 3: Import Data to RStudio

```{r Step 3, message=FALSE, warning=FALSE}
flight_data_v2 <- read_csv("Flight_Data_Kaggle/flight_data_v2.csv")
head(flight_data_v2)
```

# Step 4: Clean Data and Prepare for Analysis

```{r Step 4: Data Cleaning, message=FALSE, warning=FALSE}
#Looks for total number of null values (NA)
sum(is.na(flight_data_v2))

#Displays rows with nulls. Looks like most are in column DEP_DELAY when a flight was also marked as cancelled.
flight_data_v2[rowSums(is.na(flight_data_v2)) > 0, ]

#This shows that there are no null values outside of DEP_DELAY.
flight_data_v2[rowSums(is.na(flight_data_v2)) > 0 & !is.na(flight_data_v2$DEP_DELAY), ]

#This shows that the only null values in DEP_DELAY occur when the flight was cancelled.
flight_data_v2[sum(is.na(flight_data_v2$DEP_DELAY)) > 0 & isTRUE(flight_data_v2$CANCELLED == 1), ]

#Change FL_DATE from chr (character) to date in format: YYYY-MM-DD
flight_data_v2$FL_DATE = as.Date(flight_data_v2$FL_DATE, "%m/%d/%Y")
head(flight_data_v2)

#Check that all dates are within expected bounds (January, 1st to October 7th, 2020)
if (flight_data_v2$FL_DATE < "2020-01-01" | flight_data_v2$FL_DATE > "2020-10-07") {
  print("There is at least one date out of range.")
  } else {print("No dates are out of range.")}

```
# Step 5: Descriptive Analysis

```{r Step 5: Descriptive Analysis, message=TRUE, warning=FALSE, paged.print=TRUE}

grouped_data_by_airline <- flight_data_v2 %>%
  group_by(MKT_UNIQUE_CARRIER) %>%
  summarise(mean_dep_delay = mean(DEP_DELAY, na.rm = TRUE),
            median_dep_delay = median(DEP_DELAY, na.rm = TRUE),
            total_flights_cancelled = sum(CANCELLED),
            total_flights_from_austin = sum(ORIGIN_CITY_NAME == "Austin, TX"),
            total_cancelled_from_austin = sum(CANCELLED & ORIGIN_CITY_NAME == "Austin, TX"),
            total_flights_to_austin = sum(DEST_CITY_NAME == "Austin, TX"),
            total_cancelled_to_austin = sum(CANCELLED & DEST_CITY_NAME == "Austin, TX"),
            total_flights = sum(total_flights_from_austin, total_flights_to_austin),
            extreme_delay_n = sum(DEP_DELAY > 60 & !is.na(DEP_DELAY)),
            moderate_delay_n = sum(DEP_DELAY > 15 & DEP_DELAY <= 60 & !is.na(DEP_DELAY)),
            mild_delay_n = sum(DEP_DELAY > 0 & DEP_DELAY <= 15 & !is.na(DEP_DELAY)))

grouped_data_by_airline <- mutate(grouped_data_by_airline, cancelled = (total_flights_cancelled / total_flights)*100)

grouped_data_by_airline <- mutate(grouped_data_by_airline, extreme_delay =  (extreme_delay_n / total_flights)*100)
  
grouped_data_by_airline <- mutate(grouped_data_by_airline, moderate_delay =  (moderate_delay_n / total_flights)*100)

grouped_data_by_airline <- mutate(grouped_data_by_airline, mild_delay =  (mild_delay_n / total_flights)*100)

grouped_data_by_airline <- mutate(grouped_data_by_airline, on_time =  (100 - (mild_delay_n + moderate_delay_n + extreme_delay_n + total_flights_cancelled) / total_flights*100))

view(grouped_data_by_airline)

```

# Step 6: Graphical Representation

        AA: American Airlines
				AS: Alaska Airlines
				B6: JetBlue
				DL: Delta Air Lines
				F9: Frontier Airlines
				G4: Allegiant Air
				HA: Hawaiian Airlines
				NK: Spirit Airlines
				UA: United Airlines
				WN: Southwest Airlines
				
To determine which airlines were the least reliable, I first looked at the breakdown of flight delays for each airline. For the purposes of analysis, I categorized delays based on their impact to connecting flights/chance of missing planned meetings. 

    On Time: Flight arrived early or exactly at scheduled time
    Mild Delay: Flight is less than 15 minutes late
    Moderate Delay: Flight is between 15 and 60 minutes late (inclusive)
    Extreme Delay: Flight is more than 60 minutes late
    Cancelled: Flight is cancelled
```{r Step 6: Graphical Representation, echo=TRUE, message=TRUE, warning=FALSE, paged.print=TRUE}

# graph of percent cancelled flights
grouped_data_by_airline$MKT_UNIQUE_CARRIER <- as.factor(grouped_data_by_airline$MKT_UNIQUE_CARRIER)
library(RColorBrewer)

chart_data <- pivot_longer(select(grouped_data_by_airline, c('MKT_UNIQUE_CARRIER', 'cancelled', 'extreme_delay', 'mild_delay', 'moderate_delay', 'on_time', 'total_flights')), cols = c(cancelled, extreme_delay, moderate_delay, mild_delay, on_time), names_to = "Flight_Status", values_to = "Percent")

chart_data$Flight_Status <- as.factor(chart_data$Flight_Status)
chart_data$Flight_Status <- factor(chart_data$Flight_Status, levels = c("cancelled", "extreme_delay", "moderate_delay", "mild_delay", "on_time"))
chart_data$MKT_UNIQUE_CARRIER <- as.factor(chart_data$MKT_UNIQUE_CARRIER)

head(chart_data)

ggplot(chart_data, aes(fill = Flight_Status, y = Percent, x = MKT_UNIQUE_CARRIER)) +
  geom_bar(position = "dodge", stat = "identity") + 
  theme(plot.title = element_text(size = 10)) + 
  ggtitle("Percent of Cancelled, Delayed, On-Time Flights By Airline in Austin") + 
  xlab("Airline")
# This graph shows the breakdown of flights to/from Austin by flight delay status for each airline. 

grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = cancelled, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights Cancelled Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights Cancelled") +
  geom_text(aes(label = round(cancelled, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = extreme_delay, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights Extremely Delayed Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights Extremely Delayed") +
  geom_text(aes(label = round(extreme_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

# This graph shows that the airline G4, which is Alliegent Air, is the least reliable.
grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = extreme_delay + cancelled, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights Cancelled or Extremely Delayed Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights Cancelled or Extremely Delayed") +
  geom_text(aes(label = round(extreme_delay + cancelled, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = moderate_delay, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights Moderately Delayed Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights Moderately Delayed (between 15 min. and 1 hour") +
  geom_text(aes(label = round(moderate_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = mild_delay, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights Mildly Delayed (15 min or less) Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights Mildly Delayed") +
  geom_text(aes(label = round(mild_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = on_time, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights On Time Per Airline") +
  xlab("Airline") +
  ylab("Percent of Flights On Time") +
  geom_text(aes(label = round(on_time, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

# This boxplot of delay times per airline shows that airlines typically had about 75% of flights on time. I excluded this graph from my actual analysis because I had to exclude extreme delays for the graph to be readable. 
flight_data_v2 %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = DEP_DELAY, fill = MKT_UNIQUE_CARRIER)) +
  geom_boxplot(outlier.shape = NA) + 
  ylim(-20, 15) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) + 
  ggtitle("Airline Delay Times") +
  xlab("Airline") + 
  ylab("Delay Time")

# This graph looks at the distribution of delays less than or equal to 1 hour.
delayed_mild_mod <- flight_data_v2[flight_data_v2$DEP_DELAY > 0 & flight_data_v2$DEP_DELAY <= 60, ]
delayed_mild_mod %>% 
  ggplot( aes(x = DEP_DELAY, group = MKT_UNIQUE_CARRIER, fill = MKT_UNIQUE_CARRIER)) +
  geom_density(adjust = 1.5) + 
  facet_wrap(~MKT_UNIQUE_CARRIER) +
  ggtitle("Airline Delay Times") +
  xlab("Airline") + 
  ylab("Delay Time")

# This graph shows the distribution of delays and has quartile lines to better show the distribution of flight delays. Based on this graph, of the mild-moderate delays, WN, which is Southwest Airlines, has the shortest mild-moderate delays.
delayed_mild_mod %>% 
  ggplot( aes(x = DEP_DELAY, y = MKT_UNIQUE_CARRIER, fill = factor(stat(quantile)))) + 
  stat_density_ridges(geom = "density_ridges_gradient", calc_ecdf = TRUE, quantiles = 4, quantile_lines = TRUE) + 
  scale_fill_viridis_d(name = "Quartiles") +
  theme_ridges() + 
  theme(legend.position = "none")

# Overall, the most reliable airline is NK, which is Spirit Airlines.
grouped_data_by_airline %>% 
  ggplot( aes(x = MKT_UNIQUE_CARRIER, y = on_time + mild_delay, fill = MKT_UNIQUE_CARRIER)) + 
  geom_bar(stat = "identity") +
  ggtitle("Percent of Flights On Time or Delayed Less Than 15 Min") +
  xlab("Airline") +
  ylab("Percent of Flights On Time or Mildly Delayed") +
  geom_text(aes(label = round(on_time + mild_delay, digits = 1)), position = position_dodge(width = 0.9), vjust = -0.25) +
  theme(legend.position = "none")

```

