2015 Flight Delays and Cancellations

These visuals explore Flight Delays and Cancellations in 2015. They take a look at which airlines underwent the most cancellations or delays, what type of delays they are, and the actual time these delays lasted in minutes. This dataset has 31 columns including the plane tail number, the flight number, the airline, length of arrival delay, length of departure delay and more.

# Add Library Package

library(data.table)


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%
library(ggplot2)
# Read the CSV file

# Absolute path to the flights CSV.
filename <- "C:/Users/AkiMa/Documents/DS Masters 2022/Flightinfo/flights.csv"

# Treat the literal text "NA" and empty fields as missing values. The argument
# name is spelled out in full (`na.strings`) instead of relying on partial
# argument matching, and the markers are given as character strings.
df <- fread(filename, na.strings = c("NA", ""))



# Non NA flight delays



# Understand Dataset section

##End

# Keep only the flights whose origin and destination airport codes are both
# exactly three characters long.

df2 <- df[nchar(ORIGIN_AIRPORT) == 3 & nchar(DESTINATION_AIRPORT) == 3, ]







# Creating a dataset which only includes the mean departure delays grouped by the airline

# Mean departure delay per airline (missing delays ignored), stored as a data
# frame with a numeric `Delay` column, a character `Airline` column and the
# airline codes as row names.
#
# The columns are named explicitly when the table is built. Renaming the
# column afterwards by matching its auto-generated name is fragile, because
# that name is derived from the deparsed source expression.
#
# NOTE(review): the mean is taken over `df` (every row read from the CSV),
# which is what the previous expression evaluated even though it was wrapped
# in `with(df2, ...)`. Confirm whether the filtered `df2` was intended.
df3 <- local({
  mean_dep_delay <- tapply(
    df$DEPARTURE_DELAY, df$AIRLINE,
    FUN = mean, na.rm = TRUE
  )
  data.frame(
    Delay = as.vector(mean_dep_delay),
    Airline = names(mean_dep_delay),
    row.names = names(mean_dep_delay),
    stringsAsFactors = FALSE
  )
})


# Understanding D
## End

# Snapshot of the airport-filtered flights; `df2` itself is reassigned further
# down, while `df4` keeps referring to the data as it is at this point.
df4 <- df2


# Reorder dataset: longest mean departure delay first

df3 <- df3[order(df3$Delay, decreasing = TRUE), ]

# Create Bar graph







# Mean arrival delay for every (month, airline) pair.
# Rows without an ARRIVAL_DELAY are dropped first so a missing value cannot
# turn a group's mean into NA.
df2 <- df2[!is.na(df2$ARRIVAL_DELAY), ]
df2 <- df2 %>%
  group_by(MONTH, AIRLINE) %>%
  mutate(MEAN_A_DELAY = mean(ARRIVAL_DELAY)) %>%
  # Drop the grouping again so it does not silently carry over into later
  # operations on df2.
  ungroup()

# Remove Duplicates: keep the first row of each airline/month pair, which
# leaves one row (carrying MEAN_A_DELAY) per pair.

df2 <- df2[!duplicated(df2[, c("AIRLINE", "MONTH")]), ]

# Month is used as a category, not as a number.
df2$MONTH <- as.factor(df2$MONTH)


# Pie Chart
library(ggthemes)

# Most populated months: number of rows in df4 for each month, with the month
# number replaced by its three-letter abbreviation.
top_months <- count(df4, MONTH)
top_months$MONTH <- month.abb[top_months$MONTH]

# Convert to percentages: each month's share of all counted rows, rounded to
# three decimals. The share is computed directly on the `n` vector, so the
# result does not depend on how a one-column table is divided and assigned.
top_months$Average <- round(top_months$n / sum(top_months$n) * 100, digits = 3)


 # Number of rows in df4 for each tail number
 top_planes <- count(df4, TAIL_NUMBER)


# Array of Top Tails
top4tails <- c("N480HA", "N488HA", "N484HA", "N493HA")


# December rows that belong to one of the tail numbers listed above
top4planes <- df4[MONTH == 12 & TAIL_NUMBER %in% top4tails]

# Sort so the largest departure delay comes first
top4planes <- top4planes[order(top4planes$DEPARTURE_DELAY, decreasing = TRUE)]

# Change days of week to nominals (a factor instead of a number)
top4planes$DAY_OF_WEEK <- as.factor(top4planes$DAY_OF_WEEK)


# Create Dataframe for heatmap

# One row per airline / day-of-week pair, with `n` holding the number of df4
# rows in that pair, returned as a plain data frame.
airline_by_week <- data.frame(
  summarise(
    group_by(df4, AIRLINE, DAY_OF_WEEK),
    n = length(AIRLINE),
    .groups = "keep"
  )
)

# Confirmation Count
# count(df4, df4$AIRLINE)

# The day of the week is a category, so store it as a factor.
airline_by_week$DAY_OF_WEEK <- as.factor(airline_by_week$DAY_OF_WEEK)

HeatMAP

This is a heatmap of how many cancellations or delays were experienced by each airline, and on which days of the week they were seen. The days of the week are represented as numbers on the x-axis and the airlines are represented by their tags on the y-axis. The heatmap lets us see which airlines are having more cancellations or delays than others, and the x-axis lets us see whether some days experience more cancellations or delays than others. As we can see, WN had the largest number of flight delays, and the delays occurred most frequently on a Sunday.

# Heatmap: one square tile per airline / day-of-week pair, shaded by `n` and
# labelled with the comma-formatted value of `n`.
# NOTE(review): `n` is built as a plain row count of df4 per pair, and no
# delay or cancellation filter is applied where it is computed. Confirm that
# the title matches what is being counted.
ggplot(airline_by_week, aes(DAY_OF_WEEK, AIRLINE, fill = n)) +
  geom_tile(colour = "black") +
  geom_text(aes(label = comma(n))) +
  coord_equal(ratio = 1) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "Heatmap of Flight Delays by Day of week and Airline")

Pie Chart

This Pie Chart allows us to see which months are more likely to experience cancellations or delays than others. It helps the viewer see if there is a month that performs significantly worse than others. This chart shows that there is no month that is significantly more likely to experience cancellations or delays.

# Pie chart: a single stacked bar of the monthly counts, normalised to
# proportions and wrapped onto polar coordinates. Each slice is labelled with
# the month's percentage, placed outside the centre (x = 1.7).
ggplot(top_months, aes(x = "", y = n, fill = MONTH)) +
  geom_col(position = "fill") +
  geom_text(
    aes(x = 1.7, label = paste0(Average, "%")),
    position = position_fill(vjust = 0.5),
    size = 4
  ) +
  coord_polar(theta = "y", start = 0) +
  # Axis text, ticks and grid lines carry no meaning on a pie chart.
  theme_light() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  ) +
  labs(
    title = "Percent of Flight delays by Month",
    fill = "MONTH",
    x = NULL,
    y = NULL
  )

Mean Departure Delay by Airline

This is a bar graph which shows the mean departure delay by airline. Here we can see which airlines are experiencing longer delays than others. The y-axis indicates the name of the airline while the x-axis indicates the departure delay in minutes. From this data it is evident that Spirit had the longest mean departure delay, 15.94 minutes, while Hawaiian Airlines had the shortest, 0.49 minutes.

# Round the mean delays to two decimals so the bar labels stay short.
df3$Delay <- round(df3$Delay, digits = 2)

# Horizontal bar chart of the first 14 rows of df3, each bar labelled with its
# value.
# NOTE(review): the row range 1:14 is hard-coded; presumably it is the number
# of airlines in the data. Confirm.
ggplot(df3[1:14, ], aes(Delay, Airline)) +
  geom_col(fill = "lightblue", colour = "darkblue") +
  geom_text(aes(label = Delay), hjust = 1, colour = "black") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = " Mean Departure Delay by Airline", x = "Departure Delay (min)")

Mean Arrival Delay

This is a multi-bar graph which shows the mean arrival delays by airline, grouped by month. In this graph the viewer is able to see which airlines are experiencing the most delays and which months they occur in. On the y-axis we can see the mean arrival delay in minutes and on the x-axis we can see each airline. Each color indicates a different month in which the delays occurred. We can see that Spirit Airlines had the longest mean arrival delay, amounting to over 30 minutes, which occurred in June.

# Grouped bar chart: one bar per airline and month, showing the mean arrival
# delay, with the months placed side by side.
# NOTE(review): the row range 1:149 is hard-coded; presumably it is the number
# of airline/month rows left in df2. Confirm.
ggplot(df2[1:149, ], aes(AIRLINE, MEAN_A_DELAY, fill = MONTH)) +
  geom_col(position = "dodge") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = " Mean Arrival Delay by Airline and Month",
    y = "Mean Arrival Delay (min)"
  )

Line Graph

This Line Graph shows the total time of Departure Delays for the planes that had the most delays and cancellations. On the y-axis we have the total time for their delays in minutes and on the x-axis we have the day of the week those delays are most frequently seen on. From the line graph, we can conclude that the plane with tail number N488HA was delayed for the greatest amount of time (over 150 minutes) and this occurred on a Sunday.

# Line chart of departure delay against day of week, one coloured line per
# tail number.
# NOTE(review): top4planes holds individual rows and nothing is aggregated
# before this plot, so a tail number may contribute several points for the
# same day. Confirm this is the intended reading.
ggplot(
  top4planes,
  aes(DAY_OF_WEEK, DEPARTURE_DELAY, group = TAIL_NUMBER, colour = TAIL_NUMBER)
) +
  geom_line(size = 2) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "Departure Delays for Top 4 most entered Tail Numbers by Day of Week",
    y = " Departure Delay (min)"
  )