library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(stringr) 
library(ggplot2)

(1) Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

Firstly, a data frame called “Arrival_Delay” is built , then, transformed it into “DATA 607 Assignment 5.csv” file for the chart given in problem statement that describes arrival delays for two airlines (ALASKA and AMWEST) across five destinations with some missing values and information about whether the flights were on time or delayed.

# Create a data frame 
Arrival_Delayed <- data.frame(
  X1 = c("ALASKA", NA, NA, "AMWEST", NA),
  X2 = c("ontime", "delayed", NA, "ontime", "delayed"),
  Los_Angeles = c(497, 62, NA, 694, 117),
  Phoenix = c(221, 12, NA, 4848, 415),
  San_Diego = c(212, 20, NA, 383, 65),
  San_Francisco = c(503, 102, NA, 320, 129),
  Seattle = c(1841, 305, NA, 201, 61)
)
# Export the data frame to a CSV file local directory
write.csv(Arrival_Delayed, "DATA 607 Assignment 5.csv", row.names = FALSE)

(2) Read the information from your.CSV file into R, and use tidyr and dplyr asneededtotidy and transform your data.

#Read the .csv file 
Arrival_Delayed <- read.csv("DATA 607 Assignment 5.csv")

Arrival_Delayed
##       X1      X2 Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA  ontime         497     221       212           503    1841
## 2   <NA> delayed          62      12        20           102     305
## 3   <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AMWEST  ontime         694    4848       383           320     201
## 5   <NA> delayed         117     415        65           129      61

(3) Perform analysis to compare the arrival delays for the two airlines.

Flight data is prepared for transformation and analysis of arrival delays by filling missing columns’ names with “Airline” and “Status” and removing all entire blank row.

##   Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1  ALASKA  ontime         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4  AMWEST  ontime         694    4848       383           320     201
## 5  AMWEST delayed         117     415        65           129      61

Flight data is transformed from a wide format to a long format using the ‘gather’ function from columns 3 through 7 with columns “Destination” and “Number_of_flights.”

# reshape the data using 'gather' functions
transformed_data <- Arrival_Delayed %>%
  gather(Destination, Number_of_flights, 3:7)
transformed_data
##    Airline  Status   Destination Number_of_flights
## 1   ALASKA  ontime   Los Angeles               497
## 2   ALASKA delayed   Los Angeles                62
## 3   AMWEST  ontime   Los Angeles               694
## 4   AMWEST delayed   Los Angeles               117
## 5   ALASKA  ontime       Phoenix               221
## 6   ALASKA delayed       Phoenix                12
## 7   AMWEST  ontime       Phoenix              4848
## 8   AMWEST delayed       Phoenix               415
## 9   ALASKA  ontime     San Diego               212
## 10  ALASKA delayed     San Diego                20
## 11  AMWEST  ontime     San Diego               383
## 12  AMWEST delayed     San Diego                65
## 13  ALASKA  ontime San Francisco               503
## 14  ALASKA delayed San Francisco               102
## 15  AMWEST  ontime San Francisco               320
## 16  AMWEST delayed San Francisco               129
## 17  ALASKA  ontime       Seattle              1841
## 18  ALASKA delayed       Seattle               305
## 19  AMWEST  ontime       Seattle               201
## 20  AMWEST delayed       Seattle                61

The data set is now getting ready for analyzing arrival delays of two airlines. To determine which airline has the least arrival delays:

  1. Calculate the total number of flights per airlines and create a bar plot to visualize the total number of flights per airline, we can see the flight volumes of each airlines
# Calculate the total number of flights per airline
total_flights_per_airline <- transformed_data %>%
  group_by(Airline) %>%   # group the data by the "Airline" column 
  summarise(Total_flights = sum(Number_of_flights, na.rm = TRUE))  # summarize along with sum on the "Number_of_flights" column.

# plot flights volume per airline
ggplot(total_flights_per_airline, aes(x = Airline, y = Total_flights)) +
  geom_bar(stat = "identity", fill = "skyblue") + 
  geom_text(aes(label = Total_flights), vjust = -0.45) + 
  labs(
    title = "Total Number of Flights per Airline",
    x = "Airline",
    y = "Flights Volumes"
  ) +
  theme_minimal()

The plot shows that AM West has far more flights than Alaska.

  1. Determine how many flights were on time and how many had arrival delays for each of the airlines and summarize the arrival delay status for each airline
# Calculate the number of on-time and delayed flights for each airline
arrival_delay_summary <- transformed_data %>%
  group_by(Airline, Status) %>%
  summarise(Count = sum(Number_of_flights, na.rm = TRUE), .groups = "drop")

# Create a bar plot
ggplot(data = arrival_delay_summary) + 
  aes(x = Airline, y = Count, fill = Airline) + 
  geom_col() + 
  facet_grid(~Status) + 
  geom_text(aes(label = Count), vjust = -.25) + 
  ylab("Number of Flights") + 
  xlab("Airline") + 
  labs(title = "On-Time and Delayed by Airline",
    x = "Airline",
    y = "Count") + 
  theme_minimal() 

The graph compares the counts of on-time and delayed flights for each airline. It is clearly seen that AMWEST has greater on-time flights that means much lesser arrival delays than ALASKA.

  1. Calculate the percentage of arrival delays by dividing the “Count” (number of delayed flights) by the sum of all counts for that airline and then multiplying by 100. It is found that AMWEST has a smaller percentage of arrival delays compared to Alaska Airlines.
# Calculate the percentage of arrival delays to total flights for each airline
arrival_delay_percentages <- arrival_delay_summary %>%
  group_by(Airline) %>% 
  mutate(Percent = (Count / sum(Count)) * 100)
arrival_delay_percentages
## # A tibble: 4 × 4
## # Groups:   Airline [2]
##   Airline Status  Count Percent
##   <chr>   <chr>   <int>   <dbl>
## 1 ALASKA  delayed   501    13.3
## 2 ALASKA  ontime   3274    86.7
## 3 AMWEST  delayed   787    10.9
## 4 AMWEST  ontime   6446    89.1
# Create a bar plot for arrival delay percentages
ggplot(data = arrival_delay_percentages, aes(x = Airline, y = Percent, fill = Status)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "On-Time and Delayed Percentages by Airline",
    x = "Airline",
    y = "Percentage (%)",
    fill = "Status"
  ) +
  geom_text(aes(label = round(Percent, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
  theme_minimal()

The bar plot shows the comparison of on-time and delayed percentages for AMWEST and ALASKA Airlines.

To further analysis of the arrival delayed percentages for each destination (e.g., Los Angeles, Phoenix, San Diego, San Francisco, Seattle) per Airline, we can create a grouped bar plot to see how the arrival delayed percentages vary by destination.

arrival_delay_destination <- transformed_data %>%
  group_by(Airline, Destination) %>%
  summarise(Count = sum(Number_of_flights, na.rm = TRUE), .groups = "drop")
arrival_delay_destination
## # A tibble: 10 × 3
##    Airline Destination   Count
##    <chr>   <chr>         <int>
##  1 ALASKA  Los Angeles     559
##  2 ALASKA  Phoenix         233
##  3 ALASKA  San Diego       232
##  4 ALASKA  San Francisco   605
##  5 ALASKA  Seattle        2146
##  6 AMWEST  Los Angeles     811
##  7 AMWEST  Phoenix        5263
##  8 AMWEST  San Diego       448
##  9 AMWEST  San Francisco   449
## 10 AMWEST  Seattle         262
# Create a bar plot
ggplot(data = arrival_delay_destination) + 
  aes(x = Airline, y = Count, fill = Destination) + 
  geom_col() + 
  facet_grid(~Destination) + 
  geom_text(aes(label = Count), vjust = -.45) + 
  ylab("Number of Flights") + 
  xlab("Airline") + 
  labs(title = "Number of Flights per Destination",
    x = "Airline",
    y = "Flight Counts") + 
  theme_minimal() 

# Calculate the percentage of arrival delays to total flights for each airline
destination_delayed_percentages <- arrival_delay_destination %>%
  group_by(Destination) %>% 
  mutate(Percent = (Count / sum(Count)) * 100)


# Create a side-by-side bar plot for arrival delay percentages by airline in terms of destination
ggplot(data = destination_delayed_percentages, aes(x = Destination, y = Percent, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Delayed Percentages per Airline by Destination",
    x = "Destination",
    y = "Arrival Delayed Percentage (%)",
    fill = "Airline"
  ) +
  geom_text(aes(label = round(Percent, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
  theme_minimal()

Conclusions

The plot above shows ALASKA airline has a lowest percentage of flights with arrival delays to Phoenix than AMWEST airline. It might be reasonable comments because the maximum number of fights were going to Phoenix.