library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
Firstly, a data frame called “Arrival_Delay” is built , then, transformed it into “DATA 607 Assignment 5.csv” file for the chart given in problem statement that describes arrival delays for two airlines (ALASKA and AMWEST) across five destinations with some missing values and information about whether the flights were on time or delayed.
# Create a data frame
Arrival_Delayed <- data.frame(
X1 = c("ALASKA", NA, NA, "AMWEST", NA),
X2 = c("ontime", "delayed", NA, "ontime", "delayed"),
Los_Angeles = c(497, 62, NA, 694, 117),
Phoenix = c(221, 12, NA, 4848, 415),
San_Diego = c(212, 20, NA, 383, 65),
San_Francisco = c(503, 102, NA, 320, 129),
Seattle = c(1841, 305, NA, 201, 61)
)
# Export the data frame to a CSV file local directory
write.csv(Arrival_Delayed, "DATA 607 Assignment 5.csv", row.names = FALSE)
#Read the .csv file
Arrival_Delayed <- read.csv("DATA 607 Assignment 5.csv")
Arrival_Delayed
## X1 X2 Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 ALASKA ontime 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AMWEST ontime 694 4848 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Flight data is prepared for transformation and analysis of arrival delays by filling missing columns’ names with “Airline” and “Status” and removing all entire blank row.
## Airline Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1 ALASKA ontime 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AMWEST ontime 694 4848 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
Flight data is transformed from a wide format to a long format using the ‘gather’ function from columns 3 through 7 with columns “Destination” and “Number_of_flights.”
# reshape the data using 'gather' functions
transformed_data <- Arrival_Delayed %>%
gather(Destination, Number_of_flights, 3:7)
transformed_data
## Airline Status Destination Number_of_flights
## 1 ALASKA ontime Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AMWEST ontime Los Angeles 694
## 4 AMWEST delayed Los Angeles 117
## 5 ALASKA ontime Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST ontime Phoenix 4848
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA ontime San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AMWEST ontime San Diego 383
## 12 AMWEST delayed San Diego 65
## 13 ALASKA ontime San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AMWEST ontime San Francisco 320
## 16 AMWEST delayed San Francisco 129
## 17 ALASKA ontime Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST ontime Seattle 201
## 20 AMWEST delayed Seattle 61
The data set is now getting ready for analyzing arrival delays of two airlines. To determine which airline has the least arrival delays:
# Calculate the total number of flights per airline
total_flights_per_airline <- transformed_data %>%
group_by(Airline) %>% # group the data by the "Airline" column
summarise(Total_flights = sum(Number_of_flights, na.rm = TRUE)) # summarize along with sum on the "Number_of_flights" column.
# plot flights volume per airline
ggplot(total_flights_per_airline, aes(x = Airline, y = Total_flights)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_text(aes(label = Total_flights), vjust = -0.45) +
labs(
title = "Total Number of Flights per Airline",
x = "Airline",
y = "Flights Volumes"
) +
theme_minimal()
The plot shows that AM West has far more flights than Alaska.
# Calculate the number of on-time and delayed flights for each airline
arrival_delay_summary <- transformed_data %>%
group_by(Airline, Status) %>%
summarise(Count = sum(Number_of_flights, na.rm = TRUE), .groups = "drop")
# Create a bar plot
ggplot(data = arrival_delay_summary) +
aes(x = Airline, y = Count, fill = Airline) +
geom_col() +
facet_grid(~Status) +
geom_text(aes(label = Count), vjust = -.25) +
ylab("Number of Flights") +
xlab("Airline") +
labs(title = "On-Time and Delayed by Airline",
x = "Airline",
y = "Count") +
theme_minimal()
The graph compares the counts of on-time and delayed flights for each airline. It is clearly seen that AMWEST has greater on-time flights that means much lesser arrival delays than ALASKA.
# Calculate the percentage of arrival delays to total flights for each airline
arrival_delay_percentages <- arrival_delay_summary %>%
group_by(Airline) %>%
mutate(Percent = (Count / sum(Count)) * 100)
arrival_delay_percentages
## # A tibble: 4 × 4
## # Groups: Airline [2]
## Airline Status Count Percent
## <chr> <chr> <int> <dbl>
## 1 ALASKA delayed 501 13.3
## 2 ALASKA ontime 3274 86.7
## 3 AMWEST delayed 787 10.9
## 4 AMWEST ontime 6446 89.1
# Create a bar plot for arrival delay percentages
ggplot(data = arrival_delay_percentages, aes(x = Airline, y = Percent, fill = Status)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "On-Time and Delayed Percentages by Airline",
x = "Airline",
y = "Percentage (%)",
fill = "Status"
) +
geom_text(aes(label = round(Percent, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal()
The bar plot shows the comparison of on-time and delayed percentages for AMWEST and ALASKA Airlines.
To further analysis of the arrival delayed percentages for each destination (e.g., Los Angeles, Phoenix, San Diego, San Francisco, Seattle) per Airline, we can create a grouped bar plot to see how the arrival delayed percentages vary by destination.
arrival_delay_destination <- transformed_data %>%
group_by(Airline, Destination) %>%
summarise(Count = sum(Number_of_flights, na.rm = TRUE), .groups = "drop")
arrival_delay_destination
## # A tibble: 10 × 3
## Airline Destination Count
## <chr> <chr> <int>
## 1 ALASKA Los Angeles 559
## 2 ALASKA Phoenix 233
## 3 ALASKA San Diego 232
## 4 ALASKA San Francisco 605
## 5 ALASKA Seattle 2146
## 6 AMWEST Los Angeles 811
## 7 AMWEST Phoenix 5263
## 8 AMWEST San Diego 448
## 9 AMWEST San Francisco 449
## 10 AMWEST Seattle 262
# Create a bar plot
ggplot(data = arrival_delay_destination) +
aes(x = Airline, y = Count, fill = Destination) +
geom_col() +
facet_grid(~Destination) +
geom_text(aes(label = Count), vjust = -.45) +
ylab("Number of Flights") +
xlab("Airline") +
labs(title = "Number of Flights per Destination",
x = "Airline",
y = "Flight Counts") +
theme_minimal()
# Calculate the percentage of arrival delays to total flights for each airline
destination_delayed_percentages <- arrival_delay_destination %>%
group_by(Destination) %>%
mutate(Percent = (Count / sum(Count)) * 100)
# Create a side-by-side bar plot for arrival delay percentages by airline in terms of destination
ggplot(data = destination_delayed_percentages, aes(x = Destination, y = Percent, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Delayed Percentages per Airline by Destination",
x = "Destination",
y = "Arrival Delayed Percentage (%)",
fill = "Airline"
) +
geom_text(aes(label = round(Percent, 2)), position = position_dodge(width = 0.9), vjust = -0.5) +
theme_minimal()
Conclusions
The plot above shows ALASKA airline has a lowest percentage of flights with arrival delays to Phoenix than AMWEST airline. It might be reasonable comments because the maximum number of fights were going to Phoenix.