The data can be downloaded from github.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
# Read the CSV file, specifying missing values
file_path <- "C:\\ProgramData\\MySQL\\MySQL Server 8.0\\Uploads\\airlines.csv"
airlines_data <- read_csv(file_path, na = c("-1"))
## Rows: 4 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Airline, Status
## dbl (5): Los Angeles, Phoenix, San Diego, San Francisco, Seattle
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(airlines_data, caption = "Complete Airlines Dataset")
| Airline | Status | Los Angeles | Phoenix | San Diego | San Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
airlines_long <- airlines_data %>%
pivot_longer(cols = c("Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
names_to = "Destination",
values_to = "Result")
head (airlines_long)
## # A tibble: 6 × 4
## Airline Status Destination Result
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
# Summarize total flights by airline
Airline_summary <- airlines_long %>%
group_by(Airline) %>%
summarise(total_flights = sum(Result))
kable(Airline_summary, caption = "Total Flights by Airline")
| Airline | total_flights |
|---|---|
| ALASKA | 3775 |
| AM WEST | 7225 |
# Summarize total flights for each destination by airline
destination_summary <- airlines_long %>%
group_by(Destination, Airline) %>%
summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
kable(destination_summary, caption = "Total Flights by Destination")
| Destination | Airline | total_flights |
|---|---|---|
| Los Angeles | ALASKA | 559 |
| Los Angeles | AM WEST | 811 |
| Phoenix | ALASKA | 233 |
| Phoenix | AM WEST | 5255 |
| San Diego | ALASKA | 232 |
| San Diego | AM WEST | 448 |
| San Francisco | ALASKA | 605 |
| San Francisco | AM WEST | 449 |
| Seattle | ALASKA | 2146 |
| Seattle | AM WEST | 262 |
# Grouped bar chart to show total flights for each destination by airline
ggplot(data = destination_summary, aes(x = Destination, y = total_flights, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(title = "Total Flights to Each Destination by Airline",
x = "Destination",
y = "Total Number of Flights",
fill = "Airline") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("ALASKA" = "#0077C8", "AM WEST" = "#C8102E"))
The graph shows that AM WEST has significantly more flights to Phoenix compared to ALASKA and also has a higher number of flights to Los Angeles and San Diego. On the other hand, ALASKA has the most flights to Seattle, and relatively higher flights to San Francisco.
airlines_long <- airlines_long %>%
group_by(Destination, Airline) %>%
mutate(percentage = ifelse(Status == "delayed", (Result / sum(Result)) * 100, NA))
airlines_long <- airlines_long %>%
group_by(Destination, Airline) %>%
mutate(label_y = ifelse(Status == "delayed",
lag(cumsum(Result), default = 0) + (Result / 2),
NA))
print(airlines_long)
## # A tibble: 20 × 6
## # Groups: Destination, Airline [10]
## Airline Status Destination Result percentage label_y
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ALASKA on time Los Angeles 497 NA NA
## 2 ALASKA on time Phoenix 221 NA NA
## 3 ALASKA on time San Diego 212 NA NA
## 4 ALASKA on time San Francisco 503 NA NA
## 5 ALASKA on time Seattle 1841 NA NA
## 6 ALASKA delayed Los Angeles 62 11.1 528
## 7 ALASKA delayed Phoenix 12 5.15 227
## 8 ALASKA delayed San Diego 20 8.62 222
## 9 ALASKA delayed San Francisco 102 16.9 554
## 10 ALASKA delayed Seattle 305 14.2 1994.
## 11 AM WEST on time Los Angeles 694 NA NA
## 12 AM WEST on time Phoenix 4840 NA NA
## 13 AM WEST on time San Diego 383 NA NA
## 14 AM WEST on time San Francisco 320 NA NA
## 15 AM WEST on time Seattle 201 NA NA
## 16 AM WEST delayed Los Angeles 117 14.4 752.
## 17 AM WEST delayed Phoenix 415 7.90 5048.
## 18 AM WEST delayed San Diego 65 14.5 416.
## 19 AM WEST delayed San Francisco 129 28.7 384.
## 20 AM WEST delayed Seattle 61 23.3 232.
ggplot(data = airlines_long, aes(x = Destination, y = Result, fill = Status)) +
geom_bar(stat = "identity", width = 0.7) +
geom_text(data = airlines_long %>% filter(Status == "delayed"),
aes(label = paste0(round(percentage, 1), "%"), y = label_y),
color = "black", size = 4) +
facet_grid(~ Airline) +
theme_minimal() +
labs(title = "Comparison of Delays and On-Time Flights by Airline and Destination",
x = "Destination",
y = "Total Flights",
fill = "Status") +
theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
Alaska Airline experiences the highest percentage of delays for flights to San Francisco at 16.9%, followed by Seattle at 14.2%. Delays for Phoenix and San Diego are relatively low. AM West Airline shows the most significant percentage of delays for flights to San Francisco (28.7%), followed by Seattle (23.3%) and Los Angeles (8.5%). Phoenix has low delay percentages.
# Summarize flight status by airline
status_summary <- airlines_long %>%
group_by(Status, Airline) %>%
summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
status_summary <- status_summary %>%
group_by(Airline) %>%
mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))
print(status_summary)
## # A tibble: 4 × 4
## # Groups: Airline [2]
## Status Airline total_flights percentage
## <chr> <chr> <dbl> <dbl>
## 1 delayed ALASKA 501 13.3
## 2 delayed AM WEST 787 10.9
## 3 on time ALASKA 3274 NA
## 4 on time AM WEST 6438 NA
# Create a stacked bar chart_airlines
ggplot(data = status_summary, aes(x = Airline, y = total_flights, fill = Status)) +
geom_bar(stat = "identity", width = 0.5) +
geom_text(data = status_summary %>% filter(Status == "delayed"),
aes(label = paste0(round(percentage, 1), "%")),
position = position_stack(vjust = 9.5),
color = "black", size = 4) +
theme_minimal() +
labs(title = "Comparison of Delays and On-Time Flights for Airlines",
x = "Airline",
y = "Number of Flights",
fill = "Flight Status") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ALASKA Airline has 501 delays and 3274 on time flights, and delays accounted for 13.3% of their total flights. AM WEST Airline has 787 delayed flights and 6438 on time flights, and delayed flights represented 10.9% of their total flights.
# Summarize flights for each destination by flight status
destination_status <- airlines_long %>%
group_by(Status, Destination) %>%
summarise(total_flights = sum(Result))
## `summarise()` has grouped output by 'Status'. You can override using the
## `.groups` argument.
destination_status <- destination_status %>%
group_by(Destination) %>%
mutate(percentage = ifelse(Status == "delayed", (total_flights / sum(total_flights)) * 100, NA))
print(destination_status)
## # A tibble: 10 × 4
## # Groups: Destination [5]
## Status Destination total_flights percentage
## <chr> <chr> <dbl> <dbl>
## 1 delayed Los Angeles 179 13.1
## 2 delayed Phoenix 427 7.78
## 3 delayed San Diego 85 12.5
## 4 delayed San Francisco 231 21.9
## 5 delayed Seattle 366 15.2
## 6 on time Los Angeles 1191 NA
## 7 on time Phoenix 5061 NA
## 8 on time San Diego 595 NA
## 9 on time San Francisco 823 NA
## 10 on time Seattle 2042 NA
# Create a stacked bar chart_destination
ggplot(data = destination_status, aes(x = Destination, y = total_flights, fill = Status)) +
geom_bar(stat = "identity", width = 0.5) +
theme_minimal() +
labs(title = "Comparison of Delays and On-Time Flights by destination",
x = "Destination",
y = "Number of Flights",
fill = "Flight Status") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
San Francisco has the highest percentage of delayed flights at 21.9%, followed by Seattle at 15.2%. Phoenix has the lowest percentage of delayed flights, with only 7.8% of flights being delayed.
Comparing flight performance city-by-city versus overall reveals that certain cities experience more delays. AM WEST performs well overall but has a high delay rate in San Francisco and a low rate in Phoenix.