# Load libraries
install.packages("ggplot")
## Warning: package 'ggplot' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(DBI)
library(RPostgres)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Check that DBI loaded properly
if (!exists("dbGetQuery")) {
stop("DBI package not loaded properly.")
}
# Connect using RPostgres
con <- dbConnect(
RPostgres::Postgres(),
dbname = "Data607",
host = "localhost",
port = 5432,
user = "postgres",
password = "admin%$509"
)
# Test if connection works:
test_result <- dbGetQuery(con, "SELECT 1 as test")
print("Connection test:")
## [1] "Connection test:"
print(test_result)
## test
## 1 1
# Define the table name
table_name <- "flight_performance"
# Get all data from flight_performance table
flight_data <- dbGetQuery(con, paste("SELECT * FROM", table_name))
# Save to CSV
write.csv(flight_data, "flight_performance.csv", row.names = FALSE)
print("CSV file saved successfully!")
## [1] "CSV file saved successfully!"
# 1. Get basic table info using SQL queries
print("COLUMN NAMES AND TYPES:")
## [1] "COLUMN NAMES AND TYPES:"
columns_info <- dbGetQuery(con, paste0("
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '", table_name, "'
ORDER BY ordinal_position
"))
print(columns_info)
## column_name data_type
## 1 airline character varying
## 2 status character varying
## 3 los_angeles integer
## 4 phoenix integer
## 5 san_diego integer
## 6 san_francisco integer
## 7 seattle integer
# 2. Get total row count
row_count <- dbGetQuery(con, paste("SELECT COUNT(*) as total_rows FROM", table_name))
print(paste("TOTAL ROWS:", row_count$total_rows))
## [1] "TOTAL ROWS: 4"
# 3. Look at first 10 rows
print("FIRST 10 ROWS:")
## [1] "FIRST 10 ROWS:"
sample_data <- dbGetQuery(con, paste("SELECT * FROM", table_name, "LIMIT 10"))
print(sample_data)
## airline status los_angeles phoenix san_diego san_francisco seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
# 4. Basic data exploration
print("COLUMN NAMES:")
## [1] "COLUMN NAMES:"
print(names(flight_data))
## [1] "airline" "status" "los_angeles" "phoenix"
## [5] "san_diego" "san_francisco" "seattle"
print("DATA STRUCTURE:")
## [1] "DATA STRUCTURE:"
str(flight_data)
## 'data.frame': 4 obs. of 7 variables:
## $ airline : chr "ALASKA" "ALASKA" "AMWEST" "AMWEST"
## $ status : chr "on time" "delayed" "on time" "delayed"
## $ los_angeles : int 497 62 694 117
## $ phoenix : int 221 12 4840 415
## $ san_diego : int 212 20 383 65
## $ san_francisco: int 503 102 320 129
## $ seattle : int 1841 305 201 61
print("SUMMARY STATISTICS:")
## [1] "SUMMARY STATISTICS:"
summary(flight_data)
## airline status los_angeles phoenix
## Length:4 Length:4 Min. : 62.0 Min. : 12.0
## Class :character Class :character 1st Qu.:103.2 1st Qu.: 168.8
## Mode :character Mode :character Median :307.0 Median : 318.0
## Mean :342.5 Mean :1372.0
## 3rd Qu.:546.2 3rd Qu.:1521.2
## Max. :694.0 Max. :4840.0
## san_diego san_francisco seattle
## Min. : 20.00 Min. :102.0 Min. : 61
## 1st Qu.: 53.75 1st Qu.:122.2 1st Qu.: 166
## Median :138.50 Median :224.5 Median : 253
## Mean :170.00 Mean :263.5 Mean : 602
## 3rd Qu.:254.75 3rd Qu.:365.8 3rd Qu.: 689
## Max. :383.00 Max. :503.0 Max. :1841
# Check for rows that are completely empty (all NA or empty values)
empty_rows <- apply(flight_data, 1, function(x) all(is.na(x) | x == ""))
num_empty_rows <- sum(empty_rows)
print(paste("Number of completely empty rows:", num_empty_rows))
## [1] "Number of completely empty rows: 0"
if(num_empty_rows > 0) {
print("Empty row indices:")
print(which(empty_rows))
} else {
print("No completely empty rows found")
}
## [1] "No completely empty rows found"
# Check for rows with any NA values
rows_with_na <- apply(flight_data, 1, function(x) any(is.na(x)))
num_rows_with_na <- sum(rows_with_na)
print(paste("Number of rows with any missing values:", num_rows_with_na))
## [1] "Number of rows with any missing values: 0"
if(num_rows_with_na > 0) {
print("Rows with missing values:")
print(which(rows_with_na))
print("These rows contain NA:")
print(flight_data[rows_with_na, ])
} else {
print("No rows with missing values found")
}
## [1] "No rows with missing values found"
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(tidyr)
# Converting flight_data to a long format
flight_data_long <- flight_data %>%
pivot_longer(
cols = los_angeles:seattle,
names_to = "destination",
values_to = "flight_count"
)
print(flight_data_long)
## # A tibble: 20 × 4
## airline status destination flight_count
## <chr> <chr> <chr> <int>
## 1 ALASKA on time los_angeles 497
## 2 ALASKA on time phoenix 221
## 3 ALASKA on time san_diego 212
## 4 ALASKA on time san_francisco 503
## 5 ALASKA on time seattle 1841
## 6 ALASKA delayed los_angeles 62
## 7 ALASKA delayed phoenix 12
## 8 ALASKA delayed san_diego 20
## 9 ALASKA delayed san_francisco 102
## 10 ALASKA delayed seattle 305
## 11 AMWEST on time los_angeles 694
## 12 AMWEST on time phoenix 4840
## 13 AMWEST on time san_diego 383
## 14 AMWEST on time san_francisco 320
## 15 AMWEST on time seattle 201
## 16 AMWEST delayed los_angeles 117
## 17 AMWEST delayed phoenix 415
## 18 AMWEST delayed san_diego 65
## 19 AMWEST delayed san_francisco 129
## 20 AMWEST delayed seattle 61
flight_data_long %>%
group_by(destination) %>%
summarise(
total_flights = sum(flight_count, na.rm = TRUE)
) %>%
arrange(desc(total_flights)) %>%
print()
## # A tibble: 5 × 2
## destination total_flights
## <chr> <int>
## 1 phoenix 5488
## 2 seattle 2408
## 3 los_angeles 1370
## 4 san_francisco 1054
## 5 san_diego 680
flight_data_long %>%
group_by(destination) %>%
summarise(
total_flights = sum(flight_count, na.rm = TRUE)
) %>%
arrange(desc(total_flights)) %>%
ggplot(aes(x = destination, y = total_flights)) +
geom_col(fill = "steelblue") +
labs(title = "Total Flights by City",
x = "City",
y = "Total Flights") +
theme_minimal()
For each city, there are 4 records: one for on-time flights and one for delayed flights, across two airlines. The average sums these values divides by 4 for the 4 categories and give an average value per category.
flight_data_long %>%
group_by(destination) %>%
summarise(
average_flights = mean(flight_count, na.rm = TRUE)
) %>%
arrange(desc(average_flights)) %>%
print()
## # A tibble: 5 × 2
## destination average_flights
## <chr> <dbl>
## 1 phoenix 1372
## 2 seattle 602
## 3 los_angeles 342.
## 4 san_francisco 264.
## 5 san_diego 170
flight_data_long %>%
group_by(destination) %>%
summarise(
average_flights = mean(flight_count, na.rm = TRUE)
) %>%
arrange(desc(average_flights)) %>%
ggplot(aes(x = reorder(destination, average_flights), y = average_flights)) +
geom_col(fill = "darkgreen", alpha = 0.7) +
geom_text(aes(label = round(average_flights, 1)), hjust = -0.1) +
coord_flip() +
labs(title = "Average Flights by Destination",
x = "Destination",
y = "Average Flights") +
theme_classic()
My approach to calculating the overall airline delay performance will be calculated in the following steps:
Add all delayed flights across all cities and airlines
Add all total flights across all cities and airlines
Divide delayed by total flights for percentage
Result shows network-wide delay rate
One number summarizes entire system performance
overall_flight_percentage <- flight_data_long %>%
summarise(
total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
total_all_flights = sum(flight_count, na.rm = TRUE),
overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
) %>%
print()
## # A tibble: 1 × 3
## total_delayed_flights total_all_flights overall_delay_rate
## <int> <int> <dbl>
## 1 1288 11000 11.7
library(ggplot2)
library(dplyr)
# Calculate overall performance for Alaska
alaska_overall <- flight_data_long %>%
filter(airline == "ALASKA") %>%
summarise(
total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
total_all_flights = sum(flight_count, na.rm = TRUE),
overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
)
# Calculate overall performance for AmWest
amwest_overall <- flight_data_long %>%
filter(airline == "AMWEST") %>%
summarise(
total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
total_all_flights = sum(flight_count, na.rm = TRUE),
overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
)
# Create Alaska pie chart data
alaska_pie_data <- data.frame(
status = c("On Time", "Delayed"),
flights = c(alaska_overall$total_all_flights - alaska_overall$total_delayed_flights,
alaska_overall$total_delayed_flights),
percentage = c(100 - alaska_overall$overall_delay_rate,
alaska_overall$overall_delay_rate)
)
# Create AmWest pie chart data
amwest_pie_data <- data.frame(
status = c("On Time", "Delayed"),
flights = c(amwest_overall$total_all_flights - amwest_overall$total_delayed_flights,
amwest_overall$total_delayed_flights),
percentage = c(100 - amwest_overall$overall_delay_rate,
amwest_overall$overall_delay_rate)
)
# Alaska Airlines Pie Chart
ggplot(alaska_pie_data, aes(x = "", y = flights, fill = status)) +
geom_col(width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(status, "\n", round(percentage, 1), "%")),
position = position_stack(vjust = 0.5), size = 4, fontface = "bold") +
labs(title = "Alaska Airlines - Flight Performance") +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none",
plot.margin = margin(30, 30, 30, 30, "pt")
) +
scale_fill_manual(values = c("On Time" = "grey", "Delayed" = "orange"))
# AmWest Airlines Pie Chart
ggplot(amwest_pie_data, aes(x = "", y = flights, fill = status)) +
geom_col(width = 1) +
coord_polar("y", start = 0) +
geom_text(aes(label = paste0(status, "\n", round(percentage, 1), "%")),
position = position_stack(vjust = 0.5), size = 4, fontface = "bold") +
labs(title = "AmWest Airlines - Flight Performance") +
theme_void() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
legend.position = "none",
plot.margin = margin(30, 30, 30, 30, "pt")
) +
scale_fill_manual(values = c("On Time" = "grey", "Delayed" = "orange"))
# Print summary statistics
print("=== AIRLINE PERFORMANCE SUMMARY ===")
## [1] "=== AIRLINE PERFORMANCE SUMMARY ==="
print(paste("Alaska Airlines Delay Rate:", round(alaska_overall$overall_delay_rate, 1), "%"))
## [1] "Alaska Airlines Delay Rate: 13.3 %"
print(paste("AmWest Airlines Delay Rate:", round(amwest_overall$overall_delay_rate, 1), "%"))
## [1] "AmWest Airlines Delay Rate: 10.9 %"
##Summary of Overall Flight Performance Analysis:
Both Alaska Airlines (13.3% delay rate) and AmWest Airlines (10.9% delay rate) maintained exceptionally low delay frequencies, with fewer than 15% of flights experiencing delays across their route networks.
This consistently low delay performance across all five destinations—Los Angeles, Phoenix, San Diego, San Francisco, and Seattle—suggests both airlines have implemented operational strategies that effectively minimize disruptions. The sub-15% delay rates are particularly noteworthy in the airline industry, where delays can significantly impact passenger satisfaction, operational costs, and schedule reliability.
# Calculate Alaska delay percentage by city
alaska_delays <- flight_data_long %>%
filter(airline == "ALASKA") %>%
group_by(destination) %>%
summarise(
delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
total_flights = sum(flight_count, na.rm = TRUE),
delay_percentage = (delayed_flights / total_flights) * 100
) %>%
arrange(desc(delay_percentage))
# Create bar graph (FIXED - removed xlim)
ggplot(alaska_delays, aes(x = reorder(destination, delay_percentage), y = delay_percentage)) +
geom_col(fill = "orange", alpha = 0.8) +
geom_text(aes(label = paste0(round(delay_percentage, 1), "%")),
hjust = -0.1, size = 4) +
coord_flip() +
labs(
title = "Alaska Airlines: Delay Rate by City",
subtitle = "Percentage of delayed flights per destination",
x = "Destination City",
y = "Delay Rate (%)"
) +
theme_minimal() +
ylim(0, max(alaska_delays$delay_percentage) * 1.1) # Use ylim instead of xlim
Alaska’s delay rates varied significantly across destinations,
ranging from 5.2% in Phoenix to 16.9% in San Francisco. San Francisco
emerged as Alaska’s most challenging route with the highest delay
frequency, while Phoenix demonstrated the airline’s best operational
performance. This variation suggests that Alaska faces location-specific
operational challenges, potentially related to airport infrastructure,
weather patterns, or air traffic congestion at certain
destinations.
# Calculate AmWest delay percentage by city
amwest_delays <- flight_data_long %>%
filter(airline == "AMWEST") %>%
group_by(destination) %>%
summarise(
delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
total_flights = sum(flight_count, na.rm = TRUE),
delay_percentage = (delayed_flights / total_flights) * 100
) %>%
arrange(desc(delay_percentage))
# Create bar graph
ggplot(amwest_delays, aes(x = reorder(destination, delay_percentage), y = delay_percentage)) +
geom_col(fill = "purple", alpha = 0.8) +
geom_text(aes(label = paste0(round(delay_percentage, 1), "%")),
hjust = -0.1, size = 4) +
coord_flip() +
labs(
title = "AmWest Airlines: Delay Rate by City",
subtitle = "Percentage of delayed flights per destination",
x = "Destination City",
y = "Delay Rate (%)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5), # Center title
plot.subtitle = element_text(hjust = 0.5), # Center subtitle
axis.title.x = element_text(hjust = 0.5), # Center x-axis label
axis.title.y = element_text(hjust = 0.5) # Center y-axis label
) +
ylim(0, max(amwest_delays$delay_percentage) * 1.1)
AmWest’s strongest performance was recorded in Phoenix, where
optimal operational conditions—possibly including favorable weather
patterns, efficient airport operations, or strategic scheduling—resulted
in minimal delays. Conversely, San Francisco presented the greatest
operational challenges, with higher delay rates likely attributed to
factors such as complex airspace management, or airport congestion,
staffing and scheduling challenges.
Both AMWest and Alaska Airlines exhibited their highest delay rates in San Francisco, indicating that this destination poses significant operational challenges for both carriers. This commonality suggests that external factors specific to San Francisco, such as air traffic congestion, weather conditions, or airport infrastructure limitations, may be contributing to delays across different airlines.The both showed their lowest delay rates in Phoenix, indicating that this destination offers more favorable operational conditions for both carriers.
Describe discrepancy between comparing two airlines’ flight performances city-by-city and overall. When comparing Alaska Airlines and AmWest’s performance, the overall analysis shows AmWest as the superior airline with a 10.9% delay rate compared to Alaska’s 13.3%. However, the city-by-city analysis reveals a more complex picture where Alaska outperforms AmWest in some destinations while AmWest excels in others. For example, Alaska may have significantly better performance in high-traffic routes like Los Angeles or Seattle, while AmWest dominates in Phoenix. This creates a discrepancy where the “better” airline depends entirely on which specific route passengers are considering, contradicting the overall ranking that suggests AmWest is universally superior.
Key Analytical Differences:
Explanation of the discrepancy between comparing two airlines’ flight performances city-by-city and overall
This discrepancy occurs because overall performance averages mask individual route variations. AmWest’s lower overall delay rate (10.9%) most likely results from their exceptional performance in specific cities—particularly high-volume routes where strong performance heavily influences the total. Meanwhile, Alaska’s higher overall rate (13.3%) may be skewed by poor performance on a few challenging routes, even though they excel on others. Overall percentages weight all flights equally, so an airline with excellent performance on high-traffic routes will show better overall statistics, even if they struggle on smaller routes.
Comparing the overall percentage of delays does not account for cities that can have extremely high delays nor does it account for events that can give extreme values such as snow storms, hurricanes, or other weather events. Even under staffing , plane maintenance, and air traffic control issues can cause delays that are not accounted for in the overall percentage. For example AWWest showed an average delay across all cities of 10.9% but the percent delay across cities showed values from 7.9% to 28.7&%. This is a significant difference that is not present in the overall percentage.