# Load libraries
install.packages("ggplot")
## Warning: package 'ggplot' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(DBI)
library(RPostgres)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Check that DBI loaded properly
if (!exists("dbGetQuery")) {
  stop("DBI package not loaded properly.")
}
# Connect using RPostgres
con <- dbConnect(
  RPostgres::Postgres(),
  dbname = "Data607",
  host = "localhost",
  port = 5432,
  user = "postgres",
  password = "admin%$509"
)
# Test if connection works:
test_result <- dbGetQuery(con, "SELECT 1 as test")
print("Connection test:")
## [1] "Connection test:"
print(test_result)
##   test
## 1    1
# Define the table name
table_name <- "flight_performance"
# Get all data from flight_performance table
flight_data <- dbGetQuery(con, paste("SELECT * FROM", table_name))
# Save to CSV
write.csv(flight_data, "flight_performance.csv", row.names = FALSE)
print("CSV file saved successfully!")
## [1] "CSV file saved successfully!"
# 1. Get basic table info using SQL queries
print("COLUMN NAMES AND TYPES:")
## [1] "COLUMN NAMES AND TYPES:"
columns_info <- dbGetQuery(con, paste0("
  SELECT column_name, data_type 
  FROM information_schema.columns 
  WHERE table_name = '", table_name, "'
  ORDER BY ordinal_position
"))
print(columns_info)
##     column_name         data_type
## 1       airline character varying
## 2        status character varying
## 3   los_angeles           integer
## 4       phoenix           integer
## 5     san_diego           integer
## 6 san_francisco           integer
## 7       seattle           integer
# 2. Get total row count
row_count <- dbGetQuery(con, paste("SELECT COUNT(*) as total_rows FROM", table_name))
print(paste("TOTAL ROWS:", row_count$total_rows))
## [1] "TOTAL ROWS: 4"
# 3. Look at first 10 rows
print("FIRST 10 ROWS:")
## [1] "FIRST 10 ROWS:"
sample_data <- dbGetQuery(con, paste("SELECT * FROM", table_name, "LIMIT 10"))
print(sample_data)
##   airline  status los_angeles phoenix san_diego san_francisco seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3  AMWEST on time         694    4840       383           320     201
## 4  AMWEST delayed         117     415        65           129      61
# 4. Basic data exploration
print("COLUMN NAMES:")
## [1] "COLUMN NAMES:"
print(names(flight_data))
## [1] "airline"       "status"        "los_angeles"   "phoenix"      
## [5] "san_diego"     "san_francisco" "seattle"
print("DATA STRUCTURE:")
## [1] "DATA STRUCTURE:"
str(flight_data)
## 'data.frame':    4 obs. of  7 variables:
##  $ airline      : chr  "ALASKA" "ALASKA" "AMWEST" "AMWEST"
##  $ status       : chr  "on time" "delayed" "on time" "delayed"
##  $ los_angeles  : int  497 62 694 117
##  $ phoenix      : int  221 12 4840 415
##  $ san_diego    : int  212 20 383 65
##  $ san_francisco: int  503 102 320 129
##  $ seattle      : int  1841 305 201 61
print("SUMMARY STATISTICS:")
## [1] "SUMMARY STATISTICS:"
summary(flight_data)
##    airline             status           los_angeles       phoenix      
##  Length:4           Length:4           Min.   : 62.0   Min.   :  12.0  
##  Class :character   Class :character   1st Qu.:103.2   1st Qu.: 168.8  
##  Mode  :character   Mode  :character   Median :307.0   Median : 318.0  
##                                        Mean   :342.5   Mean   :1372.0  
##                                        3rd Qu.:546.2   3rd Qu.:1521.2  
##                                        Max.   :694.0   Max.   :4840.0  
##    san_diego      san_francisco      seattle    
##  Min.   : 20.00   Min.   :102.0   Min.   :  61  
##  1st Qu.: 53.75   1st Qu.:122.2   1st Qu.: 166  
##  Median :138.50   Median :224.5   Median : 253  
##  Mean   :170.00   Mean   :263.5   Mean   : 602  
##  3rd Qu.:254.75   3rd Qu.:365.8   3rd Qu.: 689  
##  Max.   :383.00   Max.   :503.0   Max.   :1841
# Check for rows that are completely empty (all NA or empty values)
empty_rows <- apply(flight_data, 1, function(x) all(is.na(x) | x == ""))
num_empty_rows <- sum(empty_rows)
print(paste("Number of completely empty rows:", num_empty_rows))
## [1] "Number of completely empty rows: 0"
if(num_empty_rows > 0) {
  print("Empty row indices:")
  print(which(empty_rows))
} else {
  print("No completely empty rows found")
}
## [1] "No completely empty rows found"
# Check for rows with any NA values
rows_with_na <- apply(flight_data, 1, function(x) any(is.na(x)))
num_rows_with_na <- sum(rows_with_na)
print(paste("Number of rows with any missing values:", num_rows_with_na))
## [1] "Number of rows with any missing values: 0"
if(num_rows_with_na > 0) {
  print("Rows with missing values:")
  print(which(rows_with_na))
  print("These rows contain NA:")
  print(flight_data[rows_with_na, ])
} else {
  print("No rows with missing values found")
}
## [1] "No rows with missing values found"
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library(tidyr)
# Converting flight_data to a long format
flight_data_long <- flight_data %>%
  pivot_longer(
    cols = los_angeles:seattle,         
    names_to = "destination",          
    values_to = "flight_count"        
  )
print(flight_data_long)
## # A tibble: 20 × 4
##    airline status  destination   flight_count
##    <chr>   <chr>   <chr>                <int>
##  1 ALASKA  on time los_angeles            497
##  2 ALASKA  on time phoenix                221
##  3 ALASKA  on time san_diego              212
##  4 ALASKA  on time san_francisco          503
##  5 ALASKA  on time seattle               1841
##  6 ALASKA  delayed los_angeles             62
##  7 ALASKA  delayed phoenix                 12
##  8 ALASKA  delayed san_diego               20
##  9 ALASKA  delayed san_francisco          102
## 10 ALASKA  delayed seattle                305
## 11 AMWEST  on time los_angeles            694
## 12 AMWEST  on time phoenix               4840
## 13 AMWEST  on time san_diego              383
## 14 AMWEST  on time san_francisco          320
## 15 AMWEST  on time seattle                201
## 16 AMWEST  delayed los_angeles            117
## 17 AMWEST  delayed phoenix                415
## 18 AMWEST  delayed san_diego               65
## 19 AMWEST  delayed san_francisco          129
## 20 AMWEST  delayed seattle                 61
flight_data_long %>% 
  group_by(destination) %>% 
  summarise(
    total_flights = sum(flight_count, na.rm = TRUE)
    ) %>% 
  arrange(desc(total_flights)) %>%
 
  print()
## # A tibble: 5 × 2
##   destination   total_flights
##   <chr>                 <int>
## 1 phoenix                5488
## 2 seattle                2408
## 3 los_angeles            1370
## 4 san_francisco          1054
## 5 san_diego               680
  flight_data_long %>% 
  group_by(destination) %>% 
  summarise(
    total_flights = sum(flight_count, na.rm = TRUE)
    ) %>% 
  arrange(desc(total_flights)) %>%
  ggplot(aes(x = destination, y = total_flights)) +
  geom_col(fill = "steelblue") +
  labs(title = "Total Flights by City",
       x = "City",
       y = "Total Flights") +
  theme_minimal()
For each city, there are 4 records: one for on-time flights and one for delayed flights, across two airlines. The average sums these values divides by 4 for the 4 categories and give an average value per category.
flight_data_long %>% 
  group_by(destination) %>% 
  summarise(
    average_flights = mean(flight_count, na.rm = TRUE)
    ) %>% 
  arrange(desc(average_flights)) %>%
  print()
## # A tibble: 5 × 2
##   destination   average_flights
##   <chr>                   <dbl>
## 1 phoenix                 1372 
## 2 seattle                  602 
## 3 los_angeles              342.
## 4 san_francisco            264.
## 5 san_diego                170
flight_data_long %>% 
  group_by(destination) %>% 
  summarise(
    average_flights = mean(flight_count, na.rm = TRUE)
    ) %>% 
  arrange(desc(average_flights)) %>%
  ggplot(aes(x = reorder(destination, average_flights), y = average_flights)) +
  geom_col(fill = "darkgreen", alpha = 0.7) +
  geom_text(aes(label = round(average_flights, 1)), hjust = -0.1) +
  coord_flip() +
  labs(title = "Average Flights by Destination",
       x = "Destination",
       y = "Average Flights") +
  theme_classic()
My approach to calculating the overall airline delay performance will be calculated in the following steps:
Add all delayed flights across all cities and airlines
Add all total flights across all cities and airlines
Divide delayed by total flights for percentage
Result shows network-wide delay rate
One number summarizes entire system performance
overall_flight_percentage <- flight_data_long %>%
  summarise(
    total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
    total_all_flights = sum(flight_count, na.rm = TRUE),
    overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
  ) %>%
  print()
## # A tibble: 1 × 3
##   total_delayed_flights total_all_flights overall_delay_rate
##                   <int>             <int>              <dbl>
## 1                  1288             11000               11.7
library(ggplot2)
library(dplyr)
# Calculate overall performance for Alaska
alaska_overall <- flight_data_long %>%
  filter(airline == "ALASKA") %>%
  summarise(
    total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
    total_all_flights = sum(flight_count, na.rm = TRUE),
    overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
  )
# Calculate overall performance for AmWest
amwest_overall <- flight_data_long %>%
  filter(airline == "AMWEST") %>%
  summarise(
    total_delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
    total_all_flights = sum(flight_count, na.rm = TRUE),
    overall_delay_rate = (total_delayed_flights / total_all_flights) * 100
  )
# Create Alaska pie chart data
alaska_pie_data <- data.frame(
  status = c("On Time", "Delayed"),
  flights = c(alaska_overall$total_all_flights - alaska_overall$total_delayed_flights,
              alaska_overall$total_delayed_flights),
  percentage = c(100 - alaska_overall$overall_delay_rate,
                 alaska_overall$overall_delay_rate)
)
# Create AmWest pie chart data
amwest_pie_data <- data.frame(
  status = c("On Time", "Delayed"),
  flights = c(amwest_overall$total_all_flights - amwest_overall$total_delayed_flights,
              amwest_overall$total_delayed_flights),
  percentage = c(100 - amwest_overall$overall_delay_rate,
                 amwest_overall$overall_delay_rate)
)
# Alaska Airlines Pie Chart
ggplot(alaska_pie_data, aes(x = "", y = flights, fill = status)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(status, "\n", round(percentage, 1), "%")), 
            position = position_stack(vjust = 0.5), size = 4, fontface = "bold") +
  labs(title = "Alaska Airlines - Flight Performance") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "none",
    plot.margin = margin(30, 30, 30, 30, "pt")
  ) +
  scale_fill_manual(values = c("On Time" = "grey", "Delayed" = "orange"))
# AmWest Airlines Pie Chart
ggplot(amwest_pie_data, aes(x = "", y = flights, fill = status)) +
  geom_col(width = 1) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = paste0(status, "\n", round(percentage, 1), "%")), 
            position = position_stack(vjust = 0.5), size = 4, fontface = "bold") +
  labs(title = "AmWest Airlines - Flight Performance") +
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "none",
    plot.margin = margin(30, 30, 30, 30, "pt")
  ) +
  scale_fill_manual(values = c("On Time" = "grey", "Delayed" = "orange"))
# Print summary statistics
print("=== AIRLINE PERFORMANCE SUMMARY ===")
## [1] "=== AIRLINE PERFORMANCE SUMMARY ==="
print(paste("Alaska Airlines Delay Rate:", round(alaska_overall$overall_delay_rate, 1), "%"))
## [1] "Alaska Airlines Delay Rate: 13.3 %"
print(paste("AmWest Airlines Delay Rate:", round(amwest_overall$overall_delay_rate, 1), "%"))
## [1] "AmWest Airlines Delay Rate: 10.9 %"
##Summary of Overall Flight Performance Analysis:
Both Alaska Airlines (13.3% delay rate) and AmWest Airlines (10.9% delay rate) maintained exceptionally low delay frequencies, with fewer than 15% of flights experiencing delays across their route networks.
This consistently low delay performance across all five destinations—Los Angeles, Phoenix, San Diego, San Francisco, and Seattle—suggests both airlines have implemented operational strategies that effectively minimize disruptions. The sub-15% delay rates are particularly noteworthy in the airline industry, where delays can significantly impact passenger satisfaction, operational costs, and schedule reliability.
# Calculate Alaska delay percentage by city
alaska_delays <- flight_data_long %>%
  filter(airline == "ALASKA") %>%
  group_by(destination) %>%
  summarise(
    delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
    total_flights = sum(flight_count, na.rm = TRUE),
    delay_percentage = (delayed_flights / total_flights) * 100
  ) %>%
  arrange(desc(delay_percentage))
# Create bar graph (FIXED - removed xlim)
ggplot(alaska_delays, aes(x = reorder(destination, delay_percentage), y = delay_percentage)) +
  geom_col(fill = "orange", alpha = 0.8) +
  geom_text(aes(label = paste0(round(delay_percentage, 1), "%")), 
            hjust = -0.1, size = 4) +
  coord_flip() +
  labs(
    title = "Alaska Airlines: Delay Rate by City",
    subtitle = "Percentage of delayed flights per destination",
    x = "Destination City",
    y = "Delay Rate (%)"
  ) +
  theme_minimal() +
  ylim(0, max(alaska_delays$delay_percentage) * 1.1)  # Use ylim instead of xlim
Alaska’s delay rates varied significantly across destinations,
ranging from 5.2% in Phoenix to 16.9% in San Francisco. San Francisco
emerged as Alaska’s most challenging route with the highest delay
frequency, while Phoenix demonstrated the airline’s best operational
performance. This variation suggests that Alaska faces location-specific
operational challenges, potentially related to airport infrastructure,
weather patterns, or air traffic congestion at certain
destinations.
# Calculate AmWest delay percentage by city
amwest_delays <- flight_data_long %>%
  filter(airline == "AMWEST") %>%
  group_by(destination) %>%
  summarise(
    delayed_flights = sum(flight_count[status == "delayed"], na.rm = TRUE),
    total_flights = sum(flight_count, na.rm = TRUE),
    delay_percentage = (delayed_flights / total_flights) * 100
  ) %>%
  arrange(desc(delay_percentage))
# Create bar graph
ggplot(amwest_delays, aes(x = reorder(destination, delay_percentage), y = delay_percentage)) +
  geom_col(fill = "purple", alpha = 0.8) +
  geom_text(aes(label = paste0(round(delay_percentage, 1), "%")), 
            hjust = -0.1, size = 4) +
  coord_flip() +
  labs(
    title = "AmWest Airlines: Delay Rate by City",
    subtitle = "Percentage of delayed flights per destination",
    x = "Destination City",
    y = "Delay Rate (%)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),      # Center title
    plot.subtitle = element_text(hjust = 0.5),   # Center subtitle
    axis.title.x = element_text(hjust = 0.5),    # Center x-axis label
    axis.title.y = element_text(hjust = 0.5)     # Center y-axis label
  ) +
  ylim(0, max(amwest_delays$delay_percentage) * 1.1)
AmWest’s strongest performance was recorded in Phoenix, where
optimal operational conditions—possibly including favorable weather
patterns, efficient airport operations, or strategic scheduling—resulted
in minimal delays. Conversely, San Francisco presented the greatest
operational challenges, with higher delay rates likely attributed to
factors such as complex airspace management, or airport congestion,
staffing and scheduling challenges.
Both AMWest and Alaska Airlines exhibited their highest delay rates in San Francisco, indicating that this destination poses significant operational challenges for both carriers. This commonality suggests that external factors specific to San Francisco, such as air traffic congestion, weather conditions, or airport infrastructure limitations, may be contributing to delays across different airlines.The both showed their lowest delay rates in Phoenix, indicating that this destination offers more favorable operational conditions for both carriers.
Describe discrepancy between comparing two airlines’ flight performances city-by-city and overall. When comparing Alaska Airlines and AmWest’s performance, the overall analysis shows AmWest as the superior airline with a 10.9% delay rate compared to Alaska’s 13.3%. However, the city-by-city analysis reveals a more complex picture where Alaska outperforms AmWest in some destinations while AmWest excels in others. For example, Alaska may have significantly better performance in high-traffic routes like Los Angeles or Seattle, while AmWest dominates in Phoenix. This creates a discrepancy where the “better” airline depends entirely on which specific route passengers are considering, contradicting the overall ranking that suggests AmWest is universally superior.
Key Analytical Differences:
Explanation of the discrepancy between comparing two airlines’ flight performances city-by-city and overall
This discrepancy occurs because overall performance averages mask individual route variations. AmWest’s lower overall delay rate (10.9%) most likely results from their exceptional performance in specific cities—particularly high-volume routes where strong performance heavily influences the total. Meanwhile, Alaska’s higher overall rate (13.3%) may be skewed by poor performance on a few challenging routes, even though they excel on others. Overall percentages weight all flights equally, so an airline with excellent performance on high-traffic routes will show better overall statistics, even if they struggle on smaller routes.
Comparing the overall percentage of delays does not account for cities that can have extremely high delays nor does it account for events that can give extreme values such as snow storms, hurricanes, or other weather events. Even under staffing , plane maintenance, and air traffic control issues can cause delays that are not accounted for in the overall percentage. For example AWWest showed an average delay across all cities of 10.9% but the percent delay across cities showed values from 7.9% to 28.7&%. This is a significant difference that is not present in the overall percentage.