Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(ggplot2)

Read the information for .CSV file

flights_data <- read.csv ("https://raw.githubusercontent.com/JaydeeJan/Data-607-Assignment-4/refs/heads/main/flights_data.csv")
flights_data
##         X     X.1 Los.Angels Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time        497     221       212           503   1,841
## 2         Delayed         62      12        20           102     305
## 3                         NA                NA            NA        
## 4 AM West On Time        694   4,840       383           320     201
## 5         Delayed        117     415        65           129      61

Clean and organize the data

flights_clean <- flights_data[!apply(flights_data, 1, function(x) all(is.na(x) | x == "")),]
flights_clean
##         X     X.1 Los.Angels Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time        497     221       212           503   1,841
## 2         Delayed         62      12        20           102     305
## 4 AM West On Time        694   4,840       383           320     201
## 5         Delayed        117     415        65           129      61
flights_clean$X[2] <- "ALASKA"
flights_clean$X[4] <- "AM West"
flights_clean
##         X     X.1 Los.Angels Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time        497     221       212           503   1,841
## 2  ALASKA Delayed         62      12        20           102     305
## 4 AM West On Time        694   4,840       383           320     201
## 5 AM West Delayed        117     415        65           129      61
colnames(flights_clean)[1] <- "Airline"
colnames(flights_clean)[2] <- "Status"
colnames(flights_clean)
## [1] "Airline"       "Status"        "Los.Angels"    "Phoenix"      
## [5] "San.Diego"     "San.Francisco" "Seattle"
flights_clean
##   Airline  Status Los.Angels Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA On Time        497     221       212           503   1,841
## 2  ALASKA Delayed         62      12        20           102     305
## 4 AM West On Time        694   4,840       383           320     201
## 5 AM West Delayed        117     415        65           129      61

Tidy and transform the data from wide to long format

str(flights_clean)
## 'data.frame':    4 obs. of  7 variables:
##  $ Airline      : chr  "ALASKA" "ALASKA" "AM West" "AM West"
##  $ Status       : chr  "On Time" "Delayed" "On Time" "Delayed"
##  $ Los.Angels   : int  497 62 694 117
##  $ Phoenix      : chr  "221" "12" "4,840" "415"
##  $ San.Diego    : int  212 20 383 65
##  $ San.Francisco: int  503 102 320 129
##  $ Seattle      : chr  "1,841" "305" "201" "61"
flights_clean$Phoenix <- as.numeric((gsub(",", "", flights_clean$Phoenix)))

flights_clean$Seattle <- as.numeric(gsub(",", "", flights_clean$Seattle))

flights_reshape <- flights_clean %>%
  pivot_longer(cols = -c(Airline, Status),
               names_to = "City",
               values_to = "Flights")
flights_reshape
## # A tibble: 20 × 4
##    Airline Status  City          Flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 ALASKA  On Time Los.Angels        497
##  2 ALASKA  On Time Phoenix           221
##  3 ALASKA  On Time San.Diego         212
##  4 ALASKA  On Time San.Francisco     503
##  5 ALASKA  On Time Seattle          1841
##  6 ALASKA  Delayed Los.Angels         62
##  7 ALASKA  Delayed Phoenix            12
##  8 ALASKA  Delayed San.Diego          20
##  9 ALASKA  Delayed San.Francisco     102
## 10 ALASKA  Delayed Seattle           305
## 11 AM West On Time Los.Angels        694
## 12 AM West On Time Phoenix          4840
## 13 AM West On Time San.Diego         383
## 14 AM West On Time San.Francisco     320
## 15 AM West On Time Seattle           201
## 16 AM West Delayed Los.Angels        117
## 17 AM West Delayed Phoenix           415
## 18 AM West Delayed San.Diego          65
## 19 AM West Delayed San.Francisco     129
## 20 AM West Delayed Seattle            61

Filter for delayed flights and compare delayed flights from both airlines

delayed_flights <- flights_reshape %>%
  filter(Status == "Delayed")

summary_delays <- delayed_flights %>%
  group_by(City, Airline) %>%
  summarise(Total_Delays = sum(Flights),
            .groups = 'drop')

summary_delays
## # A tibble: 10 × 3
##    City          Airline Total_Delays
##    <chr>         <chr>          <dbl>
##  1 Los.Angels    ALASKA            62
##  2 Los.Angels    AM West          117
##  3 Phoenix       ALASKA            12
##  4 Phoenix       AM West          415
##  5 San.Diego     ALASKA            20
##  6 San.Diego     AM West           65
##  7 San.Francisco ALASKA           102
##  8 San.Francisco AM West          129
##  9 Seattle       ALASKA           305
## 10 Seattle       AM West           61
ggplot(data = summary_delays, aes(x = City, y = Total_Delays, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = " Comparison of Delayed Airline by City",
       x = "City",
       y = "Total Delays") +
  theme_minimal()

Filter on-time flights and compare on-time flights from both airlines

ontime_flights <- flights_reshape %>%
  filter(Status == "On Time")

summary_ontime <- ontime_flights %>%
  group_by(City, Airline) %>%
  summarise(Total_OnTime = sum(Flights),
            .groups = 'drop')

summary_ontime
## # A tibble: 10 × 3
##    City          Airline Total_OnTime
##    <chr>         <chr>          <dbl>
##  1 Los.Angels    ALASKA           497
##  2 Los.Angels    AM West          694
##  3 Phoenix       ALASKA           221
##  4 Phoenix       AM West         4840
##  5 San.Diego     ALASKA           212
##  6 San.Diego     AM West          383
##  7 San.Francisco ALASKA           503
##  8 San.Francisco AM West          320
##  9 Seattle       ALASKA          1841
## 10 Seattle       AM West          201
ggplot(data = summary_ontime, aes(x = City, y = Total_OnTime, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = " Comparison of On-Time Airline by City",
       x = "City",
       y = "Total On-Time") +
  theme_minimal()

Calculate the total and average delays for each airline across all cities.

delay_analysis <- delayed_flights %>%
  group_by(Airline) %>%
  summarise(
    Total_Delays = sum(Flights),
    Avg_Delays = mean(Flights)
  )

delay_analysis
## # A tibble: 2 × 3
##   Airline Total_Delays Avg_Delays
##   <chr>          <dbl>      <dbl>
## 1 ALASKA           501       100.
## 2 AM West          787       157.

Calculate the total and average on-time for each airline across all cities.

ontime_analysis <- ontime_flights %>%
  group_by(Airline) %>%
  summarise(
    Total_OnTime = sum(Flights),
    Avg_OnTime = mean(Flights)
  )

ontime_analysis
## # A tibble: 2 × 3
##   Airline Total_OnTime Avg_OnTime
##   <chr>          <dbl>      <dbl>
## 1 ALASKA          3274       655.
## 2 AM West         6438      1288.

Data showing total delayed & on-time percentage and average delayed & on-time percentage

total_flights <- flights_reshape %>%
  group_by(Airline, City) %>%
  summarize(Total_Flights = sum (Flights),
            .groups = 'drop')

total_flights
## # A tibble: 10 × 3
##    Airline City          Total_Flights
##    <chr>   <chr>                 <dbl>
##  1 ALASKA  Los.Angels              559
##  2 ALASKA  Phoenix                 233
##  3 ALASKA  San.Diego               232
##  4 ALASKA  San.Francisco           605
##  5 ALASKA  Seattle                2146
##  6 AM West Los.Angels              811
##  7 AM West Phoenix                5255
##  8 AM West San.Diego               448
##  9 AM West San.Francisco           449
## 10 AM West Seattle                 262
flight_percentage <- flights_reshape %>%
  left_join(total_flights, by = c("City", "Airline")) %>%
  mutate(Percentage = (Flights / Total_Flights) * 100)

flight_percentage
## # A tibble: 20 × 6
##    Airline Status  City          Flights Total_Flights Percentage
##    <chr>   <chr>   <chr>           <dbl>         <dbl>      <dbl>
##  1 ALASKA  On Time Los.Angels        497           559      88.9 
##  2 ALASKA  On Time Phoenix           221           233      94.8 
##  3 ALASKA  On Time San.Diego         212           232      91.4 
##  4 ALASKA  On Time San.Francisco     503           605      83.1 
##  5 ALASKA  On Time Seattle          1841          2146      85.8 
##  6 ALASKA  Delayed Los.Angels         62           559      11.1 
##  7 ALASKA  Delayed Phoenix            12           233       5.15
##  8 ALASKA  Delayed San.Diego          20           232       8.62
##  9 ALASKA  Delayed San.Francisco     102           605      16.9 
## 10 ALASKA  Delayed Seattle           305          2146      14.2 
## 11 AM West On Time Los.Angels        694           811      85.6 
## 12 AM West On Time Phoenix          4840          5255      92.1 
## 13 AM West On Time San.Diego         383           448      85.5 
## 14 AM West On Time San.Francisco     320           449      71.3 
## 15 AM West On Time Seattle           201           262      76.7 
## 16 AM West Delayed Los.Angels        117           811      14.4 
## 17 AM West Delayed Phoenix           415          5255       7.90
## 18 AM West Delayed San.Diego          65           448      14.5 
## 19 AM West Delayed San.Francisco     129           449      28.7 
## 20 AM West Delayed Seattle            61           262      23.3
# calculating the percentage of delayed and on-time performance for ALASKA Airline

alaska_flights <- flights_reshape %>%
  filter(Airline == "ALASKA")
alaska_flights
## # A tibble: 10 × 4
##    Airline Status  City          Flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 ALASKA  On Time Los.Angels        497
##  2 ALASKA  On Time Phoenix           221
##  3 ALASKA  On Time San.Diego         212
##  4 ALASKA  On Time San.Francisco     503
##  5 ALASKA  On Time Seattle          1841
##  6 ALASKA  Delayed Los.Angels         62
##  7 ALASKA  Delayed Phoenix            12
##  8 ALASKA  Delayed San.Diego          20
##  9 ALASKA  Delayed San.Francisco     102
## 10 ALASKA  Delayed Seattle           305
alaska_delayed <- alaska_flights %>%
  filter(Status == "Delayed") %>%
  summarize(Delayed_Flights = sum(Flights))
alaska_delayed
## # A tibble: 1 × 1
##   Delayed_Flights
##             <dbl>
## 1             501
alaska_ontime <- alaska_flights %>%
  filter(Status == "On Time") %>%
  summarize(OnTime_Flights = sum(Flights))
alaska_ontime
## # A tibble: 1 × 1
##   OnTime_Flights
##            <dbl>
## 1           3274
alaska_total <- alaska_flights %>%
  summarize(Total_Flights = sum(Flights))
alaska_total
## # A tibble: 1 × 1
##   Total_Flights
##           <dbl>
## 1          3775
alaska_delayed_percentage <- (alaska_delayed$Delayed_Flights / alaska_total$Total_Flights) * 100
alaska_delayed_percentage 
## [1] 13.27152
alaska_ontime_percentage <- (alaska_ontime$OnTime_Flights / alaska_total$Total_Flights) * 100
alaska_ontime_percentage 
## [1] 86.72848
# Calculating the percentage of delayed and on-time performance for AM West Airline

amwest_flights <- flights_reshape %>%
  filter(Airline == "AM West")
amwest_flights
## # A tibble: 10 × 4
##    Airline Status  City          Flights
##    <chr>   <chr>   <chr>           <dbl>
##  1 AM West On Time Los.Angels        694
##  2 AM West On Time Phoenix          4840
##  3 AM West On Time San.Diego         383
##  4 AM West On Time San.Francisco     320
##  5 AM West On Time Seattle           201
##  6 AM West Delayed Los.Angels        117
##  7 AM West Delayed Phoenix           415
##  8 AM West Delayed San.Diego          65
##  9 AM West Delayed San.Francisco     129
## 10 AM West Delayed Seattle            61
amwest_delayed <- amwest_flights %>%
  filter(Status == "Delayed") %>%
  summarize(Delayed_Flights = sum(Flights))
amwest_delayed
## # A tibble: 1 × 1
##   Delayed_Flights
##             <dbl>
## 1             787
amwest_ontime <- amwest_flights %>%
  filter(Status == "On Time") %>%
  summarize(OnTime_Flights = sum(Flights))
amwest_ontime
## # A tibble: 1 × 1
##   OnTime_Flights
##            <dbl>
## 1           6438
amwest_total <- amwest_flights %>%
  summarize(Total_Flights = sum(Flights))
amwest_total
## # A tibble: 1 × 1
##   Total_Flights
##           <dbl>
## 1          7225
amwest_delayed_percentage <- (amwest_delayed$Delayed_Flights / amwest_total$Total_Flights) * 100
amwest_delayed_percentage 
## [1] 10.89273
amwest_ontime_percentage <- (amwest_ontime$OnTime_Flights / amwest_total$Total_Flights) * 100
amwest_ontime_percentage 
## [1] 89.10727
# Comparing both airlines performance

airlines_analysis <- data.frame(
  Airline = c("ALASKA", "AM West"),
  Total_Flights = c(alaska_total$Total_Flights, amwest_total$Total_Flights),
  Delayed_Flights = c(alaska_delayed$Delayed_Flights, amwest_delayed$Delayed_Flights),
  OnTime_Flights = c(alaska_ontime$OnTime_Flights, amwest_ontime$OnTime_Flights),
  Delayed_Percentage = c(alaska_delayed_percentage, amwest_delayed_percentage),
  OnTime_Percentage = c(alaska_ontime_percentage, amwest_ontime_percentage)
)

airlines_analysis
##   Airline Total_Flights Delayed_Flights OnTime_Flights Delayed_Percentage
## 1  ALASKA          3775             501           3274           13.27152
## 2 AM West          7225             787           6438           10.89273
##   OnTime_Percentage
## 1          86.72848
## 2          89.10727

In Conclusion

Based on the analysis of the flights data for ALASKA and AM West, we can conclude both airlines show a good performance with over 86% of their flights arriving on time. However, AM West Airline appears to have less delays and higher on-time flights. Therefore, AM West offer a slightly more reliable service for passengers who priortized on-time performance.