library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” forcats   1.0.1     âś” readr     2.1.6
## âś” ggplot2   4.0.2     âś” stringr   1.6.0
## âś” lubridate 1.9.4     âś” tibble    3.3.1
## âś” purrr     1.2.1     âś” tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)

#Approach For Assignment 5A, the first step is to manually construct the dataset in Microsoft Excel using the structure provided in the original chart. The data will be organized in a wide format, preserving the layout of airlines, flight status (on time vs. delayed), and destinations.

After verifying accuracy, the file is exported as a comma-separated values (.csv) file to ensure compatibility with R. The CSV file was then uploaded to my GitHub repository, and the raw file URL was used to import the dataset directly into R using the read_csv() function.

get_url <- "https://raw.githubusercontent.com/japhet125/Assignement-Data-Science/refs/heads/main/Assignment5A.csv"
get_data <- read.csv(get_url)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on
## 'https://raw.githubusercontent.com/japhet125/Assignement-Data-Science/refs/heads/main/Assignment5A.csv'
head(get_data)
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  Alaska on time         497     221       212           503    1841
## 2  Alaska delayed          62      12        20           102     305
## 3  AmWest on time         697    4840       383           320     201
## 4  AmWest delayed         117     415        65           129      61
#colnames(get_data)

Once the data were successfully loaded, I will use the tidyverse packages, particularly tidyr and dplyr, to reshape and clean the dataset. Specifically, I will transforme the data from wide format to long format to facilitate comparison and analysis. This restructuring allowed for clearer aggregation and calculation of delay totals and delay rates across airlines and destinations.

The cleaned dataset will then used to compare arrival delays between Alaska and AmWest Airlines, focusing on both total delays and proportional delay rates to ensure a meaningful and accurate comparison.

airlines_delays <- get_data |>
  pivot_longer(
    cols = -c(Airline, Status),
    names_to = "Destination",
    values_to = "Count"
    
  )
airlines_delays
## # A tibble: 20 Ă— 4
##    Airline Status  Destination   Count
##    <chr>   <chr>   <chr>         <int>
##  1 Alaska  on time Los.Angeles     497
##  2 Alaska  on time Phoenix         221
##  3 Alaska  on time San.Diego       212
##  4 Alaska  on time San.Francisco   503
##  5 Alaska  on time Seattle        1841
##  6 Alaska  delayed Los.Angeles      62
##  7 Alaska  delayed Phoenix          12
##  8 Alaska  delayed San.Diego        20
##  9 Alaska  delayed San.Francisco   102
## 10 Alaska  delayed Seattle         305
## 11 AmWest  on time Los.Angeles     697
## 12 AmWest  on time Phoenix        4840
## 13 AmWest  on time San.Diego       383
## 14 AmWest  on time San.Francisco   320
## 15 AmWest  on time Seattle         201
## 16 AmWest  delayed Los.Angeles     117
## 17 AmWest  delayed Phoenix         415
## 18 AmWest  delayed San.Diego        65
## 19 AmWest  delayed San.Francisco   129
## 20 AmWest  delayed Seattle          61
#colnames(airlines_delays)

#trying wider and shorter

#airlines_delays <- get_data |>
#  pivot_wider(
  #  names_from = "Airline",
  #  values_from = "Status"
 # )
#airlines_delays
#colnames(airlines_delays)

we can compare airlines delays

By comparing Airlines delays we can see that AmWest have more delays than Alaska

airlines_delays |>
  filter(Status == 'delayed') %>%
  group_by(Airline, Destination) %>%
  summarise(Total_delays = sum(Count), .groups = "drop")
## # A tibble: 10 Ă— 3
##    Airline Destination   Total_delays
##    <chr>   <chr>                <int>
##  1 Alaska  Los.Angeles             62
##  2 Alaska  Phoenix                 12
##  3 Alaska  San.Diego               20
##  4 Alaska  San.Francisco          102
##  5 Alaska  Seattle                305
##  6 AmWest  Los.Angeles            117
##  7 AmWest  Phoenix                415
##  8 AmWest  San.Diego               65
##  9 AmWest  San.Francisco          129
## 10 AmWest  Seattle                 61

#we can compare airlines ontime

By comparing Airlines on time we can see that AmWest has more on time than Alaska

airlines_delays |>
  filter(Status == 'on time') %>%
  group_by(Airline, Destination) %>%
  summarise(Total_on_time = sum(Count), .groups = "drop")
## # A tibble: 10 Ă— 3
##    Airline Destination   Total_on_time
##    <chr>   <chr>                 <int>
##  1 Alaska  Los.Angeles             497
##  2 Alaska  Phoenix                 221
##  3 Alaska  San.Diego               212
##  4 Alaska  San.Francisco           503
##  5 Alaska  Seattle                1841
##  6 AmWest  Los.Angeles             697
##  7 AmWest  Phoenix                4840
##  8 AmWest  San.Diego               383
##  9 AmWest  San.Francisco           320
## 10 AmWest  Seattle                 201

Calculating the delays rate

the delay rate show that Alaska has a delay rate greater than AmWest when combine.

delay_summary <- airlines_delays |>
 # filter(Status) %>%
  group_by(Airline, Destination, Status) %>%
  summarise(Total = sum(Count), .groups = "drop") %>%
  pivot_wider(names_from = "Status", values_from = "Total") %>%
  mutate(delay_rate = delayed / (`on time` + delayed) )
 # select(delay_rate)
delay_summary
## # A tibble: 10 Ă— 5
##    Airline Destination   delayed `on time` delay_rate
##    <chr>   <chr>           <int>     <int>      <dbl>
##  1 Alaska  Los.Angeles        62       497     0.111 
##  2 Alaska  Phoenix            12       221     0.0515
##  3 Alaska  San.Diego          20       212     0.0862
##  4 Alaska  San.Francisco     102       503     0.169 
##  5 Alaska  Seattle           305      1841     0.142 
##  6 AmWest  Los.Angeles       117       697     0.144 
##  7 AmWest  Phoenix           415      4840     0.0790
##  8 AmWest  San.Diego          65       383     0.145 
##  9 AmWest  San.Francisco     129       320     0.287 
## 10 AmWest  Seattle            61       201     0.233
delay_summary |>
  ggplot(aes( x = Destination, y = delay_rate, fill = Airline)) + geom_col(position = "dodge")

delay_summar <- airlines_delays |>
 # filter(Status) %>%
  group_by(Airline, Status) %>%
  summarise(Total = sum(Count), .groups = "drop") %>%
  pivot_wider(names_from = "Status", values_from = "Total") %>%
  mutate(delay_rate = delayed / (`on time` + delayed) )
 # select(delay_rate)
delay_summar
## # A tibble: 2 Ă— 4
##   Airline delayed `on time` delay_rate
##   <chr>     <int>     <int>      <dbl>
## 1 Alaska      501      3274      0.133
## 2 AmWest      787      6441      0.109

#using ggplot to analyze AmWest has a greater total number of delayed flights compared to Alaska when aggregating across all destinations according to the ggplot bar geom_col.

airlines_delays |>
  ggplot(aes(x =  Airline, y = Count, fill = Status)) + geom_col()

# we can plot the delay rate and analyze by destination AmWest has a greater total number of delayed flights compared to Alaska when aggregating across all destinations.

delay_summary |>
  ggplot(aes(x = Airline, y = delay_rate)) + geom_col()

#we can plot the total delay rate from all destination here with the total delay rate of all destination we can see that Alaska has more delay rate.

delay_summar |>
  ggplot(aes(x = Airline, y = delay_rate)) + geom_col()

#Conclusion

The analysis demonstrates that while AmWest has a higher total number of delayed flights, this is largely due to operating a substantially greater number of flights overall. When examining proportional delay rates, Alaska exhibits a higher delay rate than AmWest.

This outcome highlights the importance of analyzing relative proportions rather than relying solely on aggregated totals. The results illustrate Simpson’s Paradox, where aggregated data can lead to misleading conclusions if underlying group distributions are not considered.

This assignment emphasizes the importance of data tidying, transformation, and careful interpretation when performing comparative analyses.