library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## âś” forcats 1.0.1 âś” readr 2.1.6
## âś” ggplot2 4.0.2 âś” stringr 1.6.0
## âś” lubridate 1.9.4 âś” tibble 3.3.1
## âś” purrr 1.2.1 âś” tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## âś– dplyr::filter() masks stats::filter()
## âś– dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
#Approach For Assignment 5A, the first step is to manually construct the dataset in Microsoft Excel using the structure provided in the original chart. The data will be organized in a wide format, preserving the layout of airlines, flight status (on time vs. delayed), and destinations.
After verifying accuracy, the file is exported as a comma-separated values (.csv) file to ensure compatibility with R. The CSV file was then uploaded to my GitHub repository, and the raw file URL was used to import the dataset directly into R using the read_csv() function.
get_url <- "https://raw.githubusercontent.com/japhet125/Assignement-Data-Science/refs/heads/main/Assignment5A.csv"
get_data <- read.csv(get_url)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## incomplete final line found by readTableHeader on
## 'https://raw.githubusercontent.com/japhet125/Assignement-Data-Science/refs/heads/main/Assignment5A.csv'
head(get_data)
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AmWest on time 697 4840 383 320 201
## 4 AmWest delayed 117 415 65 129 61
#colnames(get_data)
Once the data were successfully loaded, I will use the tidyverse packages, particularly tidyr and dplyr, to reshape and clean the dataset. Specifically, I will transforme the data from wide format to long format to facilitate comparison and analysis. This restructuring allowed for clearer aggregation and calculation of delay totals and delay rates across airlines and destinations.
The cleaned dataset will then used to compare arrival delays between Alaska and AmWest Airlines, focusing on both total delays and proportional delay rates to ensure a meaningful and accurate comparison.
airlines_delays <- get_data |>
pivot_longer(
cols = -c(Airline, Status),
names_to = "Destination",
values_to = "Count"
)
airlines_delays
## # A tibble: 20 Ă— 4
## Airline Status Destination Count
## <chr> <chr> <chr> <int>
## 1 Alaska on time Los.Angeles 497
## 2 Alaska on time Phoenix 221
## 3 Alaska on time San.Diego 212
## 4 Alaska on time San.Francisco 503
## 5 Alaska on time Seattle 1841
## 6 Alaska delayed Los.Angeles 62
## 7 Alaska delayed Phoenix 12
## 8 Alaska delayed San.Diego 20
## 9 Alaska delayed San.Francisco 102
## 10 Alaska delayed Seattle 305
## 11 AmWest on time Los.Angeles 697
## 12 AmWest on time Phoenix 4840
## 13 AmWest on time San.Diego 383
## 14 AmWest on time San.Francisco 320
## 15 AmWest on time Seattle 201
## 16 AmWest delayed Los.Angeles 117
## 17 AmWest delayed Phoenix 415
## 18 AmWest delayed San.Diego 65
## 19 AmWest delayed San.Francisco 129
## 20 AmWest delayed Seattle 61
#colnames(airlines_delays)
#trying wider and shorter
#airlines_delays <- get_data |>
# pivot_wider(
# names_from = "Airline",
# values_from = "Status"
# )
#airlines_delays
#colnames(airlines_delays)
By comparing Airlines delays we can see that AmWest have more delays than Alaska
airlines_delays |>
filter(Status == 'delayed') %>%
group_by(Airline, Destination) %>%
summarise(Total_delays = sum(Count), .groups = "drop")
## # A tibble: 10 Ă— 3
## Airline Destination Total_delays
## <chr> <chr> <int>
## 1 Alaska Los.Angeles 62
## 2 Alaska Phoenix 12
## 3 Alaska San.Diego 20
## 4 Alaska San.Francisco 102
## 5 Alaska Seattle 305
## 6 AmWest Los.Angeles 117
## 7 AmWest Phoenix 415
## 8 AmWest San.Diego 65
## 9 AmWest San.Francisco 129
## 10 AmWest Seattle 61
#we can compare airlines ontime
By comparing Airlines on time we can see that AmWest has more on time than Alaska
airlines_delays |>
filter(Status == 'on time') %>%
group_by(Airline, Destination) %>%
summarise(Total_on_time = sum(Count), .groups = "drop")
## # A tibble: 10 Ă— 3
## Airline Destination Total_on_time
## <chr> <chr> <int>
## 1 Alaska Los.Angeles 497
## 2 Alaska Phoenix 221
## 3 Alaska San.Diego 212
## 4 Alaska San.Francisco 503
## 5 Alaska Seattle 1841
## 6 AmWest Los.Angeles 697
## 7 AmWest Phoenix 4840
## 8 AmWest San.Diego 383
## 9 AmWest San.Francisco 320
## 10 AmWest Seattle 201
the delay rate show that Alaska has a delay rate greater than AmWest when combine.
delay_summary <- airlines_delays |>
# filter(Status) %>%
group_by(Airline, Destination, Status) %>%
summarise(Total = sum(Count), .groups = "drop") %>%
pivot_wider(names_from = "Status", values_from = "Total") %>%
mutate(delay_rate = delayed / (`on time` + delayed) )
# select(delay_rate)
delay_summary
## # A tibble: 10 Ă— 5
## Airline Destination delayed `on time` delay_rate
## <chr> <chr> <int> <int> <dbl>
## 1 Alaska Los.Angeles 62 497 0.111
## 2 Alaska Phoenix 12 221 0.0515
## 3 Alaska San.Diego 20 212 0.0862
## 4 Alaska San.Francisco 102 503 0.169
## 5 Alaska Seattle 305 1841 0.142
## 6 AmWest Los.Angeles 117 697 0.144
## 7 AmWest Phoenix 415 4840 0.0790
## 8 AmWest San.Diego 65 383 0.145
## 9 AmWest San.Francisco 129 320 0.287
## 10 AmWest Seattle 61 201 0.233
delay_summary |>
ggplot(aes( x = Destination, y = delay_rate, fill = Airline)) + geom_col(position = "dodge")
delay_summar <- airlines_delays |>
# filter(Status) %>%
group_by(Airline, Status) %>%
summarise(Total = sum(Count), .groups = "drop") %>%
pivot_wider(names_from = "Status", values_from = "Total") %>%
mutate(delay_rate = delayed / (`on time` + delayed) )
# select(delay_rate)
delay_summar
## # A tibble: 2 Ă— 4
## Airline delayed `on time` delay_rate
## <chr> <int> <int> <dbl>
## 1 Alaska 501 3274 0.133
## 2 AmWest 787 6441 0.109
#using ggplot to analyze AmWest has a greater total number of delayed flights compared to Alaska when aggregating across all destinations according to the ggplot bar geom_col.
airlines_delays |>
ggplot(aes(x = Airline, y = Count, fill = Status)) + geom_col()
# we can plot the delay rate and analyze by destination AmWest has a
greater total number of delayed flights compared to Alaska when
aggregating across all destinations.
delay_summary |>
ggplot(aes(x = Airline, y = delay_rate)) + geom_col()
#we can plot the total delay rate from all destination here with the
total delay rate of all destination we can see that Alaska has more
delay rate.
delay_summar |>
ggplot(aes(x = Airline, y = delay_rate)) + geom_col()
#Conclusion
The analysis demonstrates that while AmWest has a higher total number of delayed flights, this is largely due to operating a substantially greater number of flights overall. When examining proportional delay rates, Alaska exhibits a higher delay rate than AmWest.
This outcome highlights the importance of analyzing relative proportions rather than relying solely on aggregated totals. The results illustrate Simpson’s Paradox, where aggregated data can lead to misleading conclusions if underlying group distributions are not considered.
This assignment emphasizes the importance of data tidying, transformation, and careful interpretation when performing comparative analyses.