library(tibble)
library(readr)
library(tidyr)
library(dplyr)
library(kableExtra)
# Flight counts by airline and arrival status, one numeric column per
# destination city.
data <- list(
  Airline       = c("ALASKA", "ALASKA", "AM WEST", "AM WEST"),
  Status        = c("on time", "delayed", "on time", "delayed"),
  Los_Angeles   = c(497, 62, 694, 117),
  Phoenix       = c(221, 12, 4840, 415),
  San_Diego     = c(212, 20, 383, 65),
  San_Francisco = c(503, 102, 320, 129),
  Seattle       = c(1841, 305, 201, 61)
)

# Convert to a tibble and make sure no column name contains a space.
df <- as_tibble(data)
names(df) <- chartr(" ", "_", names(df))
# The data frame was saved as a CSV file in a hidden code chunk (hidden to
# avoid publicly showing directory paths); the in-memory copy is used here.
flights_data <- df

# Long format: one row per airline/status/city with its flight count.
flights_long <- pivot_longer(
  flights_data,
  cols = Los_Angeles:Seattle,
  names_to = "City",
  values_to = "Count"
)

# Wide format: one row per city, one count column per airline-status pair.
flights_wide <- pivot_wider(
  flights_long,
  names_from = c(Airline, Status),
  values_from = Count
)

# The airline and status labels contain spaces; replace them with underscores
# so the new columns can be referenced with `$`.
names(flights_wide) <- chartr(" ", "_", names(flights_wide))
I will first perform an odds ratio analysis with a 95% confidence interval on the data provided. This requires a 2x2 contingency table of flight status (‘on time’ and ‘delayed’) by airline, which compares the odds of being on time or delayed for ALASKA against AM WEST. I will use the fisher.test() function to get the odds ratio and its confidence interval from this 2x2 contingency table.
# Total flights over all cities for each airline, split by status.
# Each vector is ordered ALASKA first, then AM WEST.
on_time_counts <- c(
  sum(flights_wide[["ALASKA_on_time"]]),
  sum(flights_wide[["AM_WEST_on_time"]])
)
delayed_counts <- c(
  sum(flights_wide[["ALASKA_delayed"]]),
  sum(flights_wide[["AM_WEST_delayed"]])
)

# 2x2 contingency table: status in rows, airline in columns.
contingency_table <- rbind(on_time_counts, delayed_counts)
dimnames(contingency_table) <- list(
  c("On Time", "Delayed"),
  c("ALASKA", "AM WEST")
)

# Render the table as styled HTML.
kable_styling(
  kable(contingency_table, "html"),
  bootstrap_options = c("striped", "hover")
)
| | ALASKA | AM WEST |
|---|---|---|
| On Time | 3274 | 6438 |
| Delayed | 501 | 787 |
# Fisher's exact test on the 2x2 table (default two-sided alternative).
odds_ratio_test <- fisher.test(contingency_table)

# Pull out the odds ratio estimate, its confidence interval, and the p-value.
odds_ratio <- odds_ratio_test[["estimate"]]
conf_int <- odds_ratio_test[["conf.int"]]
p_value <- odds_ratio_test[["p.value"]]

# Print the three results together.
list(
  OddsRatio = odds_ratio,
  ConfInt95 = conf_int,
  PValue = p_value
)
## $OddsRatio
## odds ratio
## 0.7988827
##
## $ConfInt95
## [1] 0.7076985 0.9023705
## attr(,"conf.level")
## [1] 0.95
##
## $PValue
## [1] 0.0002569764
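With "On Time" as the first row of the table, fisher.test() reports the odds ratio for being on time rather than for being delayed, so I am going to rearrange the contingency table (putting "Delayed" in the first row) to get the OR of the delays: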
# Same counts as `contingency_table`, with the "Delayed" row placed first so
# the odds ratio refers to delays. The rows are reordered by name rather than
# re-typing the four counts, so the two tables cannot drift apart.
contingency_table2 <- contingency_table[c("Delayed", "On Time"), ]

# alternative = "greater" makes this a one-sided test (H1: odds ratio > 1);
# the confidence interval returned is therefore one-sided as well, with an
# upper bound of Inf.
odds_ratio_test2 <- fisher.test(contingency_table2, alternative = "greater")

# Odds ratio estimate, its (one-sided) confidence interval, and the p-value.
odds_ratio_delays2 <- odds_ratio_test2$estimate
conf_int_delays2 <- odds_ratio_test2$conf.int
p_value_delays2 <- odds_ratio_test2$p.value

# Render the rearranged table as styled HTML.
contingency_table2 %>%
  kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
| | ALASKA | AM WEST |
|---|---|---|
| Delayed | 501 | 787 |
| On Time | 3274 | 6438 |
# Print the delay odds ratio, its confidence interval, and the p-value.
list(
  OddsRatioDelays2 = odds_ratio_delays2,
  ConfInt95Delays = conf_int_delays2,
  PValueDelays = p_value_delays2
)
## $OddsRatioDelays2
## odds ratio
## 1.251748
##
## $ConfInt95Delays
## [1] 1.129829 Inf
## attr(,"conf.level")
## [1] 0.95
##
## $PValueDelays
## [1] 0.0001445432
So the OR for having delayed flights was 1.25, with a one-sided 95% confidence interval of 1.13 to infinity (the test was run with alternative = "greater") and a p value of 0.0001. This suggests that a flight with Alaska airlines had higher odds of being delayed than a flight with AM West.
I will compare the proportions of delays between the two airlines and then test for any statistical difference using the chi-square test:
# Total flights per airline/status row, summed over every city column.
# Rows are looked up by their "<Airline> <Status>" label and city columns by
# name, so the totals stay correct if the rows or columns of `df` are
# reordered (the positional form df[1, 3:7] would silently pick the wrong
# cells).
city_cols <- setdiff(names(df), c("Airline", "Status"))
flight_totals <- rowSums(df[city_cols])
names(flight_totals) <- paste(df$Airline, df$Status)

# `[[` drops the label so each total is a plain unnamed number.
on_time_flights_alaska <- flight_totals[["ALASKA on time"]]
on_time_flights_amwest <- flight_totals[["AM WEST on time"]]
delayed_flights_alaska <- flight_totals[["ALASKA delayed"]]
delayed_flights_amwest <- flight_totals[["AM WEST delayed"]]

# Overall flights and proportion delayed for each airline.
total_flights_alaska <- on_time_flights_alaska + delayed_flights_alaska
total_flights_amwest <- on_time_flights_amwest + delayed_flights_amwest
prop_delayed_alaska <- delayed_flights_alaska / total_flights_alaska
prop_delayed_amwest <- delayed_flights_amwest / total_flights_amwest

# Chi-squared test on the 2x2 table of status (rows) by airline (columns).
# chisq.test() is called with its defaults, which for a 2x2 table include the
# continuity correction.
observed <- matrix(
  c(
    delayed_flights_alaska, delayed_flights_amwest,
    on_time_flights_alaska, on_time_flights_amwest
  ),
  nrow = 2, byrow = TRUE,
  dimnames = list(
    c("Delayed", "On Time"),
    c("ALASKA", "AM WEST")
  )
)
chi_squared_test <- chisq.test(observed)

# Render the observed counts as styled HTML.
observed %>%
  kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
| | ALASKA | AM WEST |
|---|---|---|
| Delayed | 501 | 787 |
| On Time | 3274 | 6438 |
# Print per-airline totals, delayed counts, delayed proportions, and the
# chi-squared p-value.
airline_labels <- c("ALASKA", "AMWEST")
list(
  TotalFlights = setNames(
    c(total_flights_alaska, total_flights_amwest),
    airline_labels
  ),
  DelayedFlights = setNames(
    c(delayed_flights_alaska, delayed_flights_amwest),
    airline_labels
  ),
  ProportionDelayed = setNames(
    c(prop_delayed_alaska, prop_delayed_amwest),
    airline_labels
  ),
  ChiSquaredPValue = chi_squared_test$p.value
)
## $TotalFlights
## ALASKA AMWEST
## 3775 7225
##
## $DelayedFlights
## ALASKA AMWEST
## 501 787
##
## $ProportionDelayed
## ALASKA AMWEST
## 0.1327152 0.1089273
##
## $ChiSquaredPValue
## [1] 0.0002594445
This result is consistent with the odds ratio analysis above. The proportion of delayed flights with Alaska airlines (13.3%) is greater than that with AM West airlines (10.9%), and the Chi-square p value is approximately 0.0003.