607_week5hm

Building the table, creating a CSV file from it, and then saving it. The wide structure of the table is preserved when creating both the table and the df.

library(tibble)
library(readr)
library(tidyr)
library(dplyr)
library(kableExtra)

# Flight counts in their original wide layout: one row per airline/status
# pair and one column per destination city.
data <- list(
  Airline       = rep(c("ALASKA", "AM WEST"), each = 2),
  Status        = rep(c("on time", "delayed"), times = 2),
  Los_Angeles   = c(497, 62, 694, 117),
  Phoenix       = c(221, 12, 4840, 415),
  San_Diego     = c(212, 20, 383, 65),
  San_Francisco = c(503, 102, 320, 129),
  Seattle       = c(1841, 305, 201, 61)
)

# Turn the named list into a tibble (every element has length 4).
df <- as_tibble(data)

# Replace any spaces in the column names with underscores.
names(df) <- gsub(" ", "_", names(df), fixed = TRUE)

# I then saved the df as a CSV file in a hidden code chunk to avoid publicly showing directory paths.

Performing the necessary tidying and transformations on the df:

# The tidying steps below operate on `flights_data`.
flights_data <- df

# Stack the city columns into City/Count pairs, giving one row per
# airline, status and city.
flights_long <- pivot_longer(
  flights_data,
  cols = Los_Angeles:Seattle,
  names_to = "City",
  values_to = "Count"
)

# Spread back out to one row per city, with a column for every
# airline-status combination.
flights_wide <- pivot_wider(
  flights_long,
  names_from = c(Airline, Status),
  values_from = Count
)

# Values such as "on time" and "AM WEST" leave spaces in the new column
# names; replace them with underscores.
names(flights_wide) <- gsub(" ", "_", names(flights_wide), fixed = TRUE)

Now to the analysis:

Method #1

I will first perform an odds ratio analysis with a 95% confidence interval on the data provided. To do that, I need to create a 2x2 contingency table of the ‘on time’ and ‘delayed’ counts for the two airlines. This table will compare the odds of being on time or delayed for ALASKA against AM WEST. I will use the fisher.test() function to get the odds ratio and its confidence interval for the 2x2 contingency table.

# Total flights per airline for each status, summed over all cities.
# Element order in both vectors is ALASKA, then AM WEST.
on_time_counts <- c(
  sum(flights_wide[["ALASKA_on_time"]]),
  sum(flights_wide[["AM_WEST_on_time"]])
)
delayed_counts <- c(
  sum(flights_wide[["ALASKA_delayed"]]),
  sum(flights_wide[["AM_WEST_delayed"]])
)

# 2x2 contingency table: statuses in rows, airlines in columns.
contingency_table <- rbind(on_time_counts, delayed_counts)
dimnames(contingency_table) <- list(
  c("On Time", "Delayed"),
  c("ALASKA", "AM WEST")
)

# Render the table as styled HTML.
kable_styling(
  kable(contingency_table, "html"),
  bootstrap_options = c("striped", "hover")
)

	ALASKA	AM WEST
On Time	3274	6438
Delayed	501	787

# Fisher's exact test on the 2x2 status-by-airline table.
odds_ratio_test <- fisher.test(contingency_table)

# Pull out the estimated odds ratio, its confidence interval and the p-value.
odds_ratio <- odds_ratio_test[["estimate"]]
conf_int <- odds_ratio_test[["conf.int"]]
p_value <- odds_ratio_test[["p.value"]]

# Show the three results together.
list(
  OddsRatio = odds_ratio,
  ConfInt95 = conf_int,
  PValue = p_value
)

## $OddsRatio
## odds ratio 
##  0.7988827 
## 
## $ConfInt95
## [1] 0.7076985 0.9023705
## attr(,"conf.level")
## [1] 0.95
## 
## $PValue
## [1] 0.0002569764

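Because "On Time" is the first row of the contingency table, fisher.test() reported the OR for on-time flights rather than for delays. To get the OR of the delays, I am going to rearrange the contingency table so that "Delayed" is the first row: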

# Same counts as `contingency_table`, but with "Delayed" as the first row so
# the reported odds ratio describes delays rather than on-time flights.
# The rows are taken from `contingency_table` by name, so the two tables
# always hold the same totals.
contingency_table2 <- contingency_table[c("Delayed", "On Time"), ]

# One-sided test (alternative hypothesis: odds ratio > 1). With
# alternative = "greater" the confidence interval returned is one-sided too,
# so its upper limit is Inf.
odds_ratio_test2 <- fisher.test(contingency_table2, alternative = "greater")

# Estimated odds ratio, confidence interval and p-value for delays.
odds_ratio_delays2 <- odds_ratio_test2$estimate
conf_int_delays2 <- odds_ratio_test2$conf.int
p_value_delays2 <- odds_ratio_test2$p.value

# Render the reordered table as styled HTML.
contingency_table2 %>%
  kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

	ALASKA	AM WEST
Delayed	501	787
On Time	3274	6438

# Odds ratio, confidence interval and p-value for the delayed-first table.
list(
  OddsRatioDelays2 = odds_ratio_delays2,
  ConfInt95Delays = conf_int_delays2,
  PValueDelays = p_value_delays2
)

## $OddsRatioDelays2
## odds ratio 
##   1.251748 
## 
## $ConfInt95Delays
## [1] 1.129829      Inf
## attr(,"conf.level")
## [1] 0.95
## 
## $PValueDelays
## [1] 0.0001445432

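So the OR for having delayed flights was 1.25, with a p value of 0.0001. Because the test was run with alternative = "greater", the 95% confidence interval is one-sided: its lower bound is 1.13 and it has no upper bound (infinity). This suggests that a flight with Alaska Airlines had higher odds of being delayed than a flight with AM West.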

Method #2

I will compare the proportions of delays between the two airlines and then test for any statistical difference using the chi-square test:

# Total flights across all city columns for one airline/status pair.
#
# Rows are selected by their Airline and Status labels and the city columns
# by name (every column other than Airline and Status), so the result does
# not depend on the row order or column positions of the table.
#
# flights: wide table with Airline, Status and one count column per city.
# airline, status: labels to match exactly, e.g. "ALASKA" and "on time".
# Returns a single number; stops if no row matches the pair.
count_flights <- function(flights, airline, status) {
  city_cols <- setdiff(names(flights), c("Airline", "Status"))
  is_match <- flights$Airline == airline & flights$Status == status
  # Guard against a mistyped label silently producing a total of 0.
  stopifnot(any(is_match))
  sum(flights[is_match, city_cols])
}

on_time_flights_alaska <- count_flights(df, "ALASKA", "on time")
on_time_flights_amwest <- count_flights(df, "AM WEST", "on time")

delayed_flights_alaska <- count_flights(df, "ALASKA", "delayed")
delayed_flights_amwest <- count_flights(df, "AM WEST", "delayed")

total_flights_alaska <- on_time_flights_alaska + delayed_flights_alaska
total_flights_amwest <- on_time_flights_amwest + delayed_flights_amwest

# Share of each airline's flights that were delayed.
prop_delayed_alaska <- delayed_flights_alaska / total_flights_alaska
prop_delayed_amwest <- delayed_flights_amwest / total_flights_amwest

# chi-squared test
# 2x2 table of observed counts: statuses in rows, airlines in columns.
observed <- matrix(
  c(
    delayed_flights_alaska, delayed_flights_amwest,
    on_time_flights_alaska, on_time_flights_amwest
  ),
  nrow = 2, byrow = TRUE,
  dimnames = list(
    c("Delayed", "On Time"),
    c("ALASKA", "AM WEST")
  )
)
# Called with default arguments; for a 2x2 table chisq.test() applies
# Yates' continuity correction unless correct = FALSE is passed.
chi_squared_test <- chisq.test(observed)

# Render the observed counts as styled HTML.
observed %>%
  kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

	ALASKA	AM WEST
Delayed	501	787
On Time	3274	6438

# Summary of the comparison: total flights, delayed flights and the delayed
# proportion for each airline, plus the chi-squared p-value.
list(
  TotalFlights = c(
    ALASKA = total_flights_alaska,
    AMWEST = total_flights_amwest
  ),
  DelayedFlights = c(
    ALASKA = delayed_flights_alaska,
    AMWEST = delayed_flights_amwest
  ),
  ProportionDelayed = c(
    ALASKA = prop_delayed_alaska,
    AMWEST = prop_delayed_amwest
  ),
  ChiSquaredPValue = chi_squared_test[["p.value"]]
)

## $TotalFlights
## ALASKA AMWEST 
##   3775   7225 
## 
## $DelayedFlights
## ALASKA AMWEST 
##    501    787 
## 
## $ProportionDelayed
##    ALASKA    AMWEST 
## 0.1327152 0.1089273 
## 
## $ChiSquaredPValue
## [1] 0.0002594445

This result confirms the initial result acquired via the odds ratio. The proportion of delayed flights with Alaska Airlines (13.3%) is greater than that with AM West (10.9%), and the chi-square p value is approximately 0.0003.