# Load the airline arrival table (one column per destination city) from the
# GitHub repository.
db <- "https://raw.githubusercontent.com/hrensimin05/Data_607/master/flights.csv"
# Keep the text columns as character on every R version: before R 4.0.0
# read.csv() converted strings to factors by default, and the ifelse() calls
# applied to these columns during tidying would then return integer codes
# instead of the airline / status names.
flights <- read.csv(db, stringsAsFactors = FALSE)
flights
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Tidying operations: rename the two unnamed columns, drop the empty
# separator row, fill in the airline name on each "delayed" row, reshape so
# that every airline / city pair is a single row, and compute per-city totals
# plus the on-time and delayed proportions (rounded to 4 decimals).
flights <- flights %>%
  rename(airlines = X, arrival_status = X.1) %>%
  filter(arrival_status != "") %>%
  mutate(
    # A blank airline cell belongs to the airline named on the row above it.
    airlines = ifelse(airlines == "", dplyr::lag(airlines), airlines),
    # Use an underscore so the status can serve as a column name after spread().
    arrival_status = replace(
      arrival_status,
      arrival_status == "on time",
      "on_time"
    )
  ) %>%
  gather("CITY", "num", Los.Angeles:Seattle) %>%
  spread(arrival_status, num) %>%
  mutate(
    total = delayed + on_time,
    num_on_time = round(on_time / total, 4),
    num_delayed = round(delayed / total, 4)
  )
flights
## airlines CITY delayed on_time total num_on_time num_delayed
## 1 ALASKA Los.Angeles 62 497 559 0.8891 0.1109
## 2 ALASKA Phoenix 12 221 233 0.9485 0.0515
## 3 ALASKA San.Diego 20 212 232 0.9138 0.0862
## 4 ALASKA San.Francisco 102 503 605 0.8314 0.1686
## 5 ALASKA Seattle 305 1841 2146 0.8579 0.1421
## 6 AM WEST Los.Angeles 117 694 811 0.8557 0.1443
## 7 AM WEST Phoenix 415 4840 5255 0.9210 0.0790
## 8 AM WEST San.Diego 65 383 448 0.8549 0.1451
## 9 AM WEST San.Francisco 129 320 449 0.7127 0.2873
## 10 AM WEST Seattle 61 201 262 0.7672 0.2328
# Table of the proportion of delayed flights: one row per airline, one column
# per destination city (this matrix also feeds the grouped bar chart).
# The row labels are taken from the same reshaped table that supplies the
# values, so a label cannot end up attached to the wrong airline's row.
flights.redux <- local({
  delay_wide <- flights %>%
    select(airlines, CITY, num_delayed) %>%
    spread(CITY, num_delayed)
  delay_matrix <- delay_wide %>%
    select(Los.Angeles:Seattle) %>%
    as.matrix()
  rownames(delay_matrix) <- delay_wide$airlines
  delay_matrix
})
flights.redux
## Los.Angeles Phoenix San.Diego San.Francisco Seattle
## ALASKA 0.1109 0.0515 0.0862 0.1686 0.1421
## AM WEST 0.1443 0.0790 0.1451 0.2873 0.2328
# Grouped bar chart of the delayed-flight proportions: one group of bars per
# destination, one bar per airline (matrix row).
barplot(
  flights.redux,
  beside = TRUE,
  main = "Ratio of Delayed Flights",
  xlab = "DESTINATION",
  col = c("orange", "red")
)
# Legend colours are given in the same order as the matrix rows.
legend(
  "topleft",
  rownames(flights.redux),
  pch = 15,
  bty = "n",
  col = c("orange", "red")
)
# Table (and input for the bar chart) of the overall proportion of delayed
# flights per carrier, with all cities combined.
# The row labels are taken from the summarised table itself, so each label is
# guaranteed to sit on the row it was computed for.
combined_cities <- local({
  delay_overall <- flights %>%
    group_by(airlines) %>%
    summarise(num_delayed = sum(delayed) / sum(total), .groups = "drop")
  overall_matrix <- delay_overall %>%
    select(num_delayed) %>%
    as.matrix()
  rownames(overall_matrix) <- delay_overall$airlines
  overall_matrix
})
combined_cities
## num_delayed
## ALASKA 0.1327152
## AM WEST 0.1089273
# Bar chart of the pooled delayed-flight proportion, one bar per airline.
barplot(
  combined_cities,
  beside = TRUE,
  main = "Ratio of Delayed Flights",
  xlab = "Cities Together",
  col = c("orange", "yellow"),
  ylim = c(0, 0.22)
)
# Legend colours are given in the same order as the matrix rows.
legend(
  "topleft",
  rownames(combined_cities),
  pch = 15,
  bty = "n",
  col = c("orange", "yellow")
)
# Use prop.test() to check, city by city, whether the difference between the
# two airlines' delay proportions is statistically significant at the 5% level.
# The count variables keep their names because prop.test() prints the
# expressions it was called with on its "data:" line.
for (i in unique(flights$CITY)) {
  flights_db <- flights %>% filter(CITY == i)

  # Delayed flights per airline for this city.
  AM_delayed <- flights_db %>%
    filter(airlines == "AM WEST") %>%
    pull(delayed) %>%
    sum()
  Alaska_delayed <- flights_db %>%
    filter(airlines == "ALASKA") %>%
    pull(delayed) %>%
    sum()

  # Total flights per airline for this city.
  total_AM <- flights_db %>%
    filter(airlines == "AM WEST") %>%
    pull(total) %>%
    sum()
  total_Alaska <- flights_db %>%
    filter(airlines == "ALASKA") %>%
    pull(total) %>%
    sum()

  print("--------------------------------------------------------------------")
  cat("TEST FOR:", i)
  print(prop.test(c(AM_delayed,Alaska_delayed), c(total_AM,total_Alaska)))
}
## [1] "--------------------------------------------------------------------"
## TEST FOR: Los.Angeles
## 2-sample test for equality of proportions with continuity correction
##
## data: c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 2.954, df = 1, p-value = 0.08566
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.003687491 0.070395480
## sample estimates:
## prop 1 prop 2
## 0.1442663 0.1109123
##
## [1] "--------------------------------------------------------------------"
## TEST FOR: Phoenix
## 2-sample test for equality of proportions with continuity correction
##
## data: c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 1.9792, df = 1, p-value = 0.1595
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.004071878 0.059012400
## sample estimates:
## prop 1 prop 2
## 0.07897241 0.05150215
##
## [1] "--------------------------------------------------------------------"
## TEST FOR: San.Diego
## 2-sample test for equality of proportions with continuity correction
##
## data: c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 4.3218, df = 1, p-value = 0.03763
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.006949539 0.110815239
## sample estimates:
## prop 1 prop 2
## 0.1450893 0.0862069
##
## [1] "--------------------------------------------------------------------"
## TEST FOR: San.Francisco
## 2-sample test for equality of proportions with continuity correction
##
## data: c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 20.535, df = 1, p-value = 5.855e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.06537096 0.17204920
## sample estimates:
## prop 1 prop 2
## 0.2873051 0.1685950
##
## [1] "--------------------------------------------------------------------"
## TEST FOR: Seattle
## 2-sample test for equality of proportions with continuity correction
##
## data: c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 14.207, df = 1, p-value = 0.0001637
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.03529326 0.14610583
## sample estimates:
## prop 1 prop 2
## 0.2328244 0.1421249
# A significance level of 0.05 means accepting a 5% risk of concluding that a
# difference exists when there is no actual difference. At that level we
# reject the null hypothesis of equal delay proportions for San Diego,
# San Francisco and Seattle (p-value < 0.05), and we fail to reject it for
# Los Angeles and Phoenix (p-value > 0.05).
# Repeat the prop.test() comparison on the totals pooled over all cities.
db2 <- flights %>%
  group_by(airlines) %>%
  summarise(delayed = sum(delayed), total = sum(total), .groups = "drop")

# Delayed and total flight counts per airline. The variable names are kept
# because prop.test() prints the expressions it was called with.
Am_delay <- db2 %>%
  filter(airlines == "AM WEST") %>%
  pull(delayed) %>%
  sum()
Alaska_delay <- db2 %>%
  filter(airlines == "ALASKA") %>%
  pull(delayed) %>%
  sum()
totalAM <- db2 %>%
  filter(airlines == "AM WEST") %>%
  pull(total) %>%
  sum()
totalAlaska <- db2 %>%
  filter(airlines == "ALASKA") %>%
  pull(total) %>%
  sum()

print(prop.test(c(Am_delay,Alaska_delay), c(totalAM,totalAlaska)))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(Am_delay, Alaska_delay) out of c(totalAM, totalAlaska)
## X-squared = 13.343, df = 1, p-value = 0.0002594
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.03697936 -0.01059643
## sample estimates:
## prop 1 prop 2
## 0.1089273 0.1327152
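# At the 5% level of significance, we reject the null hypothesis that the
# difference in proportions is zero. Note that the direction is reversed
# compared with the city-level tests: pooled over all cities, AM WEST has the
# lower delay proportion (0.109 vs 0.133), although ALASKA has the lower
# proportion in every individual city.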