Data607

Homework 5

# loading csv from github repository

# Raw CSV of arrival counts per airline and destination, hosted on GitHub.
db <- "https://raw.githubusercontent.com/hrensimin05/Data_607/master/flights.csv"

# Read text columns as character explicitly: the tidying step below compares
# them with "" and rewrites their values, which misbehaves on factors (the
# read.csv default before R 4.0).
flights <- read.csv(db, stringsAsFactors = FALSE)

# Show the raw, untidy table.
flights

##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61

# Tidying operations: rename the columns, filter out the empty row, fill in the missing airline names, reshape the data to one row per airline and destination, and compute the on-time and delayed proportions by destination
# The pipeline below relies on dplyr verbs and tidyr reshaping functions, so
# make sure both packages are attached before it runs.
library(dplyr)
library(tidyr)

# Tidy the raw table into one row per (airline, destination) with delayed and
# on-time counts, their total, and the corresponding proportions.
flights <- flights %>%
  rename(airlines = X, arrival_status = X.1) %>%
  # Drop the blank separator row between the two carriers.
  filter(arrival_status != "") %>%
  mutate(
    # Blank carrier cells become NA so they can be filled from the row above.
    airlines = na_if(airlines, ""),
    # Make the status usable as a column name after widening.
    arrival_status = if_else(arrival_status == "on time", "on_time", arrival_status)
  ) %>%
  # Carry each airline name down over every blank row that follows it
  # (not just the first one, which is all a single lag() would cover).
  fill(airlines, .direction = "down") %>%
  # Long format: one row per airline / status / destination.
  pivot_longer(cols = Los.Angeles:Seattle, names_to = "CITY", values_to = "num") %>%
  # Wide on status: one `delayed` and one `on_time` column.
  pivot_wider(names_from = arrival_status, values_from = num) %>%
  # Fix column and row order explicitly so the printed table is stable.
  select(airlines, CITY, delayed, on_time) %>%
  arrange(airlines, CITY) %>%
  mutate(
    total = delayed + on_time,
    num_on_time = round(on_time / total, 4),
    num_delayed = round(delayed / total, 4)
  ) %>%
  # Keep a plain data.frame, as the rest of the script expects.
  as.data.frame()

flights

##    airlines          CITY delayed on_time total num_on_time num_delayed
## 1    ALASKA   Los.Angeles      62     497   559      0.8891      0.1109
## 2    ALASKA       Phoenix      12     221   233      0.9485      0.0515
## 3    ALASKA     San.Diego      20     212   232      0.9138      0.0862
## 4    ALASKA San.Francisco     102     503   605      0.8314      0.1686
## 5    ALASKA       Seattle     305    1841  2146      0.8579      0.1421
## 6   AM WEST   Los.Angeles     117     694   811      0.8557      0.1443
## 7   AM WEST       Phoenix     415    4840  5255      0.9210      0.0790
## 8   AM WEST     San.Diego      65     383   448      0.8549      0.1451
## 9   AM WEST San.Francisco     129     320   449      0.7127      0.2873
## 10  AM WEST       Seattle      61     201   262      0.7672      0.2328

# Graph and table of proportion of delayed flights 
# Build an airline x destination matrix of delayed-flight proportions, in the
# shape barplot() expects (one row per carrier, one column per city).
flights.redux <- flights %>%
  select(airlines, CITY, num_delayed) %>%
  pivot_wider(names_from = CITY, values_from = num_delayed) %>%
  as.data.frame()

# Take the row labels from the reshaped table itself, so each label is
# guaranteed to belong to its own row regardless of how `flights` is ordered.
rownames(flights.redux) <- flights.redux$airlines
flights.redux <- as.matrix(
  flights.redux[, setdiff(names(flights.redux), "airlines"), drop = FALSE]
)
flights.redux

##         Los.Angeles Phoenix San.Diego San.Francisco Seattle
## ALASKA       0.1109  0.0515    0.0862        0.1686  0.1421
## AM WEST      0.1443  0.0790    0.1451        0.2873  0.2328

# Grouped bar chart of the delayed-flight proportion per destination, one bar
# per carrier. A single colour vector is shared by the bars and the legend so
# the two cannot drift apart.
carrier_cols <- c("orange", "red")
barplot(
  flights.redux,
  beside = TRUE,
  main = "Ratio of Delayed Flights",
  xlab = "DESTINATION",
  col = carrier_cols
)
legend("topleft", rownames(flights.redux), pch = 15, bty = "n", col = carrier_cols)

# Table and bar plot summarizing the ratio of delayed flights by carrier for all cities combined

# Overall delayed-flight proportion per carrier, pooling all destinations,
# stored as a one-column matrix with the carriers as row names.
combined_cities <- flights %>%
  group_by(airlines) %>%
  summarise(num_delayed = sum(delayed) / sum(total), .groups = "drop") %>%
  as.data.frame()

# Label each row from the summary's own airline column rather than from the
# order of `flights`, so labels and values always stay paired.
rownames(combined_cities) <- combined_cities$airlines
combined_cities <- as.matrix(combined_cities["num_delayed"])
combined_cities

##         num_delayed
## ALASKA    0.1327152
## AM WEST   0.1089273

# Bar chart of the pooled delayed-flight proportion, one bar per carrier.
barplot(
  height = combined_cities,
  beside = TRUE,
  col = c("orange", "yellow"),
  ylim = c(0, 0.22),
  main = "Ratio of Delayed Flights",
  xlab = "Cities Together"
)
# Legend keyed by the matrix row names, using the same colours as the bars.
legend(
  x = "topleft",
  legend = rownames(combined_cities),
  col = c("orange", "yellow"),
  pch = 15,
  bty = "n"
)

# Using the function `prop.test` to examine, city by city, whether the difference in the delay ratios above is statistically significant at the 5% level.

# For every destination, run a two-sample test of equal delay proportions
# (AM WEST vs. ALASKA) and print the result under a separator and a heading.
for (i in unique(flights$CITY)) {
  # Rows for the current destination only.
  flights_db <- flights %>% filter(CITY == i)

  # Delayed counts and flight totals per carrier. The variable names are kept
  # as they are because prop.test() echoes them in the "data:" line it prints.
  total_AM <- with(flights_db, sum(total[airlines == "AM WEST"]))
  total_Alaska <- with(flights_db, sum(total[airlines == "ALASKA"]))
  AM_delayed <- with(flights_db, sum(delayed[airlines == "AM WEST"]))
  Alaska_delayed <- with(flights_db, sum(delayed[airlines == "ALASKA"]))

  print("--------------------------------------------------------------------")
  cat("TEST FOR:", i)
  print(prop.test(c(AM_delayed, Alaska_delayed), c(total_AM, total_Alaska)))
}

## [1] "--------------------------------------------------------------------"
## TEST FOR: Los.Angeles
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 2.954, df = 1, p-value = 0.08566
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.003687491  0.070395480
## sample estimates:
##    prop 1    prop 2 
## 0.1442663 0.1109123 
## 
## [1] "--------------------------------------------------------------------"
## TEST FOR: Phoenix
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 1.9792, df = 1, p-value = 0.1595
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.004071878  0.059012400
## sample estimates:
##     prop 1     prop 2 
## 0.07897241 0.05150215 
## 
## [1] "--------------------------------------------------------------------"
## TEST FOR: San.Diego
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 4.3218, df = 1, p-value = 0.03763
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.006949539 0.110815239
## sample estimates:
##    prop 1    prop 2 
## 0.1450893 0.0862069 
## 
## [1] "--------------------------------------------------------------------"
## TEST FOR: San.Francisco
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 20.535, df = 1, p-value = 5.855e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.06537096 0.17204920
## sample estimates:
##    prop 1    prop 2 
## 0.2873051 0.1685950 
## 
## [1] "--------------------------------------------------------------------"
## TEST FOR: Seattle
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(AM_delayed, Alaska_delayed) out of c(total_AM, total_Alaska)
## X-squared = 14.207, df = 1, p-value = 0.0001637
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.03529326 0.14610583
## sample estimates:
##    prop 1    prop 2 
## 0.2328244 0.1421249

A significance level of 0.05 means accepting a 5% risk of concluding that a difference exists when there is no actual difference. Because their p-values are below 0.05, we reject the null hypothesis of equal delay proportions for San Diego (p = 0.038), San Francisco (p < 0.001) and Seattle (p < 0.001). We fail to reject the null hypothesis for Los Angeles (p = 0.086) and Phoenix (p = 0.160).

# Repeating the `prop.test` comparison, this time with all cities combined
# Pool delayed counts and flight totals per carrier across all destinations.
db2 <- summarise(
  group_by(flights, airlines),
  delayed = sum(delayed),
  total = sum(total),
  .groups = "drop"
)

# Per-carrier inputs for the test. The names are kept unchanged because
# prop.test() echoes them in the "data:" line of its printed output.
totalAM <- with(db2, sum(total[airlines == "AM WEST"]))
totalAlaska <- with(db2, sum(total[airlines == "ALASKA"]))
Am_delay <- with(db2, sum(delayed[airlines == "AM WEST"]))
Alaska_delay <- with(db2, sum(delayed[airlines == "ALASKA"]))

# Two-sample test of equal overall delay proportions (AM WEST vs. ALASKA).
print(prop.test(c(Am_delay, Alaska_delay), c(totalAM, totalAlaska)))

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(Am_delay, Alaska_delay) out of c(totalAM, totalAlaska)
## X-squared = 13.343, df = 1, p-value = 0.0002594
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.03697936 -0.01059643
## sample estimates:
##    prop 1    prop 2 
## 0.1089273 0.1327152

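# At the 5% level of significance (p-value = 0.0002594), we reject the null hypothesis that the difference in delay proportions between the two carriers is zero. Note that the direction reverses once the cities are combined: AM WEST has the lower overall delay proportion (0.109 vs. 0.133) even though it has the higher delay proportion in every individual city.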

Data607_HW5

Dominika Markowska-Desvallons

9/26/2020

Homework 5