Transform and tidy the airline data
Perform an analysis of flight delays
library(RMySQL)
## Loading required package: DBI
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(ggplot2)
library(pander)
library(stringr)
library(htmlTable)
query <- 'select * from flights'
con <- dbConnect(MySQL(), user = 'guest_user', password = 'datascience', dbname = 'airports', host = 'localhost')
untidy <- dbGetQuery(con, query)
pander(head(untidy))
| airline | status | los_angeles | phoenix | san_diego | san_francisco | seattle |
|---|---|---|---|---|---|---|
| Alaska | on time | 497 | 221 | 212 | 503 | 1841 |
| delayed | 62 | 12 | 20 | 102 | 305 | |
| AM West | on time | 694 | 4840 | 383 | 320 | 201 |
| delayed | 117 | 415 | 65 | 129 | 61 |
rows <- nrow(untidy)
untidy$status <- untidy$status %>% lapply(function (x) str_replace_all(x,' ', '_'))
#We want the statuses to make better column names, so they can be referenced with the "$" operator or used in a relational DB
for (i in seq (2,rows,2)){
untidy[i,1] <- untidy[i-1,1]
}
#The airline only appears as a header. All the null columns beneath are assumed to be the same airline. This is common when data is displayed, but not good for storing data.
We want the dataframe to conform to the tidy format, so the airport and count become variables, while locations become rows. We also split the status column into separate variables, as the count of on time and delayed represent different quantities. Finally, we want to create variables for total flights and ratio delayed for further analysis
tidier <- untidy %>% gather( 'Location', 'count', 3:7) %>% spread('status', 'count')
tidier$Location <- tidier$Location %>% lapply( function (x) str_to_title(str_replace_all(x,'_', ' '))) %>% unlist()
#this no longer needs to be one word and can be propcased
tidy <- tidier %>% mutate(total_flights = delayed + on_time) %>% mutate (ratio_delayed = delayed/total_flights)
pander(tidy)
| airline | Location | on_time | delayed | total_flights | ratio_delayed |
|---|---|---|---|---|---|
| Alaska | Los Angeles | 497 | 62 | 559 | 0.1109 |
| Alaska | Phoenix | 221 | 12 | 233 | 0.0515 |
| Alaska | San Diego | 212 | 20 | 232 | 0.08621 |
| Alaska | San Francisco | 503 | 102 | 605 | 0.1686 |
| Alaska | Seattle | 1841 | 305 | 2146 | 0.1421 |
| AM West | Los Angeles | 694 | 117 | 811 | 0.1443 |
| AM West | Phoenix | 4840 | 415 | 5255 | 0.07897 |
| AM West | San Diego | 383 | 65 | 448 | 0.1451 |
| AM West | San Francisco | 320 | 129 | 449 | 0.2873 |
| AM West | Seattle | 201 | 61 | 262 | 0.2328 |
by_airline <- tidy %>% group_by(airline) %>%
summarize(total_delayed = sum(delayed), total_flights = sum(total_flights), ratio = sum(delayed)/sum(total_flights))
#Compare the overal ratios
pander(data.frame(by_airline))
| airline | total_delayed | total_flights | ratio |
|---|---|---|---|
| Alaska | 501 | 3775 | 0.1327 |
| AM West | 787 | 7225 | 0.1089 |
#pander did not work with the tibble and in my opinion looks better than htmlTable
ggplot(tidy) + geom_bar(mapping = aes(x = Location, y = ratio_delayed, fill = airline), stat = "identity") +
labs( title = "Delay Ratio by Location", y = "Proportion Delayed")
#compare the ratios by airport
ggplot(tidy) + geom_bar(mapping = aes(x = Location, y = ratio_delayed, fill = airline), stat = "identity",
position ="fill") + labs( title = "Relative Delay Ratios", y = "Proportion Delayed")
#the same chart as above, but with the ratio_delayed normalized. Can be used to see if there is a difference in delay
We want to prove there is a statistically signficant difference in delay activity between airlines
t.test(tidy[tidy$airline == 'AM West', "ratio_delayed"], tidy[tidy$airline == 'Alaska', "ratio_delayed"], paired = TRUE)
##
## Paired t-test
##
## data: tidy[tidy$airline == "AM West", "ratio_delayed"] and tidy[tidy$airline == "Alaska", "ratio_delayed"]
## t = 3.8037, df = 4, p-value = 0.01904
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01777701 0.11386950
## sample estimates:
## mean of the differences
## 0.06582325
by_airline %>% select(2:3) %>% chisq.test()
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 10.456, df = 1, p-value = 0.001223
Both indicate a differance at a .05 level of significance. The chi squared test is has much more statistical power because it differentiates between many observations and few observations. For instance, the T test does not differentiate between 1 delay in 4 flights and 2500 delays in 10,000 flights. This can be seen through the much high p value with this data.