Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
library(knitr)
library(gridExtra)
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
if(!require(RPostgreSQL)){
install.packages("RPostgreSQL")
library(RPostgreSQL)
}drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "kailukowiak",
host = "localhost", port = 5432,
user = "kailukowiak")Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
df <- dbGetQuery(con, "SELECT * FROM alaskaflights")
kable(df)| id | airline | flight_status | losangeles | phoenix | sandiego | sanfran | seattle |
|---|---|---|---|---|---|---|---|
| 1 | ALASKA | Ontime | 497 | 221 | 212 | 503 | 1841 |
| 2 | NA | Delayed | 62 | 12 | 20 | 102 | 305 |
| 3 | NA | NA | NA | NA | NA | NA | NA |
| 4 | AMWEST | Ontime | 694 | 4840 | 383 | 320 | 201 |
| 5 | NA | Delayed | 117 | 415 | 65 | 129 | 61 |
df <- df %>% select(-id)df <- df %>% fill(airline) %>% drop_na()
# The order is important because drop_na will detlet all
# rows with any na'sdf2 <- gather(df, Airport, Flights, (3:7))
kable(df2)| airline | flight_status | Airport | Flights |
|---|---|---|---|
| ALASKA | Ontime | losangeles | 497 |
| ALASKA | Delayed | losangeles | 62 |
| AMWEST | Ontime | losangeles | 694 |
| AMWEST | Delayed | losangeles | 117 |
| ALASKA | Ontime | phoenix | 221 |
| ALASKA | Delayed | phoenix | 12 |
| AMWEST | Ontime | phoenix | 4840 |
| AMWEST | Delayed | phoenix | 415 |
| ALASKA | Ontime | sandiego | 212 |
| ALASKA | Delayed | sandiego | 20 |
| AMWEST | Ontime | sandiego | 383 |
| AMWEST | Delayed | sandiego | 65 |
| ALASKA | Ontime | sanfran | 503 |
| ALASKA | Delayed | sanfran | 102 |
| AMWEST | Ontime | sanfran | 320 |
| AMWEST | Delayed | sanfran | 129 |
| ALASKA | Ontime | seattle | 1841 |
| ALASKA | Delayed | seattle | 305 |
| AMWEST | Ontime | seattle | 201 |
| AMWEST | Delayed | seattle | 61 |
Perform analysis to compare the arrival delays for the two airlines.
df3 <- df2 %>% spread(flight_status, Flights)
kable(df3)| airline | Airport | Delayed | Ontime |
|---|---|---|---|
| ALASKA | losangeles | 62 | 497 |
| ALASKA | phoenix | 12 | 221 |
| ALASKA | sandiego | 20 | 212 |
| ALASKA | sanfran | 102 | 503 |
| ALASKA | seattle | 305 | 1841 |
| AMWEST | losangeles | 117 | 694 |
| AMWEST | phoenix | 415 | 4840 |
| AMWEST | sandiego | 65 | 383 |
| AMWEST | sanfran | 129 | 320 |
| AMWEST | seattle | 61 | 201 |
We now can more imaginably work with this data set. Next we must create a new column with the percentage for each row.
df3 <- mutate(df3, Avg_On_Time = Ontime/(Ontime + Delayed))
kable(arrange(df3, Avg_On_Time))| airline | Airport | Delayed | Ontime | Avg_On_Time |
|---|---|---|---|---|
| AMWEST | sanfran | 129 | 320 | 0.7126949 |
| AMWEST | seattle | 61 | 201 | 0.7671756 |
| ALASKA | sanfran | 102 | 503 | 0.8314050 |
| AMWEST | sandiego | 65 | 383 | 0.8549107 |
| AMWEST | losangeles | 117 | 694 | 0.8557337 |
| ALASKA | seattle | 305 | 1841 | 0.8578751 |
| ALASKA | losangeles | 62 | 497 | 0.8890877 |
| ALASKA | sandiego | 20 | 212 | 0.9137931 |
| AMWEST | phoenix | 415 | 4840 | 0.9210276 |
| ALASKA | phoenix | 12 | 221 | 0.9484979 |
AirlinAVG <- group_by(df3, airline) %>% summarise(AirlineSuccess = mean(Avg_On_Time))
kable(AirlinAVG)| airline | AirlineSuccess |
|---|---|
| ALASKA | 0.8881317 |
| AMWEST | 0.8223085 |
ggplot(data = AirlinAVG, aes(x = airline, y = AirlineSuccess)) +
geom_bar(stat = 'identity', width = 0.5) +
geom_text(aes(label = format(AirlineSuccess, digits = 2)), vjust = -0.25) +
ylim(0:1) +
theme_economist()Alaska is more on ontime.
Which airport is more on time?
AirportAVG <- group_by(df3, Airport) %>% summarise(AirportSuccess = mean(Avg_On_Time))
kable(AirportAVG)| Airport | AirportSuccess |
|---|---|
| losangeles | 0.8724107 |
| phoenix | 0.9347627 |
| sandiego | 0.8843519 |
| sanfran | 0.7720499 |
| seattle | 0.8125253 |
ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) +
geom_bar(stat = 'identity', width = 0.5) +
geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) +
ylim(0:1) +
theme_economist()Phoenix seems to be best.
Do the different airlines fair better or worse at different airports?
dfpaired <- df3 %>% select(-c(Delayed, Ontime)) %>% spread(Airport, Avg_On_Time)
ggplot(data = df3, mapping = aes(x = Airport, y = Avg_On_Time, fill = airline)) +
geom_bar(stat = 'identity', position = 'dodge', width = 0.75,) +
geom_text(aes(label = format(Avg_On_Time, digits = 2)), vjust = -0.3,
position = position_dodge(width=0.75)) +
ylim(0:1) +
theme_economist() + scale_fill_economist()This is cool but it is hard to tell if there are differences since AM West does worse all the time. We need a way to control for this:
Using the mean:
kable(AirlinAVG)| airline | AirlineSuccess |
|---|---|
| ALASKA | 0.8881317 |
| AMWEST | 0.8223085 |
alskAvg <- as.numeric(AirlinAVG[1,2])
amAvg <- as.numeric(AirlinAVG[2,2])
# We now have the averages.
dfCon1 <- df3 %>%
mutate(OnTimeControl1 = ifelse(airline == "ALASKA", yes = Avg_On_Time / alskAvg,
no = Avg_On_Time / amAvg))
# Cool did not expect it to vectorize Avg_On_Time and not alskAvg. R is the best.
kable(dfCon1)| airline | Airport | Delayed | Ontime | Avg_On_Time | OnTimeControl1 |
|---|---|---|---|---|---|
| ALASKA | losangeles | 62 | 497 | 0.8890877 | 1.0010763 |
| ALASKA | phoenix | 12 | 221 | 0.9484979 | 1.0679698 |
| ALASKA | sandiego | 20 | 212 | 0.9137931 | 1.0288936 |
| ALASKA | sanfran | 102 | 503 | 0.8314050 | 0.9361280 |
| ALASKA | seattle | 305 | 1841 | 0.8578751 | 0.9659323 |
| AMWEST | losangeles | 117 | 694 | 0.8557337 | 1.0406480 |
| AMWEST | phoenix | 415 | 4840 | 0.9210276 | 1.1200512 |
| AMWEST | sandiego | 65 | 383 | 0.8549107 | 1.0396472 |
| AMWEST | sanfran | 129 | 320 | 0.7126949 | 0.8667001 |
| AMWEST | seattle | 61 | 201 | 0.7671756 | 0.9329535 |
Now to graph:
ggplot(data = dfCon1, mapping = aes(x = Airport, y = OnTimeControl1, fill = airline)) +
geom_bar(stat = 'identity', position = 'dodge', width = 0.75) +
geom_text(aes(label = format(OnTimeControl1, digits = 2)), vjust = -0.3,
position = position_dodge(width=0.75)) +
theme_economist() + scale_fill_economist() # For some reason ylim does not work wellWe can see that we do have some success and that when we control, AM West does do better some airport than we might expect. This is interesting because it means different conditions at the airports effect the carriers differently.
df4 <- df3 %>%
select(-c(Delayed, Ontime)) %>%
spread(airline, Avg_On_Time)
kable(df4)| Airport | ALASKA | AMWEST |
|---|---|---|
| losangeles | 0.8890877 | 0.8557337 |
| phoenix | 0.9484979 | 0.9210276 |
| sandiego | 0.9137931 | 0.8549107 |
| sanfran | 0.8314050 | 0.7126949 |
| seattle | 0.8578751 | 0.7671756 |
Now we controlled for how bad the airlines are, but we are being unfair to the airports because we don’t know which airport gets more of the better trafic. Maybe the airports are more similar and only bad airline trafic makes them look worse.
portControl <- spread(df2, airline, Flights)
portControl1 <- group_by(portControl, Airport) %>% summarise(alskNum = sum(ALASKA))
portControl2 <- group_by(portControl, Airport) %>% summarise(amNum = sum(AMWEST))
portDF <- left_join(portControl1, portControl2) %>% mutate( percAlsk = alskNum/(alskNum + amNum))## Joining, by = "Airport"
kable(portDF)| Airport | alskNum | amNum | percAlsk |
|---|---|---|---|
| losangeles | 559 | 811 | 0.4080292 |
| phoenix | 233 | 5255 | 0.0424563 |
| sandiego | 232 | 448 | 0.3411765 |
| sanfran | 605 | 449 | 0.5740038 |
| seattle | 2146 | 262 | 0.8911960 |
To graph this compared to the initial airport graph:
pltA <- ggplot(data = portDF, mapping = aes(x = Airport, y = percAlsk)) +
geom_bar(stat = 'identity', position = 'dodge', width = 0.75) +
geom_text(aes(label = format(percAlsk, digits = 2)), vjust = -0.3,
position = position_dodge(width=0.75)) +
ylim(0:1)+
theme_economist() + scale_fill_economist() # For some reason ylim does not work well
pltB <- ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) +
geom_bar(stat = 'identity', width = 0.5) +
geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) +
ylim(0:1) +
theme_economist()
grid.arrange(pltA, pltB)cor( portDF$percAlsk, AirportAVG$AirportSuccess)## [1] -0.8235408
Interesting, the higher percent of alaska flights, the lower the success rates. This goes against the data we ahve on the quality of AM West. It is probably explained by other flights from other cariers.
t.test(df4$ALASKA, df4$AMWEST)##
## Welch Two Sample t-test
##
## data: df4$ALASKA and df4$AMWEST
## t = 1.5642, df = 6.2788, p-value = 0.1666
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.03604854 0.16769505
## sample estimates:
## mean of x mean of y
## 0.8881317 0.8223085
Statistically we cannot say that There is difference. But we also ahve limited samples.
I won’t try and do the same with airports because I’d lose degrees of freedom PDQ.