Q1

Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

Loading packages:

library(knitr)
library(gridExtra)
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
if(!require(RPostgreSQL)){
    install.packages("RPostgreSQL")
    library(RPostgreSQL)
}

Loading data from postgres:

drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "kailukowiak",
                 host = "localhost", port = 5432,
                 user = "kailukowiak")

Q2

Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

df <- dbGetQuery(con, "SELECT * FROM alaskaflights")
kable(df)
id airline flight_status losangeles phoenix sandiego sanfran seattle
1 ALASKA Ontime 497 221 212 503 1841
2 NA Delayed 62 12 20 102 305
3 NA NA NA NA NA NA NA
4 AMWEST Ontime 694 4840 383 320 201
5 NA Delayed 117 415 65 129 61

Drop ID:

df <- df %>% select(-id)

Remove na row and repalce Na values:

df <- df %>% fill(airline) %>% drop_na()  
# The order is important because drop_na will detlet all 
# rows with any na's

Gather the Data

df2 <- gather(df, Airport, Flights, (3:7))
kable(df2)
airline flight_status Airport Flights
ALASKA Ontime losangeles 497
ALASKA Delayed losangeles 62
AMWEST Ontime losangeles 694
AMWEST Delayed losangeles 117
ALASKA Ontime phoenix 221
ALASKA Delayed phoenix 12
AMWEST Ontime phoenix 4840
AMWEST Delayed phoenix 415
ALASKA Ontime sandiego 212
ALASKA Delayed sandiego 20
AMWEST Ontime sandiego 383
AMWEST Delayed sandiego 65
ALASKA Ontime sanfran 503
ALASKA Delayed sanfran 102
AMWEST Ontime sanfran 320
AMWEST Delayed sanfran 129
ALASKA Ontime seattle 1841
ALASKA Delayed seattle 305
AMWEST Ontime seattle 201
AMWEST Delayed seattle 61

Q3

Perform analysis to compare the arrival delays for the two airlines.

It is easier to reshape and compare elements by row:

df3 <- df2 %>% spread(flight_status, Flights)
kable(df3)
airline Airport Delayed Ontime
ALASKA losangeles 62 497
ALASKA phoenix 12 221
ALASKA sandiego 20 212
ALASKA sanfran 102 503
ALASKA seattle 305 1841
AMWEST losangeles 117 694
AMWEST phoenix 415 4840
AMWEST sandiego 65 383
AMWEST sanfran 129 320
AMWEST seattle 61 201

We now can more imaginably work with this data set. Next we must create a new column with the percentage for each row.

df3 <- mutate(df3, Avg_On_Time = Ontime/(Ontime + Delayed))
kable(arrange(df3, Avg_On_Time))
airline Airport Delayed Ontime Avg_On_Time
AMWEST sanfran 129 320 0.7126949
AMWEST seattle 61 201 0.7671756
ALASKA sanfran 102 503 0.8314050
AMWEST sandiego 65 383 0.8549107
AMWEST losangeles 117 694 0.8557337
ALASKA seattle 305 1841 0.8578751
ALASKA losangeles 62 497 0.8890877
ALASKA sandiego 20 212 0.9137931
AMWEST phoenix 415 4840 0.9210276
ALASKA phoenix 12 221 0.9484979
AirlinAVG <- group_by(df3, airline) %>% summarise(AirlineSuccess = mean(Avg_On_Time)) 
kable(AirlinAVG)
airline AirlineSuccess
ALASKA 0.8881317
AMWEST 0.8223085
ggplot(data = AirlinAVG, aes(x = airline, y = AirlineSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirlineSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()

Alaska is more on ontime.

Which airport is more on time?

AirportAVG <- group_by(df3, Airport) %>% summarise(AirportSuccess = mean(Avg_On_Time)) 
kable(AirportAVG)
Airport AirportSuccess
losangeles 0.8724107
phoenix 0.9347627
sandiego 0.8843519
sanfran 0.7720499
seattle 0.8125253
ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()

Phoenix seems to be best.

Do the different airlines fair better or worse at different airports?

dfpaired <- df3 %>% select(-c(Delayed, Ontime)) %>% spread(Airport, Avg_On_Time)

ggplot(data = df3, mapping = aes(x = Airport, y = Avg_On_Time, fill = airline)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75,) +
  geom_text(aes(label = format(Avg_On_Time, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  ylim(0:1) +
  theme_economist() + scale_fill_economist()

This is cool but it is hard to tell if there are differences since AM West does worse all the time. We need a way to control for this:

Using the mean:

kable(AirlinAVG)
airline AirlineSuccess
ALASKA 0.8881317
AMWEST 0.8223085
alskAvg <- as.numeric(AirlinAVG[1,2])
amAvg <- as.numeric(AirlinAVG[2,2])

# We now have the averages.
dfCon1 <- df3 %>% 
  mutate(OnTimeControl1 = ifelse(airline == "ALASKA", yes = Avg_On_Time / alskAvg, 
                                 no = Avg_On_Time / amAvg))  
# Cool did not expect it to vectorize Avg_On_Time and not alskAvg. R is the best. 
kable(dfCon1)
airline Airport Delayed Ontime Avg_On_Time OnTimeControl1
ALASKA losangeles 62 497 0.8890877 1.0010763
ALASKA phoenix 12 221 0.9484979 1.0679698
ALASKA sandiego 20 212 0.9137931 1.0288936
ALASKA sanfran 102 503 0.8314050 0.9361280
ALASKA seattle 305 1841 0.8578751 0.9659323
AMWEST losangeles 117 694 0.8557337 1.0406480
AMWEST phoenix 415 4840 0.9210276 1.1200512
AMWEST sandiego 65 383 0.8549107 1.0396472
AMWEST sanfran 129 320 0.7126949 0.8667001
AMWEST seattle 61 201 0.7671756 0.9329535

Now to graph:

ggplot(data = dfCon1, mapping = aes(x = Airport, y = OnTimeControl1, fill = airline)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75) +
  geom_text(aes(label = format(OnTimeControl1, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  theme_economist() + scale_fill_economist() # For some reason ylim does not work well

We can see that we do have some success and that when we control, AM West does do better some airport than we might expect. This is interesting because it means different conditions at the airports effect the carriers differently.

df4 <- df3 %>% 
  select(-c(Delayed, Ontime)) %>% 
  spread(airline, Avg_On_Time)
kable(df4)
Airport ALASKA AMWEST
losangeles 0.8890877 0.8557337
phoenix 0.9484979 0.9210276
sandiego 0.9137931 0.8549107
sanfran 0.8314050 0.7126949
seattle 0.8578751 0.7671756

Now we controlled for how bad the airlines are, but we are being unfair to the airports because we don’t know which airport gets more of the better trafic. Maybe the airports are more similar and only bad airline trafic makes them look worse.

portControl <- spread(df2, airline, Flights)  
portControl1 <- group_by(portControl, Airport) %>% summarise(alskNum = sum(ALASKA)) 
portControl2 <- group_by(portControl, Airport) %>% summarise(amNum = sum(AMWEST)) 
portDF <- left_join(portControl1, portControl2) %>% mutate( percAlsk = alskNum/(alskNum + amNum))
## Joining, by = "Airport"
kable(portDF)
Airport alskNum amNum percAlsk
losangeles 559 811 0.4080292
phoenix 233 5255 0.0424563
sandiego 232 448 0.3411765
sanfran 605 449 0.5740038
seattle 2146 262 0.8911960

To graph this compared to the initial airport graph:

pltA <- ggplot(data = portDF, mapping = aes(x = Airport, y = percAlsk)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75) +
  geom_text(aes(label = format(percAlsk, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  ylim(0:1)+
  theme_economist() + scale_fill_economist() # For some reason ylim does not work well


pltB <- ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()
grid.arrange(pltA, pltB)

Graphs are cool but ttests are even cooler.

cor( portDF$percAlsk, AirportAVG$AirportSuccess)
## [1] -0.8235408

Interesting, the higher percent of alaska flights, the lower the success rates. This goes against the data we ahve on the quality of AM West. It is probably explained by other flights from other cariers.

t.test(df4$ALASKA, df4$AMWEST)
## 
##  Welch Two Sample t-test
## 
## data:  df4$ALASKA and df4$AMWEST
## t = 1.5642, df = 6.2788, p-value = 0.1666
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03604854  0.16769505
## sample estimates:
## mean of x mean of y 
## 0.8881317 0.8223085

Statistically we cannot say that There is difference. But we also ahve limited samples.

I won’t try and do the same with airports because I’d lose degrees of freedom PDQ.