Q1

Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

Loading packages:

library(knitr)
library(gridExtra)
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggthemes)
if(!require(RPostgreSQL)){
    install.packages("RPostgreSQL")
    library(RPostgreSQL)
}

Loading data from postgres:

drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "kailukowiak",
                 host = "localhost", port = 5432,
                 user = "kailukowiak")

Q2

Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

df <- dbGetQuery(con, "SELECT * FROM alaskaflights")
kable(df)

id	airline	flight_status	losangeles	phoenix	sandiego	sanfran	seattle
1	ALASKA	Ontime	497	221	212	503	1841
2	NA	Delayed	62	12	20	102	305
3	NA	NA	NA	NA	NA	NA	NA
4	AMWEST	Ontime	694	4840	383	320	201
5	NA	Delayed	117	415	65	129	61

Drop ID:

df <- df %>% select(-id)

Remove na row and repalce Na values:

df <- df %>% fill(airline) %>% drop_na()  
# The order is important because drop_na will detlet all 
# rows with any na's

Gather the Data

df2 <- gather(df, Airport, Flights, (3:7))
kable(df2)

airline	flight_status	Airport	Flights
ALASKA	Ontime	losangeles	497
ALASKA	Delayed	losangeles	62
AMWEST	Ontime	losangeles	694
AMWEST	Delayed	losangeles	117
ALASKA	Ontime	phoenix	221
ALASKA	Delayed	phoenix	12
AMWEST	Ontime	phoenix	4840
AMWEST	Delayed	phoenix	415
ALASKA	Ontime	sandiego	212
ALASKA	Delayed	sandiego	20
AMWEST	Ontime	sandiego	383
AMWEST	Delayed	sandiego	65
ALASKA	Ontime	sanfran	503
ALASKA	Delayed	sanfran	102
AMWEST	Ontime	sanfran	320
AMWEST	Delayed	sanfran	129
ALASKA	Ontime	seattle	1841
ALASKA	Delayed	seattle	305
AMWEST	Ontime	seattle	201
AMWEST	Delayed	seattle	61

Q3

Perform analysis to compare the arrival delays for the two airlines.

It is easier to reshape and compare elements by row:

df3 <- df2 %>% spread(flight_status, Flights)
kable(df3)

airline	Airport	Delayed	Ontime
ALASKA	losangeles	62	497
ALASKA	phoenix	12	221
ALASKA	sandiego	20	212
ALASKA	sanfran	102	503
ALASKA	seattle	305	1841
AMWEST	losangeles	117	694
AMWEST	phoenix	415	4840
AMWEST	sandiego	65	383
AMWEST	sanfran	129	320
AMWEST	seattle	61	201

We now can more imaginably work with this data set. Next we must create a new column with the percentage for each row.

df3 <- mutate(df3, Avg_On_Time = Ontime/(Ontime + Delayed))
kable(arrange(df3, Avg_On_Time))

airline	Airport	Delayed	Ontime	Avg_On_Time
AMWEST	sanfran	129	320	0.7126949
AMWEST	seattle	61	201	0.7671756
ALASKA	sanfran	102	503	0.8314050
AMWEST	sandiego	65	383	0.8549107
AMWEST	losangeles	117	694	0.8557337
ALASKA	seattle	305	1841	0.8578751
ALASKA	losangeles	62	497	0.8890877
ALASKA	sandiego	20	212	0.9137931
AMWEST	phoenix	415	4840	0.9210276
ALASKA	phoenix	12	221	0.9484979

AirlinAVG <- group_by(df3, airline) %>% summarise(AirlineSuccess = mean(Avg_On_Time)) 
kable(AirlinAVG)

airline	AirlineSuccess
ALASKA	0.8881317
AMWEST	0.8223085

ggplot(data = AirlinAVG, aes(x = airline, y = AirlineSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirlineSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()

Alaska is more on ontime.

Which airport is more on time?

AirportAVG <- group_by(df3, Airport) %>% summarise(AirportSuccess = mean(Avg_On_Time)) 
kable(AirportAVG)

Airport	AirportSuccess
losangeles	0.8724107
phoenix	0.9347627
sandiego	0.8843519
sanfran	0.7720499
seattle	0.8125253

ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()

Phoenix seems to be best.

Do the different airlines fair better or worse at different airports?

dfpaired <- df3 %>% select(-c(Delayed, Ontime)) %>% spread(Airport, Avg_On_Time)

ggplot(data = df3, mapping = aes(x = Airport, y = Avg_On_Time, fill = airline)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75,) +
  geom_text(aes(label = format(Avg_On_Time, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  ylim(0:1) +
  theme_economist() + scale_fill_economist()

This is cool but it is hard to tell if there are differences since AM West does worse all the time. We need a way to control for this:

Using the mean:

kable(AirlinAVG)

airline	AirlineSuccess
ALASKA	0.8881317
AMWEST	0.8223085

alskAvg <- as.numeric(AirlinAVG[1,2])
amAvg <- as.numeric(AirlinAVG[2,2])

# We now have the averages.
dfCon1 <- df3 %>% 
  mutate(OnTimeControl1 = ifelse(airline == "ALASKA", yes = Avg_On_Time / alskAvg, 
                                 no = Avg_On_Time / amAvg))  
# Cool did not expect it to vectorize Avg_On_Time and not alskAvg. R is the best. 
kable(dfCon1)

airline	Airport	Delayed	Ontime	Avg_On_Time	OnTimeControl1
ALASKA	losangeles	62	497	0.8890877	1.0010763
ALASKA	phoenix	12	221	0.9484979	1.0679698
ALASKA	sandiego	20	212	0.9137931	1.0288936
ALASKA	sanfran	102	503	0.8314050	0.9361280
ALASKA	seattle	305	1841	0.8578751	0.9659323
AMWEST	losangeles	117	694	0.8557337	1.0406480
AMWEST	phoenix	415	4840	0.9210276	1.1200512
AMWEST	sandiego	65	383	0.8549107	1.0396472
AMWEST	sanfran	129	320	0.7126949	0.8667001
AMWEST	seattle	61	201	0.7671756	0.9329535

Now to graph:

ggplot(data = dfCon1, mapping = aes(x = Airport, y = OnTimeControl1, fill = airline)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75) +
  geom_text(aes(label = format(OnTimeControl1, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  theme_economist() + scale_fill_economist() # For some reason ylim does not work well

We can see that we do have some success and that when we control, AM West does do better some airport than we might expect. This is interesting because it means different conditions at the airports effect the carriers differently.

df4 <- df3 %>% 
  select(-c(Delayed, Ontime)) %>% 
  spread(airline, Avg_On_Time)
kable(df4)

Airport	ALASKA	AMWEST
losangeles	0.8890877	0.8557337
phoenix	0.9484979	0.9210276
sandiego	0.9137931	0.8549107
sanfran	0.8314050	0.7126949
seattle	0.8578751	0.7671756

Now we controlled for how bad the airlines are, but we are being unfair to the airports because we don’t know which airport gets more of the better trafic. Maybe the airports are more similar and only bad airline trafic makes them look worse.

portControl <- spread(df2, airline, Flights)  
portControl1 <- group_by(portControl, Airport) %>% summarise(alskNum = sum(ALASKA)) 
portControl2 <- group_by(portControl, Airport) %>% summarise(amNum = sum(AMWEST)) 
portDF <- left_join(portControl1, portControl2) %>% mutate( percAlsk = alskNum/(alskNum + amNum))

## Joining, by = "Airport"

kable(portDF)

Airport	alskNum	amNum	percAlsk
losangeles	559	811	0.4080292
phoenix	233	5255	0.0424563
sandiego	232	448	0.3411765
sanfran	605	449	0.5740038
seattle	2146	262	0.8911960

To graph this compared to the initial airport graph:

pltA <- ggplot(data = portDF, mapping = aes(x = Airport, y = percAlsk)) +
  geom_bar(stat = 'identity', position = 'dodge',  width = 0.75) +
  geom_text(aes(label = format(percAlsk, digits = 2)), vjust = -0.3, 
            position = position_dodge(width=0.75)) +
  ylim(0:1)+
  theme_economist() + scale_fill_economist() # For some reason ylim does not work well


pltB <- ggplot(data = AirportAVG, aes(x = Airport, y = AirportSuccess)) + 
    geom_bar(stat = 'identity', width = 0.5) + 
    geom_text(aes(label = format(AirportSuccess, digits = 2)), vjust = -0.25) + 
    ylim(0:1) + 
    theme_economist()
grid.arrange(pltA, pltB)

Graphs are cool but ttests are even cooler.

cor( portDF$percAlsk, AirportAVG$AirportSuccess)

## [1] -0.8235408

Interesting, the higher percent of alaska flights, the lower the success rates. This goes against the data we ahve on the quality of AM West. It is probably explained by other flights from other cariers.

t.test(df4$ALASKA, df4$AMWEST)

## 
##  Welch Two Sample t-test
## 
## data:  df4$ALASKA and df4$AMWEST
## t = 1.5642, df = 6.2788, p-value = 0.1666
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.03604854  0.16769505
## sample estimates:
## mean of x mean of y 
## 0.8881317 0.8223085

Statistically we cannot say that There is difference. But we also ahve limited samples.

I won’t try and do the same with airports because I’d lose degrees of freedom PDQ.

DATA_607_Ass5

Kai Lukowiak

2017-09-29

Q1