READ IN DATA

file = "/Users/euniceok/PycharmProjects/cuny/spring2019/Week5/data/airline_delays.csv"
df = read.csv(file, na.strings=c(""," "))
df
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

TIDY and TRANSFORM DATA

df <- df %>% 
  # convert wide dataset to long on all cols except X and X.1
  gather(key, value, -X, -X.1, na.rm=TRUE) %>% 
  # rename all cols to be intuitive
  rename(city = key, counts = value, airline = X, status = X.1) %>% 
  # fill out missing airline names
  fill(airline) 
head(df)
##   airline  status        city counts
## 1  ALASKA on time Los.Angeles    497
## 2  ALASKA delayed Los.Angeles     62
## 4 AM WEST on time Los.Angeles    694
## 5 AM WEST delayed Los.Angeles    117
## 6  ALASKA on time     Phoenix    221
## 7  ALASKA delayed     Phoenix     12
# another version of tidy df for easier analysis
tmp <- df %>% 
  spread(status, counts) %>% 
  rename(ontime = 'on time') %>% 
  mutate(total = delayed + ontime)
tmp
##    airline          city delayed ontime total
## 1   ALASKA   Los.Angeles      62    497   559
## 2   ALASKA       Phoenix      12    221   233
## 3   ALASKA     San.Diego      20    212   232
## 4   ALASKA San.Francisco     102    503   605
## 5   ALASKA       Seattle     305   1841  2146
## 6  AM WEST   Los.Angeles     117    694   811
## 7  AM WEST       Phoenix     415   4840  5255
## 8  AM WEST     San.Diego      65    383   448
## 9  AM WEST San.Francisco     129    320   449
## 10 AM WEST       Seattle      61    201   262

ANALYZE DATA

  1. calculate delay and ontime rates for each airline
# what is the relative rate of delayed flights for each airline? 
airline_delay <- tmp %>%
  group_by(airline) %>%
  summarise(delays = sum(delayed),totals = sum(total)) %>%
  mutate(del_rate = delays / totals)

# what is the on time rate for flights for each airline? 
airline_ontime <- tmp %>%
  group_by(airline) %>%
  summarise(ontime = sum(ontime),totals = sum(total)) %>%
  mutate(ontime_rate = ontime / totals)

airline_tbl <- inner_join(airline_delay, airline_ontime, by="airline")
airline_tbl %>% select(airline,del_rate,ontime_rate)
## # A tibble: 2 x 3
##   airline del_rate ontime_rate
##   <fct>      <dbl>       <dbl>
## 1 ALASKA     0.133       0.867
## 2 AM WEST    0.109       0.891
airline_tbl
## # A tibble: 2 x 7
##   airline delays totals.x del_rate ontime totals.y ontime_rate
##   <fct>    <int>    <int>    <dbl>  <int>    <int>       <dbl>
## 1 ALASKA     501     3775    0.133   3274     3775       0.867
## 2 AM WEST    787     7225    0.109   6438     7225       0.891
atplot <- airline_tbl %>% 
    select(airline,delays, ontime) %>%
    gather(delays, ontime, -airline) %>%
    rename(status = delays, flightcts = ontime)  
atplot
## # A tibble: 4 x 3
##   airline status flightcts
##   <fct>   <chr>      <int>
## 1 ALASKA  delays       501
## 2 AM WEST delays       787
## 3 ALASKA  ontime      3274
## 4 AM WEST ontime      6438
  1. plot delay and ontime flight counts for each airline
g <- ggplot(atplot, aes(airline, flightcts))
g + geom_bar(aes(fill=status), width = 0.5, stat="identity") + 
  theme(axis.text.x = element_text(angle=65, vjust=0.6)) + theme_tufte()

  1. calculate delay and ontime rates for each airline and city

  2. plot rates for each airline and city

# ALASKA
alctplot <- 
  aircttbl %>% 
  filter(airline == "ALASKA") %>% 
  ungroup() %>%
  select(city, del_rate, ontime_rate) %>%
  gather(del_rate, ontime_rate, -city) %>%
  rename(status = del_rate, flightcts = ontime_rate) 

# AM WEST
amctplot <- 
  aircttbl %>% 
  filter(airline == "AM WEST") %>% 
  ungroup() %>%
  select(city, del_rate, ontime_rate) %>%
  gather(del_rate, ontime_rate, -city) %>%
  rename(status = del_rate, flightcts = ontime_rate) 

ALASKA PLOT

ggplot(alctplot,aes(x=city,y=flightcts,fill=factor(status)))+
  geom_bar(stat="identity",position="dodge")+
#  scale_fill_discrete(name="status",
#                      breaks=c(1, 2),
#                      labels=c("delay", "ontime"))+
  xlab("airline")+ylab("rates") + theme_tufte()

AM WEST PLOT

ggplot(amctplot,aes(x=city,y=flightcts,fill=factor(status)))+
  geom_bar(stat="identity",position="dodge")+
#  scale_fill_discrete(name="status",
#                      breaks=c(1, 2),
#                      labels=c("delay", "ontime"))+
  xlab("airline")+ylab("rates") + theme_tufte()

delctplot <- aircttbl %>%
  select(airline, city, del_rate)
delctplot
## # A tibble: 10 x 3
## # Groups:   airline [2]
##    airline city          del_rate
##    <fct>   <chr>            <dbl>
##  1 ALASKA  Los.Angeles     0.111 
##  2 ALASKA  Phoenix         0.0515
##  3 ALASKA  San.Diego       0.0862
##  4 ALASKA  San.Francisco   0.169 
##  5 ALASKA  Seattle         0.142 
##  6 AM WEST Los.Angeles     0.144 
##  7 AM WEST Phoenix         0.0790
##  8 AM WEST San.Diego       0.145 
##  9 AM WEST San.Francisco   0.287 
## 10 AM WEST Seattle         0.233
ggplot(delctplot,aes(x=city,y=del_rate,fill=factor(airline)))+
  geom_bar(stat="identity",position="dodge")+
  #scale_fill_discrete(name="status",
  #                    breaks=c(1, 2),
  #                    labels=c("alaska", "am west"))+
  xlab("city")+ylab("rates") + theme_tufte()

FINDINGS