Data 607 - Week5HW

0.1 Environment Setup

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0

## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(RCurl)

## Loading required package: bitops

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

0.2 Load CSV

csv <- getURL("https://raw.githubusercontent.com/baroncurtin2/data607/master/week5/week5.csv")
data <- fread(csv, header = TRUE, sep = ",", fill = TRUE, stringsAsFactors = FALSE) %>% 
    # omit row with NAs
na.omit %>% tbl_df %>% print

## # A tibble: 4 x 7
##   V1      V2     `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>          <int>   <int>       <int>           <int>   <int>
## 1 ALASKA  on ti~           497     221         212             503    1841
## 2 ""      delay~            62      12          20             102     305
## 3 AM WEST on ti~           694    4840         383             320     201
## 4 ""      delay~           117     415          65             129      61

0.3 Tidy Data

tidy <- data %>% # lets rename the V1 and V2 columns to something sensible
rename(carrier = V1, status = V2)

# replicate ALASKA and AM WEST for the blank rows in carrier
tidy[2, 1] <- "ALASKA"
tidy[4, 1] <- "AM WEST"

# apply tidyr principles of every variable being a column and every
# observation a row
tidy <- tidy %>% # gather the city columns into one column called destination, values are the
# flight counts
gather(destination, flight_count, "Los Angeles":Seattle) %>% # spread flight status into two columns
spread(status, flight_count) %>% # rename 'on time' to ontime
rename(ontime = `on time`) %>% print

## # A tibble: 10 x 4
##    carrier destination   delayed ontime
##    <chr>   <chr>           <int>  <int>
##  1 ALASKA  Los Angeles        62    497
##  2 ALASKA  Phoenix            12    221
##  3 ALASKA  San Diego          20    212
##  4 ALASKA  San Francisco     102    503
##  5 ALASKA  Seattle           305   1841
##  6 AM WEST Los Angeles       117    694
##  7 AM WEST Phoenix           415   4840
##  8 AM WEST San Diego          65    383
##  9 AM WEST San Francisco     129    320
## 10 AM WEST Seattle            61    201

0.4 Data Analysis

tidy <- tidy %>% # add a column for total flights and calculate delayed ratio
mutate(total_flights = delayed + ontime, delayed_ratio = delayed/total_flights) %>% 
    # arrange table
arrange(carrier, destination) %>% print

## # A tibble: 10 x 6
##    carrier destination   delayed ontime total_flights delayed_ratio
##    <chr>   <chr>           <int>  <int>         <int>         <dbl>
##  1 ALASKA  Los Angeles        62    497           559        0.111 
##  2 ALASKA  Phoenix            12    221           233        0.0515
##  3 ALASKA  San Diego          20    212           232        0.0862
##  4 ALASKA  San Francisco     102    503           605        0.169 
##  5 ALASKA  Seattle           305   1841          2146        0.142 
##  6 AM WEST Los Angeles       117    694           811        0.144 
##  7 AM WEST Phoenix           415   4840          5255        0.0790
##  8 AM WEST San Diego          65    383           448        0.145 
##  9 AM WEST San Francisco     129    320           449        0.287 
## 10 AM WEST Seattle            61    201           262        0.233

0.5 Data Visualization

ggplot(tidy, aes(x = destination, y = delayed_ratio, group = carrier, color = destination, 
    shape = carrier)) + geom_point(aes(shape = carrier)) + geom_line() + labs(x = "City", 
    y = "Delayed %")

0.6 Conclusion

From the visual, we can see that AM West is the poorer performer when it comes to delays, however since the lines are very similar in shape, it is likely that these destinations are also partly to blame for the delays. ALASKA also performed better at every city compared to AM WEST.