library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.0
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
csv <- getURL("https://raw.githubusercontent.com/baroncurtin2/data607/master/week5/week5.csv")
data <- fread(csv, header = TRUE, sep = ",", fill = TRUE, stringsAsFactors = FALSE) %>%
# omit row with NAs
na.omit %>% tbl_df %>% print
## # A tibble: 4 x 7
## V1 V2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on ti~ 497 221 212 503 1841
## 2 "" delay~ 62 12 20 102 305
## 3 AM WEST on ti~ 694 4840 383 320 201
## 4 "" delay~ 117 415 65 129 61
tidy <- data %>% # lets rename the V1 and V2 columns to something sensible
rename(carrier = V1, status = V2)
# replicate ALASKA and AM WEST for the blank rows in carrier
tidy[2, 1] <- "ALASKA"
tidy[4, 1] <- "AM WEST"
# apply tidyr principles of every variable being a column and every
# observation a row
tidy <- tidy %>% # gather the city columns into one column called destination, values are the
# flight counts
gather(destination, flight_count, "Los Angeles":Seattle) %>% # spread flight status into two columns
spread(status, flight_count) %>% # rename 'on time' to ontime
rename(ontime = `on time`) %>% print
## # A tibble: 10 x 4
## carrier destination delayed ontime
## <chr> <chr> <int> <int>
## 1 ALASKA Los Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San Diego 20 212
## 4 ALASKA San Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San Diego 65 383
## 9 AM WEST San Francisco 129 320
## 10 AM WEST Seattle 61 201
tidy <- tidy %>% # add a column for total flights and calculate delayed ratio
mutate(total_flights = delayed + ontime, delayed_ratio = delayed/total_flights) %>%
# arrange table
arrange(carrier, destination) %>% print
## # A tibble: 10 x 6
## carrier destination delayed ontime total_flights delayed_ratio
## <chr> <chr> <int> <int> <int> <dbl>
## 1 ALASKA Los Angeles 62 497 559 0.111
## 2 ALASKA Phoenix 12 221 233 0.0515
## 3 ALASKA San Diego 20 212 232 0.0862
## 4 ALASKA San Francisco 102 503 605 0.169
## 5 ALASKA Seattle 305 1841 2146 0.142
## 6 AM WEST Los Angeles 117 694 811 0.144
## 7 AM WEST Phoenix 415 4840 5255 0.0790
## 8 AM WEST San Diego 65 383 448 0.145
## 9 AM WEST San Francisco 129 320 449 0.287
## 10 AM WEST Seattle 61 201 262 0.233
ggplot(tidy, aes(x = destination, y = delayed_ratio, group = carrier, color = destination,
shape = carrier)) + geom_point(aes(shape = carrier)) + geom_line() + labs(x = "City",
y = "Delayed %")
From the visual, we can see that AM West is the poorer performer when it comes to delays, however since the lines are very similar in shape, it is likely that these destinations are also partly to blame for the delays. ALASKA also performed better at every city compared to AM WEST.