library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(magrittr) # for using the pipe operators for dplyr/tidyr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
# Load the raw, wide-format arrival counts. The first two columns have no
# header in the file, so read.csv() names them X and X.1; the remaining
# columns hold one count per destination city.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# before running this script elsewhere.
arrflights <- read.csv(
  file = "/home/jonboy1987/Desktop/CUNYSPS/IS607/Assignments/Week5/flight_arrivals.csv"
)
# Echo the table as read, before any cleaning
arrflights
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Give the two label columns (read in as X and X.1) descriptive names
names(arrflights)[c(1, 2)] <- c("airline", "arrival")
# Show the resulting column names
colnames(arrflights)
## [1] "airline" "arrival" "Los.Angeles" "Phoenix"
## [5] "San.Diego" "San.Francisco" "Seattle"
# Drop the spacer row between the two airlines: it carries no data, so all of
# its count columns are NA. The blank label cells on the "delayed" rows are
# empty strings rather than NA, so those rows are kept.
arrflights <- arrflights[complete.cases(arrflights), ]
# The airline name is only written on each airline's first row. Carry the last
# seen name down into the blank cells instead of hard-coding the names and row
# counts, so the script keeps working if airlines or rows are added.
arrflights$airline <- local({
  labels <- trimws(as.character(arrflights$airline))
  is_labelled <- !is.na(labels) & nzchar(labels)
  # cumsum() gives, for every row, the index of the most recent labelled row
  last_label <- cumsum(is_labelled)
  # rows that precede the first label have nothing to inherit -> NA
  last_label[last_label == 0] <- NA
  labels[is_labelled][last_label]
})
# Reshape from wide (one column per destination) to long (one row per
# airline / arrival status / destination), then sort by airline and
# destination. Every column except the two label columns is gathered, so the
# reshape does not depend on how many destination columns the file has.
tidy_arrflights <- arrflights %>%
  gather(destination, flights, -airline, -arrival) %>%
  arrange(airline, destination)
# replace the '.' in the destination with ' ' (read.csv turned the spaces in
# the city names into dots when it built the column names)
tidy_arrflights$destination <- gsub("\\.", " ", tidy_arrflights$destination)
# Convert every column except the flight counts to a factor. Base lapply() is
# used instead of the deprecated dplyr mutate_each()/funs() pair.
label_cols <- setdiff(names(tidy_arrflights), "flights")
tidy_arrflights[label_cols] <- lapply(tidy_arrflights[label_cols], factor)
tidy_arrflights
## airline arrival destination flights
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 ALASKA on time Phoenix 221
## 4 ALASKA delayed Phoenix 12
## 5 ALASKA on time San Diego 212
## 6 ALASKA delayed San Diego 20
## 7 ALASKA on time San Francisco 503
## 8 ALASKA delayed San Francisco 102
## 9 ALASKA on time Seattle 1841
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST delayed Los Angeles 117
## 13 AM WEST on time Phoenix 4840
## 14 AM WEST delayed Phoenix 415
## 15 AM WEST on time San Diego 383
## 16 AM WEST delayed San Diego 65
## 17 AM WEST on time San Francisco 320
## 18 AM WEST delayed San Francisco 129
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
# Inspect the structure of the tidied data frame (column types and row count)
str(tidy_arrflights)
## 'data.frame': 20 obs. of 4 variables:
## $ airline : Factor w/ 2 levels "ALASKA","AM WEST": 1 1 1 1 1 1 1 1 1 1 ...
## $ arrival : Factor w/ 2 levels "delayed","on time": 2 1 2 1 2 1 2 1 2 1 ...
## $ destination: Factor w/ 5 levels "Los Angeles",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ flights : int 497 62 221 12 212 20 503 102 1841 305 ...
# How is log(flights) distributed across destinations?
# Shrink the axis annotation so all destination names fit on the x axis.
par(cex.axis = 0.75)
# destination is a factor, so plot() draws one box plot per level
plot(
  x = tidy_arrflights$destination,
  y = log(tidy_arrflights$flights),
  main = "destination vs number of flights",
  xlab = "destination",
  ylab = "log(flights)"
)
# Same view, grouped by airline instead of destination
plot(
  x = tidy_arrflights$airline,
  y = log(tidy_arrflights$flights),
  main = "airline vs number of flights",
  xlab = "airline",
  ylab = "log(flights)"
)