Load Packages
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Add headers for Airline and status and row headers for “delayed”.
csv_file <- rbind(c("Airline", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle"),
c("ALASKA", "On Time", 497, 221, 212, 503, 1841),
c("ALASKA", "Delayed", 62, 12, 20, 102, 305),
c("AM WEST", "On Time", 694, 4840, 383, 320, 201),
c("AM WEST", "Delayed", 117, 415, 65, 129, 61))
csv_file
## [,1] [,2] [,3] [,4] [,5]
## [1,] "Airline" "Status" "Los Angeles" "Phoenix" "San Diego"
## [2,] "ALASKA" "On Time" "497" "221" "212"
## [3,] "ALASKA" "Delayed" "62" "12" "20"
## [4,] "AM WEST" "On Time" "694" "4840" "383"
## [5,] "AM WEST" "Delayed" "117" "415" "65"
## [,6] [,7]
## [1,] "San Francisco" "Seattle"
## [2,] "503" "1841"
## [3,] "102" "305"
## [4,] "320" "201"
## [5,] "129" "61"
write.table(csv_file, file = "Airline.csv", sep = ",", col.names=F, row.names=F)
Read the data from the file
csvdata <- read.csv(paste0("Airline.csv"), stringsAsFactors = F);
csvdata[,2] <- sapply(csvdata[,2], str_replace, " ", "_")
csvdata
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA On_Time 497 221 212 503 1841
## 2 ALASKA Delayed 62 12 20 102 305
## 3 AM WEST On_Time 694 4840 383 320 201
## 4 AM WEST Delayed 117 415 65 129 61
Use the gather function was used to move data from wide to tall format.
csv_file2 <- gather(csvdata, "City", "Count", 3:7)
csv_file2
## Airline Status City Count
## 1 ALASKA On_Time Los.Angeles 497
## 2 ALASKA Delayed Los.Angeles 62
## 3 AM WEST On_Time Los.Angeles 694
## 4 AM WEST Delayed Los.Angeles 117
## 5 ALASKA On_Time Phoenix 221
## 6 ALASKA Delayed Phoenix 12
## 7 AM WEST On_Time Phoenix 4840
## 8 AM WEST Delayed Phoenix 415
## 9 ALASKA On_Time San.Diego 212
## 10 ALASKA Delayed San.Diego 20
## 11 AM WEST On_Time San.Diego 383
## 12 AM WEST Delayed San.Diego 65
## 13 ALASKA On_Time San.Francisco 503
## 14 ALASKA Delayed San.Francisco 102
## 15 AM WEST On_Time San.Francisco 320
## 16 AM WEST Delayed San.Francisco 129
## 17 ALASKA On_Time Seattle 1841
## 18 ALASKA Delayed Seattle 305
## 19 AM WEST On_Time Seattle 201
## 20 AM WEST Delayed Seattle 61
Use the spread function to move Status into its own variable
csv_file2 <- spread(csv_file2, Status, Count)
csv_file2
## Airline City Delayed On_Time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
examine delays for these two airlines in general and by city
By Airline:
csv_file2 %>% group_by(Airline) %>%
summarise(
TotDelay=sum(Delayed),TotOn_Time=sum(On_Time),
PctDelayed=(TotDelay/(TotDelay+TotOn_Time)*100),
PctOnTime=(TotOn_Time/(TotDelay+TotOn_Time)*100)
)
## Warning: package 'bindrcpp' was built under R version 3.4.2
## # A tibble: 2 x 5
## Airline TotDelay TotOn_Time PctDelayed PctOnTime
## <chr> <int> <int> <dbl> <dbl>
## 1 ALASKA 501 3274 13.27152 86.72848
## 2 AM WEST 787 6438 10.89273 89.10727
Alaska has a greater percentage of delays than AM West.
By City:
csv_file2 %>% group_by(City) %>%
summarise(
TotDelay=sum(Delayed),TotOn_Time=sum(On_Time),
PctDelayed=(TotDelay/(TotDelay+TotOn_Time)*100),
PctOnTime=(TotOn_Time/(TotDelay+TotOn_Time)*100)
)
## # A tibble: 5 x 5
## City TotDelay TotOn_Time PctDelayed PctOnTime
## <chr> <int> <int> <dbl> <dbl>
## 1 Los.Angeles 179 1191 13.065693 86.93431
## 2 Phoenix 427 5061 7.780612 92.21939
## 3 San.Diego 85 595 12.500000 87.50000
## 4 San.Francisco 231 823 21.916509 78.08349
## 5 Seattle 366 2042 15.199336 84.80066
San Francisco has the greatest percentage of delays of the cities listed. Phoenix has the best on time.
By City and Airline:
csv_file2 %>% group_by(City,Airline) %>%
summarise(
TotDelay=sum(Delayed),TotOn_Time=sum(On_Time),
PctDelayed=(TotDelay/(TotDelay+TotOn_Time)*100),
PctOnTime=(TotOn_Time/(TotDelay+TotOn_Time)*100)
)
## # A tibble: 10 x 6
## # Groups: City [?]
## City Airline TotDelay TotOn_Time PctDelayed PctOnTime
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 Los.Angeles ALASKA 62 497 11.091234 88.90877
## 2 Los.Angeles AM WEST 117 694 14.426634 85.57337
## 3 Phoenix ALASKA 12 221 5.150215 94.84979
## 4 Phoenix AM WEST 415 4840 7.897241 92.10276
## 5 San.Diego ALASKA 20 212 8.620690 91.37931
## 6 San.Diego AM WEST 65 383 14.508929 85.49107
## 7 San.Francisco ALASKA 102 503 16.859504 83.14050
## 8 San.Francisco AM WEST 129 320 28.730512 71.26949
## 9 Seattle ALASKA 305 1841 14.212488 85.78751
## 10 Seattle AM WEST 61 201 23.282443 76.71756
In particular, AM West at San Francisco has a much higher percentage of delays than all airlines and cities. Taking Alaska airlines to Phoenix would have the best chance of getting there on time.