knitr::opts_chunk$set(echo = TRUE)
if("tidyr" %in% rownames(installed.packages()) == FALSE) {install.packages("tidyr")}
library(tidyr)
if("dplyr" %in% rownames(installed.packages()) == FALSE) {install.packages("dplyr")}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if("stringr" %in% rownames(installed.packages()) == FALSE) {install.packages("stringr")}
library(stringr)
if("zoo" %in% rownames(installed.packages()) == FALSE) {install.packages("zoo")}
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
if("ggplot2" %in% rownames(installed.packages()) == FALSE) {install.packages("ggplot2")}
library(ggplot2)
raw_arr_delay_data <- read.csv("arr_delay.csv", stringsAsFactors = FALSE, na.strings = c("", " "))
raw_arr_delay_data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA <NA> NA NA <NA>
## 4 AM WEST on time 694 4,840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Remove Row 3 as it is an empty row
arr_delay_data <- raw_arr_delay_data[-c(3),]
#Cleanup Column Names
names(arr_delay_data) <- c("Airline", "Status", "Los Angeles", "Phoenix", "San Diego", "San Francisco", "Seattle")
#Remove commas from numeric data (Phoenix and Seattle Columns) so that they can be recognised as integer. This is necessary to be able to summarise the data later on.
arr_delay_data$Phoenix <- as.integer(str_replace_all(arr_delay_data$Phoenix, ",",""))
arr_delay_data$Seattle <- as.integer(str_replace_all(arr_delay_data$Seattle, ",",""))
#Filling Missing data in Airline Column. Will use the na.locf from the zoo package to fill the N/A values in the Airlines column
arr_delay_data <- transform(arr_delay_data, Airline = na.locf(Airline))
arr_delay_data
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
The data is deemed to have four variables - Airline, Status(whether on time or delayed), City, Number (of flights). Airline is in Column 1, Status in Column 2 and City and number spread among the remaining columns. There is need to put City in its own column (the content to be the remaining column names) and the Number can be in the fourth column. For this, the gather function (of the tidy package) will be used to reshape the data.
tidy_arr_delay <- arr_delay_data %>%
gather(City, Number, Los.Angeles:Seattle)
tidy_arr_delay
## Airline Status City Number
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Filter the delayed flights only. The on time flights are not of concern for the rest of the analysis. In addition, since the status column will be the same value, that column is no longer needed.
delayed_data <- tidy_arr_delay %>%
filter(Status == "delayed") %>%
select(Airline, City, Number)
delayed_data
## Airline City Number
## 1 ALASKA Los.Angeles 62
## 2 AM WEST Los.Angeles 117
## 3 ALASKA Phoenix 12
## 4 AM WEST Phoenix 415
## 5 ALASKA San.Diego 20
## 6 AM WEST San.Diego 65
## 7 ALASKA San.Francisco 102
## 8 AM WEST San.Francisco 129
## 9 ALASKA Seattle 305
## 10 AM WEST Seattle 61
In order to summarise data by airline, the group_by function of the dplyr package will be used to group the data by airline.
delay_group <- group_by(delayed_data, Airline)
delay_group
## Source: local data frame [10 x 3]
## Groups: Airline [2]
##
## Airline City Number
## <chr> <chr> <int>
## 1 ALASKA Los.Angeles 62
## 2 AM WEST Los.Angeles 117
## 3 ALASKA Phoenix 12
## 4 AM WEST Phoenix 415
## 5 ALASKA San.Diego 20
## 6 AM WEST San.Diego 65
## 7 ALASKA San.Francisco 102
## 8 AM WEST San.Francisco 129
## 9 ALASKA Seattle 305
## 10 AM WEST Seattle 61
Summarise by adding up the number of flights for each airline. A bar plot is also generated from the summary data.
airline_summary <- summarise(delay_group, sum(Number))
airline_summary
## # A tibble: 2 × 2
## Airline `sum(Number)`
## <chr> <int>
## 1 ALASKA 501
## 2 AM WEST 787
barplot(airline_summary$`sum(Number)`, names.arg = airline_summary$Airline, xlab = "Airline", ylab = "Number of Delayed Flights")
The graph shows that AM West had the most delayed flights
Reverting to using the non-grouped data, the data is presented in graphical format by Airline on the x-axis grouped by city.
ggplot(data = delayed_data) +
geom_bar(
mapping = aes(x = Airline, y = Number, fill = City),
stat = "identity",
position = "dodge"
)
The graph shows that for Alaska airlines, the most delays were in Seattle while the least were in Phoenix. For AM West airlines the most delays were in Phoenix and the least in Seattle.
The data is presented in graphical format by City on the x-axis grouped by Airline
delayed_data
## Airline City Number
## 1 ALASKA Los.Angeles 62
## 2 AM WEST Los.Angeles 117
## 3 ALASKA Phoenix 12
## 4 AM WEST Phoenix 415
## 5 ALASKA San.Diego 20
## 6 AM WEST San.Diego 65
## 7 ALASKA San.Francisco 102
## 8 AM WEST San.Francisco 129
## 9 ALASKA Seattle 305
## 10 AM WEST Seattle 61
ggplot(data = delayed_data) +
geom_bar(
mapping = aes(x = City, y = Number, fill = Airline),
stat = "identity",
position = "dodge"
)
The graph shows that AM West had the most delays in every city except Seattle where Alaska Airlines had the most delays