Transform the wide dataset to tidy format. In the new dataset, one variable will form one column, one observation will form one row, and all observational units needed for the analysis will form a table. With the tidy dataset, the arrival delays for the two airlines across five destinations will be analyzed and compared.
library(RCurl)
## Loading required package: bitops
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(knitr)
library(ggplot2)
airdelay_url <- "https://raw.githubusercontent.com/YunMai-SPS/DA607-homework/master/Airline_delay.csv"
airdelay_raw <- read.csv(airdelay_url,header = T, sep=",",quote = "",stringsAsFactors = FALSE, na.strings=c("","NA"))
View(airdelay_raw)
# review the variables and observations in the dataset
str(airdelay_raw)
## 'data.frame': 6 obs. of 7 variables:
## $ X : chr "ALASKA" NA NA "AM WEST" ...
## $ X.1 : chr "on time" "delayed" NA "on time" ...
## $ Los.Angeles : int 497 62 NA 694 117 NA
## $ Phoenix : int 221 12 NA 4840 415 NA
## $ San.Diego : int 212 20 NA 383 65 NA
## $ San.Francisco: int 503 102 NA 320 129 NA
## $ Seattle : int 1841 305 NA 201 61 NA
# view the raw dataset
kable(airdelay_raw)
X | X.1 | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
---|---|---|---|---|---|---|
ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
NA | delayed | 62 | 12 | 20 | 102 | 305 |
NA | NA | NA | NA | NA | NA | NA |
AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
NA | delayed | 117 | 415 | 65 | 129 | 61 |
Source: Numbersense | Kaiser Fung | McGraw Hill| 2013 | NA | NA | NA | NA | NA | NA |
# name the first two columns
colnames(airdelay_raw)[c(1,2)] <- c("Airline","Performance")
# replace each NA with the most recent non-NA prior to it
airdelay_raw1 <- transform(airdelay_raw, Airline=na.locf(Airline))
# drop rows containing NA
airdelay_raw2 <- drop_na(airdelay_raw1)
kable(airdelay_raw2)
Airline | Performance | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle | |
---|---|---|---|---|---|---|---|
1 | ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
2 | ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
4 | AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
5 | AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
# collapse 5 columns with Destinations as header into 2 columns; then generate two columns for on-time and delayed flights; add three new derived variales - total, proportion of delayed and on-time
airdelay_raw3 <- airdelay_raw2 %>%
gather(Destination, Number,3:7) %>%
spread(Performance, Number)
# change the column name "on time" to a valid fromat,"on_time", for the mutate function
colnames(airdelay_raw3)[4] <- "on_time"
airdelay_tidy1 <- mutate(airdelay_raw3,Total = on_time + delayed, Proportion_delayed = delayed/Total, Proportion_ontime = on_time/Total)
kable(airdelay_tidy1)
Airline | Destination | delayed | on_time | Total | Proportion_delayed | Proportion_ontime |
---|---|---|---|---|---|---|
ALASKA | Los.Angeles | 62 | 497 | 559 | 0.1109123 | 0.8890877 |
ALASKA | Phoenix | 12 | 221 | 233 | 0.0515021 | 0.9484979 |
ALASKA | San.Diego | 20 | 212 | 232 | 0.0862069 | 0.9137931 |
ALASKA | San.Francisco | 102 | 503 | 605 | 0.1685950 | 0.8314050 |
ALASKA | Seattle | 305 | 1841 | 2146 | 0.1421249 | 0.8578751 |
AM WEST | Los.Angeles | 117 | 694 | 811 | 0.1442663 | 0.8557337 |
AM WEST | Phoenix | 415 | 4840 | 5255 | 0.0789724 | 0.9210276 |
AM WEST | San.Diego | 65 | 383 | 448 | 0.1450893 | 0.8549107 |
AM WEST | San.Francisco | 129 | 320 | 449 | 0.2873051 | 0.7126949 |
AM WEST | Seattle | 61 | 201 | 262 | 0.2328244 | 0.7671756 |
# plot total counts of flights for the two Airlines accross five destinations
ggplot(airdelay_tidy1, aes(x = Destination, y = Total, fill = Airline)) +
geom_bar(stat="identity", position = "dodge") +
labs(y = "Total flights") +
scale_fill_brewer(palette = "Paired")
Figure 1. Total flights of ALASKA Airline and AM WEST to the five airports.The majority of flights to Phoenix is carried by AM WEST while the majority of flights to Seattle is carried by ALASKA.The contributions to the flight volumes from ALSAKA and AM WEST are silimar in other three airports.
# plot arrival delay rate for the two Airlines accross five destinations
ggplot(airdelay_tidy1, aes(x = Destination, y = Proportion_delayed, fill = Airline)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1") +
labs(y = "Proportion_delay") +
geom_text(aes(label = round(Proportion_delayed, digits = 2)), color='black', position=position_dodge(.9), size=4)
Figure 2. The arrival delay rates of ALASKA Airline and AM WEST accross the five airports.
It is easy to conclude from the bar graph:
Arrival delay occurs more frequently to AM WEST than to ALASKA Airline across the five airports, especially at San. Francisco and Seattle.
To both of the airlines, Phoenix has the lowest arrvial delay rate and San. Froncisco has the highest arrival delay rate accross the five airports.
One of the possbile reasons why AM WEST has the lowest rate of arrival delay at Phoenix Airport is that it is familiar with the ariline since it flies a lot more to this airport than to the other four airports. It is worthwhile to notice that ALASKA Airline also has the lowest arrival delay rate at Phoenix Airport even though that airline only accounts for a very small amount of its flights. This indicates that Phoenix Airport may has optimized schedule to decrease the effects of season, whether, weekday etc. on the performance.
# plot total flights for the two Airlines accross five destinations with stacked bar graph
ggplot(airdelay_tidy1, aes(x = Airline, y = Total, fill = Destination)) +
geom_bar(stat="identity", position = "fill") +
labs(y = "Total flights") +
scale_fill_brewer(palette = "Paired")
Figure 3. The stacked bar graph for the total flights of ALASKA Airline and AM WEST to the five airports.
It is easy to conclude from the stacked bar graph that: 1) Seattle contributes the most to the flights of ALASKA Airline but the least to the flights of AM WEST Airline. 2) Phoenix contributes the most to the flights of AM WEST Airline but the least to the flights of ALASKA Airline.
# calculate the delay rate and on-time rate in all flights of ALASKA
airdelay_tidy2_1 <- airdelay_tidy1 %>%
filter(Airline %in% "ALASKA") %>%
mutate(All_Flight = sum(Total), Delayed_to_all = delayed/All_Flight ,Ontime_to_all = on_time/All_Flight)
# calculate the delay rate and on-time rate in all flights of AM WEST
airdelay_tidy2_2 <-airdelay_tidy1 %>%
filter(Airline %in% "AM WEST") %>%
mutate(All_Flight = sum(Total), Delayed_to_all = delayed/All_Flight ,Ontime_to_all = on_time/All_Flight)
# join two mutated datasets
airdelay_tidy2 <- bind_rows(airdelay_tidy2_1, airdelay_tidy2_2)
kable(airdelay_tidy2)
Airline | Destination | delayed | on_time | Total | Proportion_delayed | Proportion_ontime | All_Flight | Delayed_to_all | Ontime_to_all |
---|---|---|---|---|---|---|---|---|---|
ALASKA | Los.Angeles | 62 | 497 | 559 | 0.1109123 | 0.8890877 | 3775 | 0.0164238 | 0.1316556 |
ALASKA | Phoenix | 12 | 221 | 233 | 0.0515021 | 0.9484979 | 3775 | 0.0031788 | 0.0585430 |
ALASKA | San.Diego | 20 | 212 | 232 | 0.0862069 | 0.9137931 | 3775 | 0.0052980 | 0.0561589 |
ALASKA | San.Francisco | 102 | 503 | 605 | 0.1685950 | 0.8314050 | 3775 | 0.0270199 | 0.1332450 |
ALASKA | Seattle | 305 | 1841 | 2146 | 0.1421249 | 0.8578751 | 3775 | 0.0807947 | 0.4876821 |
AM WEST | Los.Angeles | 117 | 694 | 811 | 0.1442663 | 0.8557337 | 7225 | 0.0161938 | 0.0960554 |
AM WEST | Phoenix | 415 | 4840 | 5255 | 0.0789724 | 0.9210276 | 7225 | 0.0574394 | 0.6698962 |
AM WEST | San.Diego | 65 | 383 | 448 | 0.1450893 | 0.8549107 | 7225 | 0.0089965 | 0.0530104 |
AM WEST | San.Francisco | 129 | 320 | 449 | 0.2873051 | 0.7126949 | 7225 | 0.0178547 | 0.0442907 |
AM WEST | Seattle | 61 | 201 | 262 | 0.2328244 | 0.7671756 | 7225 | 0.0084429 | 0.0278201 |
# plot arrival delay rate for the two Airlines accross five destinations with stacked bar graph
ggplot(airdelay_tidy2, aes(x = Airline, y = Delayed_to_all, fill = Destination)) +
geom_bar(stat="identity", position = "fill") +
labs(y = "Arrival Delay Rate") +
scale_fill_brewer(palette = "Paired")
Figure 4. The stacked bar graph for the arrival-delay rates of ALASKA Airline and AM WEST accross the five airports.
The pattern of the distribution of arrival-delay rates accross the five airports is similar to that of total flights. But the arrival-delayed rate at San. Francisco for AM WEST Airline is not proportinal to its total flights, suggesting it performed better at San. Francisco Airport than at the rest 4 airports.
# plot arrival delay rate for the two Airlines accross five destinations with stacked bar graph
ggplot(airdelay_tidy1, aes(x = Airline, y = Proportion_delayed, fill = Destination)) +
geom_bar(stat="identity") +
labs(y = "Arrival Delay Rate") +
scale_fill_brewer(palette = "Paired")
Figure 5. The stacked bar graph for the individual arrival-delay/individual flights to a airport of ALASKA Airline and AM WEST accross the five airports.
According to Figure 3, the Flights to San Francisco and Seattle accounts for 75% and 10% of the businesses of ALASKA Airline and AM WEST Airline respectively. The arrival-delay rate at these two airports for bother Airlines accounts for around 50% of their total delayed events and being the high for both Airlines, suggesting these two airports are very busy and congested and their schedules need to be optimized.
# make groupies observations and change the units of analysis
airdelay_tidy3 <- airdelay_tidy1 %>%
group_by(Airline) %>%
summarise(delayed = sum(delayed), on_time = sum(on_time), Total = sum(Total)) %>%
gather("Performance","Number",2:4)
kable(airdelay_tidy3)
Airline | Performance | Number |
---|---|---|
ALASKA | delayed | 501 |
AM WEST | delayed | 787 |
ALASKA | on_time | 3274 |
AM WEST | on_time | 6438 |
ALASKA | Total | 3775 |
AM WEST | Total | 7225 |
# plot the total counts of flights arrived on time or delayed for the twe Airlines
ggplot(airdelay_tidy3, aes(x = Airline, y = Number, fill = Performance)) +
geom_bar(stat="identity", position = "dodge") +
labs(y = "Flight Count") +
scale_fill_brewer(palette = "Set1")
Figure 6. The total counts of flights arrived on time or delayed of ALASKA Airline and AM WEST.
AM WEST Airline flies more than ALASKA Airline and AM WEST Airline has higher counts of flights on either arrived on time or delayed.
airdelay_tidy4 <- airdelay_tidy1 %>%
group_by(Airline) %>%
gather("Number_category","Number", 3:5)
airdelay_tidy3[,3:4] <- NULL
kable(airdelay_tidy4)
Airline | Destination | Proportion_delayed | Proportion_ontime | Number_category | Number |
---|---|---|---|---|---|
ALASKA | Los.Angeles | 0.1109123 | 0.8890877 | delayed | 62 |
ALASKA | Phoenix | 0.0515021 | 0.9484979 | delayed | 12 |
ALASKA | San.Diego | 0.0862069 | 0.9137931 | delayed | 20 |
ALASKA | San.Francisco | 0.1685950 | 0.8314050 | delayed | 102 |
ALASKA | Seattle | 0.1421249 | 0.8578751 | delayed | 305 |
AM WEST | Los.Angeles | 0.1442663 | 0.8557337 | delayed | 117 |
AM WEST | Phoenix | 0.0789724 | 0.9210276 | delayed | 415 |
AM WEST | San.Diego | 0.1450893 | 0.8549107 | delayed | 65 |
AM WEST | San.Francisco | 0.2873051 | 0.7126949 | delayed | 129 |
AM WEST | Seattle | 0.2328244 | 0.7671756 | delayed | 61 |
ALASKA | Los.Angeles | 0.1109123 | 0.8890877 | on_time | 497 |
ALASKA | Phoenix | 0.0515021 | 0.9484979 | on_time | 221 |
ALASKA | San.Diego | 0.0862069 | 0.9137931 | on_time | 212 |
ALASKA | San.Francisco | 0.1685950 | 0.8314050 | on_time | 503 |
ALASKA | Seattle | 0.1421249 | 0.8578751 | on_time | 1841 |
AM WEST | Los.Angeles | 0.1442663 | 0.8557337 | on_time | 694 |
AM WEST | Phoenix | 0.0789724 | 0.9210276 | on_time | 4840 |
AM WEST | San.Diego | 0.1450893 | 0.8549107 | on_time | 383 |
AM WEST | San.Francisco | 0.2873051 | 0.7126949 | on_time | 320 |
AM WEST | Seattle | 0.2328244 | 0.7671756 | on_time | 201 |
ALASKA | Los.Angeles | 0.1109123 | 0.8890877 | Total | 559 |
ALASKA | Phoenix | 0.0515021 | 0.9484979 | Total | 233 |
ALASKA | San.Diego | 0.0862069 | 0.9137931 | Total | 232 |
ALASKA | San.Francisco | 0.1685950 | 0.8314050 | Total | 605 |
ALASKA | Seattle | 0.1421249 | 0.8578751 | Total | 2146 |
AM WEST | Los.Angeles | 0.1442663 | 0.8557337 | Total | 811 |
AM WEST | Phoenix | 0.0789724 | 0.9210276 | Total | 5255 |
AM WEST | San.Diego | 0.1450893 | 0.8549107 | Total | 448 |
AM WEST | San.Francisco | 0.2873051 | 0.7126949 | Total | 449 |
AM WEST | Seattle | 0.2328244 | 0.7671756 | Total | 262 |
ggplot(airdelay_tidy4, aes(x = factor(Number_category), y = Number, fill = Airline )) +
geom_boxplot() +
labs(x = "Performance", y = "Filght Count") +
theme(axis.text.x = element_text(vjust = 1, hjust = 0.5)) +
scale_y_log10()
Figure 7. The overall performance of ALASKA Airline and AM WEST Airline.
The median of delayed arrival of ALASKA Airline is smaller than that of AM WEST Airline. On the countrary, the median of on time arrivle of ALASKA Airline is larger than that of AM WEST Airline. These observations suggest that ALASKA Airline did better in on-time performance.