#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
After creating the csv file in my local machine, I loaded the data file into Rstudio.
flights_delayed <- read.csv("C:/Users/blin261/Desktop/DATA607/flights_delayed.csv", header = TRUE, stringsAsFactors = FALSE)
flights_delayed
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
str(flights_delayed)
## 'data.frame': 4 obs. of 7 variables:
## $ X : chr "ALASKA" "ALASKA" "AM WEST" "AM WEST"
## $ X.1 : chr "on time" "delayed" "on time" "delayed"
## $ Los.Angeles : int 497 62 694 117
## $ Phoenix : int 221 12 4840 415
## $ San.Diego : int 212 20 383 65
## $ San.Francisco: int 503 102 320 129
## $ Seattle : int 1841 305 201 61
From the raw databases, we can tell the dataset is not very tidy and appears to be “wide”. Some of the column names do not really make sense. The following code will first change the column names and the observation’s name as well.
colnames(flights_delayed)[1:2] <- c("Airlines", "Punctuality")
flights_delayed$Punctuality[flights_delayed$Punctuality == "on time"] <- "on_time"
flights_delayed
## Airlines Punctuality Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on_time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on_time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Then what I am doing with the following code is to transform the dataset from being “wide” to be “long”, so that all the data that we are interested in can be shown in a data frame.
flights <- flights_delayed%>%
gather(Destinations, Frequency, Los.Angeles:Seattle)%>%
spread(Punctuality, Frequency)
flights
## Airlines Destinations delayed on_time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
In order to perform analysis, more variables need to be created, such as the total flights for each airline company on each destinations. In the meantime, we can also calculate the percentage of flights that were delayed. This can be accomplished using function called “mutate”.
flights <- flights%>%
mutate(total = delayed + on_time,Percentage = delayed / total)
flights
## Airlines Destinations delayed on_time total Percentage
## 1 ALASKA Los.Angeles 62 497 559 0.11091234
## 2 ALASKA Phoenix 12 221 233 0.05150215
## 3 ALASKA San.Diego 20 212 232 0.08620690
## 4 ALASKA San.Francisco 102 503 605 0.16859504
## 5 ALASKA Seattle 305 1841 2146 0.14212488
## 6 AM WEST Los.Angeles 117 694 811 0.14426634
## 7 AM WEST Phoenix 415 4840 5255 0.07897241
## 8 AM WEST San.Diego 65 383 448 0.14508929
## 9 AM WEST San.Francisco 129 320 449 0.28730512
## 10 AM WEST Seattle 61 201 262 0.23282443
str(flights)
## 'data.frame': 10 obs. of 6 variables:
## $ Airlines : chr "ALASKA" "ALASKA" "ALASKA" "ALASKA" ...
## $ Destinations: chr "Los.Angeles" "Phoenix" "San.Diego" "San.Francisco" ...
## $ delayed : int 62 12 20 102 305 117 415 65 129 61
## $ on_time : int 497 221 212 503 1841 694 4840 383 320 201
## $ total : int 559 233 232 605 2146 811 5255 448 449 262
## $ Percentage : num 0.1109 0.0515 0.0862 0.1686 0.1421 ...
After we create the new data frame called flights, we want to calculate the aggregate percentage of flights that were delayed for each airline. group_by function can help us calculate variables by group, in this case, the airline companies. summarize function can help us perform calculations in an aggregate scale. The result generated is a table. As shown in the following, AM WEST seems to be a better airline company overall as it has lower flights delayed rate compared to ALASKA.
f <- flights%>%
group_by(Airlines)%>%
summarize(total_delayed = sum(delayed), total_on_time = sum(on_time), total_flights = sum(total), total_percentage = total_delayed / total_flights)
f
## # A tibble: 2 x 5
## Airlines total_delayed total_on_time total_flights total_percentage
## <chr> <int> <int> <int> <dbl>
## 1 ALASKA 501 3274 3775 0.1327152
## 2 AM WEST 787 6438 7225 0.1089273
We also want to know what is each airline companies’ flight delayed rate look like for each destination. Of course, we need to group the data by airline companies again, then use spread function to display each destinations in columnns. The observations of those variables will be percentage of flight delays. Surprisingly, for every destination, it is always the ALASKA which has relatively lower flight delayed rate.
F <- flights%>%
group_by(Airlines)%>%
arrange(Destinations, Airlines)%>%
select(Airlines, Destinations, Percentage)%>%
spread(Destinations, Percentage)
F
## Source: local data frame [2 x 6]
## Groups: Airlines [2]
##
## Airlines Los.Angeles Phoenix San.Diego San.Francisco Seattle
## * <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA 0.1109123 0.05150215 0.0862069 0.1685950 0.1421249
## 2 AM WEST 0.1442663 0.07897241 0.1450893 0.2873051 0.2328244
If ALASKA is able to beat AW WEST on each destinations in terms of lower flight delayed rates, how can AW WEST still be a better airline company overall? That is a question we want to ask. Actually this phenomenon has a very technical term called Simpson’s Paradox. If we want to investigate the reason behind this paradox, it is good idea to draw some graphs to have visual information about these numbers.According to the graph, we are able to know that both companies have very closed flight delayed rates. The majoy discrepancies between the two are the numbers of flights that land at Phoenix. Since most of AW WEST’s flights go to Phoenix, in addition, the flights delayed rates have been the lowest among all destinations for both airline companies in Phoenix, AW WESt will be able to take advantage of that to lower down its overall flight delayed rate.
ggplot(data = flights, aes(y = total, x = Destinations))+ geom_point(aes(color = Airlines)) + labs(title = "Total Flights")
ggplot(data = flights, aes(y = Percentage, x = Destinations))+ geom_point(aes(color = Airlines)) + labs(title = "Percentage of Delayed Flights")