Load all libraries
library(tidyr)
library(dplyr)
library(plyr)
library(ggplot2)
Read data from .csv to a dataframe
untidyFlight <- read.table("https://raw.githubusercontent.com/ezaccountz/Week_5_Assignment/master/flight_arrivals.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
untidyFlight
## X X.1 Los.Angeles Phonenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
First, rename the columns, remove the empty row(s) and fill in the airline name for the records of delayed flights
tidyFlight <- dplyr::rename(untidyFlight, "AirLine" = colnames(untidyFlight)[1], "status" = colnames(untidyFlight)[2])
tidyFlight <- tidyFlight %>%
filter(status != "")
tidyFlight$AirLine[seq(2,length(tidyFlight$status),2)] <- tidyFlight$AirLine[seq(1,length(tidyFlight$status),2)]
tidyFlight
## AirLine status Los.Angeles Phonenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Use the gather function to transform the table to be tidy
tidyFlight <- tidyFlight %>%
gather(destination, num_of_flights, 3:7)
tidyFlight
## AirLine status destination num_of_flights
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phonenix 221
## 6 ALASKA delayed Phonenix 12
## 7 AM WEST on time Phonenix 4840
## 8 AM WEST delayed Phonenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
The following shows the overall delay rate of the two airlines
ddply(tidyFlight, "AirLine", transform, perct = num_of_flights / sum(num_of_flights)) %>%
filter(status == "delayed") %>%
group_by(AirLine) %>%
dplyr::summarise(total_perct = sum(perct))
## # A tibble: 2 x 2
## AirLine total_perct
## <chr> <dbl>
## 1 ALASKA 0.133
## 2 AM WEST 0.109
The difference between the two airlines is not significantly large.
The following shows the overall delay rate of the five destinations
ddply(tidyFlight, "destination", transform, perct = num_of_flights / sum(num_of_flights)) %>%
filter(status == "delayed") %>%
group_by(destination) %>%
dplyr::summarise(total_perct = sum(perct))
## # A tibble: 5 x 2
## destination total_perct
## <chr> <dbl>
## 1 Los.Angeles 0.131
## 2 Phonenix 0.0778
## 3 San.Diego 0.125
## 4 San.Francisco 0.219
## 5 Seattle 0.152
We can see that the range of the rates is quite notable.
Now we perform further analysis. The following graph shows the total number of flights by airline and destination, and the total number of delayed flights by airline and destination.
tidyFlight2 <- ddply(tidyFlight, c("AirLine", "destination"), transform, total_flights = sum(num_of_flights)) %>%
filter(status == "delayed")
ggplot(data = tidyFlight2, aes(x = paste(destination,"\n",AirLine))) +
theme(axis.text.x = element_text(angle = 90)) +
xlab("Air Line and Destination") +
ylab("Total Number of Flights") +
geom_bar(aes(y=total_flights),stat = "identity",col = "cyan3", fill = "cyan3") +
geom_bar(aes(y=num_of_flights),stat = "identity",col = "coral1", fill = "coral1") +
geom_rect(mapping=aes(xmin=9.5, xmax=10, ymin=5000, ymax=5250),col = "cyan3", fill = "cyan3") +
geom_rect(mapping=aes(xmin=9.5, xmax=10, ymin=4500, ymax=4750),col = "coral1", fill = "coral1") +
geom_text(mapping=aes(x=9.75, y=5250, label="Total Num of Flight"), size=3, vjust=3, hjust=0.5) +
geom_text(mapping=aes(x=9.75, y=4750, label="Total Num of Delayed"), size=3, vjust=3, hjust=0.5)
To make it easier to compare, we compute the percentage of delays by airline and destination.
tidyFlight2 <- mutate(tidyFlight2, delay_perct = num_of_flights / total_flights)
tidyFlight2
## AirLine status destination num_of_flights total_flights delay_perct
## 1 ALASKA delayed Los.Angeles 62 559 0.11091234
## 2 ALASKA delayed Phonenix 12 233 0.05150215
## 3 ALASKA delayed San.Diego 20 232 0.08620690
## 4 ALASKA delayed San.Francisco 102 605 0.16859504
## 5 ALASKA delayed Seattle 305 2146 0.14212488
## 6 AM WEST delayed Los.Angeles 117 811 0.14426634
## 7 AM WEST delayed Phonenix 415 5255 0.07897241
## 8 AM WEST delayed San.Diego 65 448 0.14508929
## 9 AM WEST delayed San.Francisco 129 449 0.28730512
## 10 AM WEST delayed Seattle 61 262 0.23282443
Finally letโs compare some descriptive statistics of the two airlines
print("ALASKA")
## [1] "ALASKA"
tidyFlight2 %>%
filter(AirLine == "ALASKA") %>%
select(delay_perct) %>%
summary()
## delay_perct
## Min. :0.05150
## 1st Qu.:0.08621
## Median :0.11091
## Mean :0.11187
## 3rd Qu.:0.14212
## Max. :0.16860
print("AM WEST")
## [1] "AM WEST"
tidyFlight2 %>%
filter(AirLine == "AM WEST") %>%
select(delay_perct) %>%
summary()
## delay_perct
## Min. :0.07897
## 1st Qu.:0.14427
## Median :0.14509
## Mean :0.17769
## 3rd Qu.:0.23282
## Max. :0.28731
It is very clear now that all the statistics of AM WEST are higher than ALASKA. We can conclude that ALASKA is doing better job in making their flights on time.