library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the raw airline-by-city flight counts. read.csv() already defaults to
# header = TRUE and sep = ",", so only the file name needs to be supplied.
AlaskaAM <- read.csv(file = "AlaskaAM.csv")

# Preview the first rows to see the untidy layout (unnamed leading columns,
# blank airline cells, and an empty separator row).
head(AlaskaAM)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
# Clean the raw table:
#   1. give the two unnamed leading columns (X, X.1) meaningful names;
#   2. drop the separator row, whose Status is blank;
#   3. fill in the airline name on rows where it was left blank.
#
# The blank airline cells are turned into NA and filled downward from the row
# above, rather than being overwritten with a hard-coded vector of names, so
# the step keeps working if the file gains airlines or status rows.
AlaskaAM2 <- AlaskaAM %>%
  rename(Airline = X, Status = X.1) %>%
  filter(Status != "") %>%
  # as.character() guards against Airline having been read in as a factor.
  mutate(Airline = na_if(as.character(Airline), "")) %>%
  fill(Airline)
AlaskaAM2
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
# Long format: stack the city columns (Los.Angeles through Seattle) into
# City / Number_Flights pairs, one row per airline, status and city.
# NOTE(review): gather()/spread() are superseded by pivot_longer()/
# pivot_wider() in tidyr >= 1.0; kept here so the output layout is unchanged.
tidydata1 <- gather(
  AlaskaAM2,
  key = "City",
  value = "Number_Flights",
  Los.Angeles:Seattle
)

# Wide-by-status format: one row per airline and city, with a separate count
# column for each Status value.
tidydata2 <- spread(tidydata1, key = Status, value = Number_Flights)
tidydata2
## Airline City delayed on time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
# Summarise the delayed-flight counts per airline: the smallest, largest and
# mean count across cities, plus the overall total.
tidydata2 %>%
  group_by(Airline) %>%
  summarise(
    Min_Delays = min(delayed),
    Max_Delays = max(delayed),
    Avg_Delays = mean(delayed),
    Total_Delays = sum(delayed)
  )
## # A tibble: 2 x 5
## Airline Min_Delays Max_Delays Avg_Delays Total_Delays
## <chr> <int> <int> <dbl> <int>
## 1 ALASKA 12 305 100. 501
## 2 AM WEST 61 415 157. 787
Using the summarise function, we can calculate the total delays for both airlines. The totals show that AM West has more delays than Alaska Airlines. We also noticed that the majority of AM West's delays are in Phoenix.
Let’s put this in a graph because I love graphs.
# Bar chart of delayed-flight counts per airline, with one panel per city.
#
# tidydata1 holds both "on time" and "delayed" rows, so it is filtered to the
# delayed rows first. Without the filter, each bar stacks the on-time and
# delayed counts and gets two overlapping labels, which contradicts the
# "Delay" title and axis label.
tidydata1 %>%
  filter(Status == "delayed") %>%
  ggplot(aes(x = Airline, y = Number_Flights)) +
  geom_bar(stat = "identity", aes(fill = Airline)) +
  # vjust is a fixed offset, not a data mapping, so it belongs outside aes().
  geom_text(aes(label = Number_Flights), vjust = -0.4) +
  labs(
    title = "Delays of Airlines & City",
    x = "Airline",
    y = "Delay Flight Count"
  ) +
  facet_wrap(~City, ncol = 5) +
  theme_bw()