The chart above describes arrival delays for two airlines across five destinations. Your task is to:
Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
Perform analysis to compare the arrival delays for the two airlines.
Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# short view of the table contents
glimpse(csv.file)
## Observations: 5
## Variables: 7
## $ Airline <chr> "ALASKA", "", "", "AM WEST", ""
## $ Status <chr> "on time", "delayed", "", "on time", "delayed"
## $ Los.Angeles <int> 497, 62, NA, 694, 117
## $ Phoenix <int> 221, 12, NA, 4840, 415
## $ San.Diego <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle <int> 1841, 305, NA, 201, 61
# draw a graph with all 5 destination cities
la<-as.numeric(as.character(df4$Los.Angeles))
ph<-as.numeric(as.character(df4$Phoenix))
sd<-as.numeric(as.character(df4$San.Diego))
sf<-as.numeric(as.character(df4$San.Francisco))
se<-as.numeric(as.character(df4$Seattle))
ggplot(df4, aes(x = la)) +
geom_line(aes(y = la), colour="blue") + # Los Angeles
geom_line(aes(y = ph), colour ="grey") + # Phoenix
geom_line(aes(y = sd), colour="red") + # San Diego
geom_line(aes(y = sf), colour ="green") + # San Francisco
geom_line(aes(y = se), colour="yellow") + # Seattle
ylab(label="Arrival Time") +
xlab("Destination") +
scale_colour_manual(name="Destination", values=c("blue", "grey", "red", "green", "yellow"))
# short view of the new table
glimpse(df3)
## Observations: 11
## Variables: 8
## $ Airline <chr> "ALASKA", "", NA, "AM WEST", "", "TotalP...
## $ Status <chr> "on time", "delayed", NA, "on time", "de...
## $ Los.Angeles <chr> "497", "62", NA, "694", "117", "1370", "...
## $ Phoenix <chr> "221", "12", NA, "4840", "415", "5488", ...
## $ San.Diego <chr> "212", "20", NA, "383", "65", "680", "59...
## $ San.Francisco <chr> "503", "102", NA, "320", "129", "1054", ...
## $ Seattle <chr> "1841", "305", NA, "201", "61", "2408", ...
## $ TotalPerAirlineStatus <fct> 3274, 501, NA, 6438, 787, , , , , ,
# comparison of the 2 airlines and best city rate
df3.OnTimeRatePerCity<-c(round(df3.ontime1/(df4.row1+df4.row3),3),round(df3.ontime2/(df4.row1+df4.row3),3),round(df3.ontime3/(df4.row1+df4.row3),3),round(df3.ontime4/(df4.row1+df4.row3),3),round(df3.ontime5/(df4.row1+df4.row3),3))
df3.DelayRatePerCity<-c(round(df3.delayed1/(df4.row2+df4.row4),3),round(df3.delayed2/(df4.row2+df4.row4),3),round(df3.delayed3/(df4.row2+df4.row4),3),round(df3.delayed4/(df4.row2+df4.row4),3),round(df3.delayed5/(df4.row2+df4.row4),3))
# display table
kable(data.frame("Best On Time Airline"=c(df3$Airline[which(df3$TotalPerAirlineStatus==max(na.omit(as.numeric(as.character(df3$TotalPerAirlineStatus)))))]),"Most Delayed Airline"=c(df3$Airline[which(df3$TotalPerAirlineStatus==min(na.omit(as.numeric(as.character(df3$TotalPerAirlineStatus)))))-1]),"Best Ontime Rate City"=c(paste("Phoenix, rate =",max(df3.OnTimeRatePerCity))),"AM WEST"=c(paste("on time=",round(6438/7225,4)*100,"%,delay=",round(787/7225,4)*100,"%")),"ALASKA"=c(paste("on time=",round(3274/3775,4)*100,"%,delay=",round(501/3775,4)*100,"%"))))
| Best.On.Time.Airline | Most.Delayed.Airline | Best.Ontime.Rate.City | AM.WEST | ALASKA |
|---|---|---|---|---|
| AM WEST | ALASKA | Phoenix, rate = 0.521 | on time= 89.11 %,delay= 10.89 % | on time= 86.73 %,delay= 13.27 % |
# sort destinations with their respective total on time and on time rates
df5<-data.frame(n=cbind(colnames(df3)[3:7],TotalOnTimePerCity),df3.OnTimeRatePerCity)
df5<-df5[rev(order(TotalOnTimePerCity)),]
row.names(df5)<-NULL
names(df5)[1:3] <-c("Destination","TotalOnTimePerCity","OnTimeRatePerCity")
# display table
datatable(df5)