csv

The chart above describes arrival delays for two airlines across five destinations. Your task is to:

  1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

  2. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

  3. Perform analysis to compare the arrival delays for the two airlines.

  4. Your code should be in an R Markdown file, posted to rpubs.com, and should include narrative descriptions of your data cleanup work, analysis, and conclusions. Please include in your homework submission:

Table created in “Assignment 5-Data 607.csv” file
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Imported table from “Assignment 5-Data 607.csv” file
# short view of the table contents
glimpse(csv.file)
## Observations: 5
## Variables: 7
## $ Airline       <chr> "ALASKA", "", "", "AM WEST", ""
## $ Status        <chr> "on time", "delayed", "", "on time", "delayed"
## $ Los.Angeles   <int> 497, 62, NA, 694, 117
## $ Phoenix       <int> 221, 12, NA, 4840, 415
## $ San.Diego     <int> 212, 20, NA, 383, 65
## $ San.Francisco <int> 503, 102, NA, 320, 129
## $ Seattle       <int> 1841, 305, NA, 201, 61
Analytics: by rows and by columns
# draw a graph with all 5 destination cities 
la<-as.numeric(as.character(df4$Los.Angeles))
ph<-as.numeric(as.character(df4$Phoenix))
sd<-as.numeric(as.character(df4$San.Diego))
sf<-as.numeric(as.character(df4$San.Francisco))
se<-as.numeric(as.character(df4$Seattle))
ggplot(df4, aes(x = la)) + 
  geom_line(aes(y = la), colour="blue") +     # Los Angeles
  geom_line(aes(y = ph), colour ="grey") +    # Phoenix
  geom_line(aes(y = sd), colour="red") +      # San Diego
  geom_line(aes(y = sf), colour ="green") +   # San Francisco
  geom_line(aes(y = se), colour="yellow") +   # Seattle
  ylab(label="Arrival Time") + 
  xlab("Destination") +
  scale_colour_manual(name="Destination", values=c("blue", "grey", "red", "green", "yellow")) 

More Analytics
# short view of the new table
glimpse(df3)
## Observations: 11
## Variables: 8
## $ Airline               <chr> "ALASKA", "", NA, "AM WEST", "", "TotalP...
## $ Status                <chr> "on time", "delayed", NA, "on time", "de...
## $ Los.Angeles           <chr> "497", "62", NA, "694", "117", "1370", "...
## $ Phoenix               <chr> "221", "12", NA, "4840", "415", "5488", ...
## $ San.Diego             <chr> "212", "20", NA, "383", "65", "680", "59...
## $ San.Francisco         <chr> "503", "102", NA, "320", "129", "1054", ...
## $ Seattle               <chr> "1841", "305", NA, "201", "61", "2408", ...
## $ TotalPerAirlineStatus <fct> 3274, 501, NA, 6438, 787,  ,  ,  ,  ,  ,
Conclusion Summary
# comparison of the 2 airlines and best city rate
df3.OnTimeRatePerCity<-c(round(df3.ontime1/(df4.row1+df4.row3),3),round(df3.ontime2/(df4.row1+df4.row3),3),round(df3.ontime3/(df4.row1+df4.row3),3),round(df3.ontime4/(df4.row1+df4.row3),3),round(df3.ontime5/(df4.row1+df4.row3),3))
df3.DelayRatePerCity<-c(round(df3.delayed1/(df4.row2+df4.row4),3),round(df3.delayed2/(df4.row2+df4.row4),3),round(df3.delayed3/(df4.row2+df4.row4),3),round(df3.delayed4/(df4.row2+df4.row4),3),round(df3.delayed5/(df4.row2+df4.row4),3))

# display table 
kable(data.frame("Best On Time Airline"=c(df3$Airline[which(df3$TotalPerAirlineStatus==max(na.omit(as.numeric(as.character(df3$TotalPerAirlineStatus)))))]),"Most Delayed Airline"=c(df3$Airline[which(df3$TotalPerAirlineStatus==min(na.omit(as.numeric(as.character(df3$TotalPerAirlineStatus)))))-1]),"Best Ontime Rate City"=c(paste("Phoenix, rate =",max(df3.OnTimeRatePerCity))),"AM WEST"=c(paste("on time=",round(6438/7225,4)*100,"%,delay=",round(787/7225,4)*100,"%")),"ALASKA"=c(paste("on time=",round(3274/3775,4)*100,"%,delay=",round(501/3775,4)*100,"%"))))
Best.On.Time.Airline Most.Delayed.Airline Best.Ontime.Rate.City AM.WEST ALASKA
AM WEST ALASKA Phoenix, rate = 0.521 on time= 89.11 %,delay= 10.89 % on time= 86.73 %,delay= 13.27 %
The following table is showing the city with best on time ranking as first:
# sort destinations with their respective total on time and on time rates
df5<-data.frame(n=cbind(colnames(df3)[3:7],TotalOnTimePerCity),df3.OnTimeRatePerCity)
df5<-df5[rev(order(TotalOnTimePerCity)),]
row.names(df5)<-NULL
names(df5)[1:3] <-c("Destination","TotalOnTimePerCity","OnTimeRatePerCity")

# display table 
datatable(df5)