R Markdown

1. Import libraries

library(stringr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('scales')
## Warning: package 'scales' was built under R version 3.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows

2. Get .csv file

csvfile <- read.csv("https://raw.githubusercontent.com/AjayArora35/Data-607-Assignment-Week-5/master/AssignmentWeek5.csv", header=FALSE, sep=",", stringsAsFactors=FALSE)
csvfile <- data.frame(csvfile)
csvfile
##        V1      V2            V3      V4          V5              V6
## 1                 Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3         delayed            62      12          20             102
## 4                                                                  
## 5 AM WEST on time           694   4,840         383             320
## 6         delayed           117     415          65             129
##        V7
## 1 Seattle
## 2   1,841
## 3     305
## 4        
## 5     201
## 6      61

3. Assign headers to ‘V1’ and ‘V2’ then copy the first row as headers, etc.

#Assign missing headers
csvfile$V1[1] <- "Airline"
csvfile$V2[1] <- "Status"
csvfile
##        V1      V2            V3      V4          V5              V6
## 1 Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3         delayed            62      12          20             102
## 4                                                                  
## 5 AM WEST on time           694   4,840         383             320
## 6         delayed           117     415          65             129
##        V7
## 1 Seattle
## 2   1,841
## 3     305
## 4        
## 5     201
## 6      61
#Copy the first row as headers
names(csvfile) <- csvfile[1,]
csvfile
##   Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 1 Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3         delayed            62      12          20             102
## 4                                                                  
## 5 AM WEST on time           694   4,840         383             320
## 6         delayed           117     415          65             129
##   Seattle
## 1 Seattle
## 2   1,841
## 3     305
## 4        
## 5     201
## 6      61
#Remove the first row
csvfile <- csvfile[-c(1), ]
csvfile
##   Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3         delayed            62      12          20             102
## 4                                                                  
## 5 AM WEST on time           694   4,840         383             320
## 6         delayed           117     415          65             129
##   Seattle
## 2   1,841
## 3     305
## 4        
## 5     201
## 6      61
#Remove the NA row as well
csvfile <- csvfile[c(1,2,4,5), ]
csvfile
##   Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3         delayed            62      12          20             102
## 5 AM WEST on time           694   4,840         383             320
## 6         delayed           117     415          65             129
##   Seattle
## 2   1,841
## 3     305
## 5     201
## 6      61
#Fill in the Airline missing column data 
for (i in 1:dim(csvfile)[1]){
  if (i %% 2 == 0){
      csvfile$Airline[i] <-  csvfile$Airline[i-1]
  }
}


csvfile
##   Airline  Status Los \nAngeles Phoenix San \nDiego San \nFrancisco
## 2  ALASKA on time           497     221         212             503
## 3  ALASKA delayed            62      12          20             102
## 5 AM WEST on time           694   4,840         383             320
## 6 AM WEST delayed           117     415          65             129
##   Seattle
## 2   1,841
## 3     305
## 5     201
## 6      61

4. Some Analysis

Now, take the wide table and make it long: tidyr (Pivot), and additional cleanup

flights <- gather(csvfile, key = 'City', value = 'Flights', 3:7, convert=TRUE)
flights
##    Airline  Status            City Flights
## 1   ALASKA on time   Los \nAngeles     497
## 2   ALASKA delayed   Los \nAngeles      62
## 3  AM WEST on time   Los \nAngeles     694
## 4  AM WEST delayed   Los \nAngeles     117
## 5   ALASKA on time         Phoenix     221
## 6   ALASKA delayed         Phoenix      12
## 7  AM WEST on time         Phoenix   4,840
## 8  AM WEST delayed         Phoenix     415
## 9   ALASKA on time     San \nDiego     212
## 10  ALASKA delayed     San \nDiego      20
## 11 AM WEST on time     San \nDiego     383
## 12 AM WEST delayed     San \nDiego      65
## 13  ALASKA on time San \nFrancisco     503
## 14  ALASKA delayed San \nFrancisco     102
## 15 AM WEST on time San \nFrancisco     320
## 16 AM WEST delayed San \nFrancisco     129
## 17  ALASKA on time         Seattle   1,841
## 18  ALASKA delayed         Seattle     305
## 19 AM WEST on time         Seattle     201
## 20 AM WEST delayed         Seattle      61
#Remove new line character from city names and remove commas from number of flights
flights$City <- str_replace_all(flights$City, "[\r\n]" , "")
flights
##    Airline  Status          City Flights
## 1   ALASKA on time   Los Angeles     497
## 2   ALASKA delayed   Los Angeles      62
## 3  AM WEST on time   Los Angeles     694
## 4  AM WEST delayed   Los Angeles     117
## 5   ALASKA on time       Phoenix     221
## 6   ALASKA delayed       Phoenix      12
## 7  AM WEST on time       Phoenix   4,840
## 8  AM WEST delayed       Phoenix     415
## 9   ALASKA on time     San Diego     212
## 10  ALASKA delayed     San Diego      20
## 11 AM WEST on time     San Diego     383
## 12 AM WEST delayed     San Diego      65
## 13  ALASKA on time San Francisco     503
## 14  ALASKA delayed San Francisco     102
## 15 AM WEST on time San Francisco     320
## 16 AM WEST delayed San Francisco     129
## 17  ALASKA on time       Seattle   1,841
## 18  ALASKA delayed       Seattle     305
## 19 AM WEST on time       Seattle     201
## 20 AM WEST delayed       Seattle      61
flights$Flights <- str_replace_all(flights$Flights, "[,]" , "")
flights
##    Airline  Status          City Flights
## 1   ALASKA on time   Los Angeles     497
## 2   ALASKA delayed   Los Angeles      62
## 3  AM WEST on time   Los Angeles     694
## 4  AM WEST delayed   Los Angeles     117
## 5   ALASKA on time       Phoenix     221
## 6   ALASKA delayed       Phoenix      12
## 7  AM WEST on time       Phoenix    4840
## 8  AM WEST delayed       Phoenix     415
## 9   ALASKA on time     San Diego     212
## 10  ALASKA delayed     San Diego      20
## 11 AM WEST on time     San Diego     383
## 12 AM WEST delayed     San Diego      65
## 13  ALASKA on time San Francisco     503
## 14  ALASKA delayed San Francisco     102
## 15 AM WEST on time San Francisco     320
## 16 AM WEST delayed San Francisco     129
## 17  ALASKA on time       Seattle    1841
## 18  ALASKA delayed       Seattle     305
## 19 AM WEST on time       Seattle     201
## 20 AM WEST delayed       Seattle      61
#Now, separate out the status column into On-time and Delayed 
flights2 <-  flights %>% spread(Status, `Flights`, convert=TRUE)
flights2
##    Airline          City delayed on time
## 1   ALASKA   Los Angeles      62     497
## 2   ALASKA       Phoenix      12     221
## 3   ALASKA     San Diego      20     212
## 4   ALASKA San Francisco     102     503
## 5   ALASKA       Seattle     305    1841
## 6  AM WEST   Los Angeles     117     694
## 7  AM WEST       Phoenix     415    4840
## 8  AM WEST     San Diego      65     383
## 9  AM WEST San Francisco     129     320
## 10 AM WEST       Seattle      61     201
#Now, add some percentages for On-Time and Delayed
flights2 <- flights2 %>% 
  mutate(total = delayed + `on time`,
         OnTime = percent(`on time`/total*100, accuracy = .01, scale=1),
         Delayed = percent(delayed/total*100, accuracy = .01, scale=1))
#What is the percentage for on-time versus delayed
flights2 %>% 
  kable() %>%
  kable_styling()
Airline City delayed on time total OnTime Delayed
ALASKA Los Angeles 62 497 559 88.91% 11.09%
ALASKA Phoenix 12 221 233 94.85% 5.15%
ALASKA San Diego 20 212 232 91.38% 8.62%
ALASKA San Francisco 102 503 605 83.14% 16.86%
ALASKA Seattle 305 1841 2146 85.79% 14.21%
AM WEST Los Angeles 117 694 811 85.57% 14.43%
AM WEST Phoenix 415 4840 5255 92.10% 7.90%
AM WEST San Diego 65 383 448 85.49% 14.51%
AM WEST San Francisco 129 320 449 71.27% 28.73%
AM WEST Seattle 61 201 262 76.72% 23.28%
#Reference: http://www.sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization
#What is the total flights for each airline?
ggplot(flights2,
       aes(x=flights2$Airline,
           y=flights2$total
           ))+
    geom_bar(stat="identity")

#What is the total flights for each citry?
ggplot(flights2,
       aes(x=flights2$City,
           y=flights2$total, fill = flights2$Airline
           ))+
    geom_bar(stat="identity", position="dodge")

#What is the percentage delay for all cities per airline?
ggplot(flights2,
       aes(x=flights2$City,
           y=flights2$Delayed, fill = flights2$Airline
           ))+
    geom_bar(stat="identity", position="dodge")

#5. Final Analysis

#What is the best On-time percentage rate?

flights3 <- flights2 %>% filter(flights2$OnTime == max(flights2$OnTime))
flights3%>% 
  kable() %>%
  kable_styling()
Airline City delayed on time total OnTime Delayed
ALASKA Phoenix 12 221 233 94.85% 5.15%
#What is the best On-time arrival rate?

flights4 <- flights2 %>% 
  group_by(Airline) %>% 
  summarise(Delayed_Flights = sum(delayed), Total_Flights = sum(total)) %>% 
  mutate(Delayed = (Delayed_Flights / Total_Flights)*100)

flights4%>% 
  kable() %>%
  kable_styling()
Airline Delayed_Flights Total_Flights Delayed
ALASKA 501 3775 13.27152
AM WEST 787 7225 10.89273
#What are the ariline delays as percentage?
ggplot(flights4,
       aes(x=flights4$Airline,
           y=flights4$Delayed, fill = flights4$Airline
           ))+
    geom_bar(stat="identity", position="dodge")

#What are the airline total flights?
ggplot(flights4,
       aes(x=flights4$Airline,
           y=flights4$Total_Flights, fill = flights4$Airline
           ))+
    geom_bar(stat="identity", position="dodge")

#What are the airline total delayed flights?
ggplot(flights4,
       aes(x=flights4$Airline,
           y=flights4$Delayed_Flights, fill = flights4$Airline
           ))+
    geom_bar(stat="identity", position="dodge")

6. Conclusions

As indicated by the above graphs, AM West had many more flights than Alaska. The best arrival time was by Alaska to Phoneix where their on-time is a 94.85%. Alaska’s overall delay is at 13.27% compared with 10.89% with AM West. Lastly, for each city, AM West has a larger percentage delay than Alaska.