Loading appropriate libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(stringr)

Fetching data from a .csv file

#Read the .csv file as it is while replacing the word "NA" empty cells
two_airlines<-read.csv("2airlines.csv", stringsAsFactors = FALSE, header=TRUE,fileEncoding = "UTF-8-BOM",sep=",",na.strings = c("","NA"))
two_airlines
##         X     X.1 Los..Angeles Phoenix San..Diego San..Francisco. Seattle
## 1  ALASKA on time          497    221         212             503  1,841 
## 2    <NA> delayed           62     12          20             102    305 
## 3    <NA>    <NA>           NA    <NA>         NA              NA    <NA>
## 4 AM WEST on time          694  4,840         383             320    201 
## 5    <NA> delayed          117    415          65             129     61

Tidying and transforming data for better visualization

#Rearrange the dataframe removing "NA" fields, changing rows to columns using gather
two_airlines2 <- two_airlines %>%
  mutate(Airline=na.locf(X)) %>%
  mutate(Status=na.locf(X.1)) %>%
  select(-X) %>% select(-X.1) %>%
  gather(Destination, Flight, Los..Angeles:Seattle) %>% 
  drop_na()
two_airlines2
##    Airline  Status     Destination  Flight
## 1   ALASKA on time    Los..Angeles     497
## 2   ALASKA delayed    Los..Angeles      62
## 4  AM WEST on time    Los..Angeles     694
## 5  AM WEST delayed    Los..Angeles     117
## 6   ALASKA on time         Phoenix    221 
## 7   ALASKA delayed         Phoenix     12 
## 9  AM WEST on time         Phoenix  4,840 
## 10 AM WEST delayed         Phoenix    415 
## 11  ALASKA on time      San..Diego     212
## 12  ALASKA delayed      San..Diego      20
## 14 AM WEST on time      San..Diego     383
## 15 AM WEST delayed      San..Diego      65
## 16  ALASKA on time San..Francisco.     503
## 17  ALASKA delayed San..Francisco.     102
## 19 AM WEST on time San..Francisco.     320
## 20 AM WEST delayed San..Francisco.     129
## 21  ALASKA on time         Seattle  1,841 
## 22  ALASKA delayed         Seattle    305 
## 24 AM WEST on time         Seattle    201 
## 25 AM WEST delayed         Seattle     61
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#Spread Status column into two separate columns as delayed and ontime
colnames(two_airlines2)[2] <- "Status"
colnames(two_airlines2)[4] <- "Flight"
two_airlines.alter <- spread(two_airlines2, "Status", "Flight" )
kable(two_airlines.alter, align = 'c')
Airline Destination delayed on time
ALASKA Los..Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San..Diego 20 212
ALASKA San..Francisco. 102 503
ALASKA Seattle 305 1,841
AM WEST Los..Angeles 117 694
AM WEST Phoenix 415 4,840
AM WEST San..Diego 65 383
AM WEST San..Francisco. 129 320
AM WEST Seattle 61 201
#remove characters such as , inside the numbers
two_airlines.alter$`on time` <- str_remove(two_airlines.alter$`on time`, ",")
#Convert character type to numeric
as.numeric(two_airlines.alter$`on time`)
##  [1]  497  221  212  503 1841  694 4840  383  320  201
#Convert character type to numeric
as.numeric(two_airlines.alter$delayed)
##  [1]  62  12  20 102 305 117 415  65 129  61
#Basic Summary view of the data
two_airlines.alter$delayed = as.numeric(two_airlines.alter$delayed)
two_airlines.alter$`on time` = as.numeric(two_airlines.alter$`on time`)
summary(two_airlines.alter)
##    Airline          Destination           delayed          on time      
##  Length:10          Length:10          Min.   : 12.00   Min.   : 201.0  
##  Class :character   Class :character   1st Qu.: 61.25   1st Qu.: 245.8  
##  Mode  :character   Mode  :character   Median : 83.50   Median : 440.0  
##                                        Mean   :128.80   Mean   : 971.2  
##                                        3rd Qu.:126.00   3rd Qu.: 646.2  
##                                        Max.   :415.00   Max.   :4840.0
two_airlines.alter %>%
  group_by(Destination) %>%
  summarise(SD_OnTime = round(sd(`on time`),2), SD_Delayed = round(sd(delayed),2))
## # A tibble: 5 x 3
##   Destination     SD_OnTime SD_Delayed
##   <chr>               <dbl>      <dbl>
## 1 Los..Angeles         139.       38.9
## 2 Phoenix             3266.      285. 
## 3 San..Diego           121.       31.8
## 4 San..Francisco.      129.       19.1
## 5 Seattle             1160.      173.
#Analyse data separately for the airline ALASKA
flight_status_ALASKA <- two_airlines.alter%>%
  filter(Airline == "ALASKA")
flight_status_ALASKA
##   Airline     Destination delayed on time
## 1  ALASKA    Los..Angeles      62     497
## 2  ALASKA         Phoenix      12     221
## 3  ALASKA      San..Diego      20     212
## 4  ALASKA San..Francisco.     102     503
## 5  ALASKA         Seattle     305    1841
#Analyse data separately for the airline ALASKA and calculate its total arrival and delayed percentage
flight_status_ALASKA <- flight_status_ALASKA %>%
  mutate(
    Arrival_Total = flight_status_ALASKA$delayed + flight_status_ALASKA$`on time`,
    Delay_perc = (flight_status_ALASKA$delayed/Arrival_Total)*100)
flight_status_ALASKA
##   Airline     Destination delayed on time Arrival_Total Delay_perc
## 1  ALASKA    Los..Angeles      62     497           559  11.091234
## 2  ALASKA         Phoenix      12     221           233   5.150215
## 3  ALASKA      San..Diego      20     212           232   8.620690
## 4  ALASKA San..Francisco.     102     503           605  16.859504
## 5  ALASKA         Seattle     305    1841          2146  14.212488
#Analyse data separately for the airline AMWEST
flight_status_AMWEST <- two_airlines.alter %>%
  filter(Airline == "AM WEST")
flight_status_AMWEST
##   Airline     Destination delayed on time
## 1 AM WEST    Los..Angeles     117     694
## 2 AM WEST         Phoenix     415    4840
## 3 AM WEST      San..Diego      65     383
## 4 AM WEST San..Francisco.     129     320
## 5 AM WEST         Seattle      61     201
#Analyse data separately for the airline AMWEST and calculate its Total arrival and delayed percentage
flight_status_AMWEST <- flight_status_AMWEST %>%
  mutate(
    Arrival_Total = flight_status_AMWEST$delayed + flight_status_AMWEST$`on time`,
    Delay_perc = (flight_status_AMWEST$delayed/Arrival_Total)*100)
flight_status_AMWEST
##   Airline     Destination delayed on time Arrival_Total Delay_perc
## 1 AM WEST    Los..Angeles     117     694           811  14.426634
## 2 AM WEST         Phoenix     415    4840          5255   7.897241
## 3 AM WEST      San..Diego      65     383           448  14.508929
## 4 AM WEST San..Francisco.     129     320           449  28.730512
## 5 AM WEST         Seattle      61     201           262  23.282443
#Alter the original table with total arrival time and delay percentage for comparison
two_airlines.alter <- two_airlines.alter %>%
  mutate(
    Arrival_Total = two_airlines.alter$delayed + two_airlines.alter$`on time`,
    Delay_perc = (two_airlines.alter$delayed/Arrival_Total)*100
  )
print(two_airlines.alter)
##    Airline     Destination delayed on time Arrival_Total Delay_perc
## 1   ALASKA    Los..Angeles      62     497           559  11.091234
## 2   ALASKA         Phoenix      12     221           233   5.150215
## 3   ALASKA      San..Diego      20     212           232   8.620690
## 4   ALASKA San..Francisco.     102     503           605  16.859504
## 5   ALASKA         Seattle     305    1841          2146  14.212488
## 6  AM WEST    Los..Angeles     117     694           811  14.426634
## 7  AM WEST         Phoenix     415    4840          5255   7.897241
## 8  AM WEST      San..Diego      65     383           448  14.508929
## 9  AM WEST San..Francisco.     129     320           449  28.730512
## 10 AM WEST         Seattle      61     201           262  23.282443

Based on the data, AM WEST has a higher delay percentage for all the destinations comapred to ALASKA

#Compare Delay percentage between two airlines using a graph for better visualization
library(ggplot2)
ggplot(two_airlines.alter, aes(factor(Destination), Delay_perc, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")+
  theme_minimal()+
  labs(x = "Destination", y = "Delayed %")