Loading appropriate libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stringr)
Fetching data from a .csv file
#Read the .csv file as it is while replacing the word "NA" empty cells
two_airlines<-read.csv("2airlines.csv", stringsAsFactors = FALSE, header=TRUE,fileEncoding = "UTF-8-BOM",sep=",",na.strings = c("","NA"))
two_airlines
## X X.1 Los..Angeles Phoenix San..Diego San..Francisco. Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA <NA> NA NA <NA>
## 4 AM WEST on time 694 4,840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
Tidying and transforming data for better visualization
#Rearrange the dataframe removing "NA" fields, changing rows to columns using gather
two_airlines2 <- two_airlines %>%
mutate(Airline=na.locf(X)) %>%
mutate(Status=na.locf(X.1)) %>%
select(-X) %>% select(-X.1) %>%
gather(Destination, Flight, Los..Angeles:Seattle) %>%
drop_na()
two_airlines2
## Airline Status Destination Flight
## 1 ALASKA on time Los..Angeles 497
## 2 ALASKA delayed Los..Angeles 62
## 4 AM WEST on time Los..Angeles 694
## 5 AM WEST delayed Los..Angeles 117
## 6 ALASKA on time Phoenix 221
## 7 ALASKA delayed Phoenix 12
## 9 AM WEST on time Phoenix 4,840
## 10 AM WEST delayed Phoenix 415
## 11 ALASKA on time San..Diego 212
## 12 ALASKA delayed San..Diego 20
## 14 AM WEST on time San..Diego 383
## 15 AM WEST delayed San..Diego 65
## 16 ALASKA on time San..Francisco. 503
## 17 ALASKA delayed San..Francisco. 102
## 19 AM WEST on time San..Francisco. 320
## 20 AM WEST delayed San..Francisco. 129
## 21 ALASKA on time Seattle 1,841
## 22 ALASKA delayed Seattle 305
## 24 AM WEST on time Seattle 201
## 25 AM WEST delayed Seattle 61
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#Spread Status column into two separate columns as delayed and ontime
colnames(two_airlines2)[2] <- "Status"
colnames(two_airlines2)[4] <- "Flight"
two_airlines.alter <- spread(two_airlines2, "Status", "Flight" )
kable(two_airlines.alter, align = 'c')
Airline
|
Destination
|
delayed
|
on time
|
ALASKA
|
Los..Angeles
|
62
|
497
|
ALASKA
|
Phoenix
|
12
|
221
|
ALASKA
|
San..Diego
|
20
|
212
|
ALASKA
|
San..Francisco.
|
102
|
503
|
ALASKA
|
Seattle
|
305
|
1,841
|
AM WEST
|
Los..Angeles
|
117
|
694
|
AM WEST
|
Phoenix
|
415
|
4,840
|
AM WEST
|
San..Diego
|
65
|
383
|
AM WEST
|
San..Francisco.
|
129
|
320
|
AM WEST
|
Seattle
|
61
|
201
|
#remove characters such as , inside the numbers
two_airlines.alter$`on time` <- str_remove(two_airlines.alter$`on time`, ",")
#Convert character type to numeric
as.numeric(two_airlines.alter$`on time`)
## [1] 497 221 212 503 1841 694 4840 383 320 201
#Convert character type to numeric
as.numeric(two_airlines.alter$delayed)
## [1] 62 12 20 102 305 117 415 65 129 61
#Basic Summary view of the data
two_airlines.alter$delayed = as.numeric(two_airlines.alter$delayed)
two_airlines.alter$`on time` = as.numeric(two_airlines.alter$`on time`)
summary(two_airlines.alter)
## Airline Destination delayed on time
## Length:10 Length:10 Min. : 12.00 Min. : 201.0
## Class :character Class :character 1st Qu.: 61.25 1st Qu.: 245.8
## Mode :character Mode :character Median : 83.50 Median : 440.0
## Mean :128.80 Mean : 971.2
## 3rd Qu.:126.00 3rd Qu.: 646.2
## Max. :415.00 Max. :4840.0
two_airlines.alter %>%
group_by(Destination) %>%
summarise(SD_OnTime = round(sd(`on time`),2), SD_Delayed = round(sd(delayed),2))
## # A tibble: 5 x 3
## Destination SD_OnTime SD_Delayed
## <chr> <dbl> <dbl>
## 1 Los..Angeles 139. 38.9
## 2 Phoenix 3266. 285.
## 3 San..Diego 121. 31.8
## 4 San..Francisco. 129. 19.1
## 5 Seattle 1160. 173.
#Analyse data separately for the airline ALASKA
flight_status_ALASKA <- two_airlines.alter%>%
filter(Airline == "ALASKA")
flight_status_ALASKA
## Airline Destination delayed on time
## 1 ALASKA Los..Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San..Diego 20 212
## 4 ALASKA San..Francisco. 102 503
## 5 ALASKA Seattle 305 1841
#Analyse data separately for the airline ALASKA and calculate its total arrival and delayed percentage
flight_status_ALASKA <- flight_status_ALASKA %>%
mutate(
Arrival_Total = flight_status_ALASKA$delayed + flight_status_ALASKA$`on time`,
Delay_perc = (flight_status_ALASKA$delayed/Arrival_Total)*100)
flight_status_ALASKA
## Airline Destination delayed on time Arrival_Total Delay_perc
## 1 ALASKA Los..Angeles 62 497 559 11.091234
## 2 ALASKA Phoenix 12 221 233 5.150215
## 3 ALASKA San..Diego 20 212 232 8.620690
## 4 ALASKA San..Francisco. 102 503 605 16.859504
## 5 ALASKA Seattle 305 1841 2146 14.212488
#Analyse data separately for the airline AMWEST
flight_status_AMWEST <- two_airlines.alter %>%
filter(Airline == "AM WEST")
flight_status_AMWEST
## Airline Destination delayed on time
## 1 AM WEST Los..Angeles 117 694
## 2 AM WEST Phoenix 415 4840
## 3 AM WEST San..Diego 65 383
## 4 AM WEST San..Francisco. 129 320
## 5 AM WEST Seattle 61 201
#Analyse data separately for the airline AMWEST and calculate its Total arrival and delayed percentage
flight_status_AMWEST <- flight_status_AMWEST %>%
mutate(
Arrival_Total = flight_status_AMWEST$delayed + flight_status_AMWEST$`on time`,
Delay_perc = (flight_status_AMWEST$delayed/Arrival_Total)*100)
flight_status_AMWEST
## Airline Destination delayed on time Arrival_Total Delay_perc
## 1 AM WEST Los..Angeles 117 694 811 14.426634
## 2 AM WEST Phoenix 415 4840 5255 7.897241
## 3 AM WEST San..Diego 65 383 448 14.508929
## 4 AM WEST San..Francisco. 129 320 449 28.730512
## 5 AM WEST Seattle 61 201 262 23.282443
#Alter the original table with total arrival time and delay percentage for comparison
two_airlines.alter <- two_airlines.alter %>%
mutate(
Arrival_Total = two_airlines.alter$delayed + two_airlines.alter$`on time`,
Delay_perc = (two_airlines.alter$delayed/Arrival_Total)*100
)
print(two_airlines.alter)
## Airline Destination delayed on time Arrival_Total Delay_perc
## 1 ALASKA Los..Angeles 62 497 559 11.091234
## 2 ALASKA Phoenix 12 221 233 5.150215
## 3 ALASKA San..Diego 20 212 232 8.620690
## 4 ALASKA San..Francisco. 102 503 605 16.859504
## 5 ALASKA Seattle 305 1841 2146 14.212488
## 6 AM WEST Los..Angeles 117 694 811 14.426634
## 7 AM WEST Phoenix 415 4840 5255 7.897241
## 8 AM WEST San..Diego 65 383 448 14.508929
## 9 AM WEST San..Francisco. 129 320 449 28.730512
## 10 AM WEST Seattle 61 201 262 23.282443
Based on the data, AM WEST has a higher delay percentage for all the destinations comapred to ALASKA
#Compare Delay percentage between two airlines using a graph for better visualization
library(ggplot2)
ggplot(two_airlines.alter, aes(factor(Destination), Delay_perc, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge")+
theme_minimal()+
labs(x = "Destination", y = "Delayed %")
