library(RCurl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(stringr)
library(ggplot2)
# Download the raw airline delay table from GitHub as a single string, then
# parse it as semicolon-separated text. Empty strings and "NA" are both read
# as missing values.
x <- getURL(
  "https://raw.githubusercontent.com/Raji030/607assignment04_data/main/data607_airlines_delayed_data.csv"
)
# read.csv() already returns a data.frame, so no extra data.frame() wrapper is
# needed; TRUE is spelled out because T is an ordinary, reassignable variable.
flight_data <- read.csv(
  text = x,
  header = TRUE,
  sep = ";",
  na.strings = c("", "NA")
)
# Print the raw table for inspection.
flight_data
## X X.1 X.Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
# Tidy the raw wide table into one row per airline/destination pair, with the
# delayed and on-time counts side by side plus two derived columns.

# Renaming columns
colnames(flight_data) <- c(
  "airlines", "status", "los angeles", "phoenix", "san diego",
  "san francisco", "seattle"
)

flight_data <- flight_data %>%
  # Deleting empty row: the blank separator row has no status, so it is
  # dropped by content rather than by a hard-coded row position.
  filter(!is.na(status)) %>%
  # Replacing NA values in the airlines column with the value prior to it
  fill(airlines) %>%
  # Stack every destination column (everything except airlines/status) into
  # destination / flight_count pairs ...
  pivot_longer(
    cols = -c(airlines, status),
    names_to = "destination",
    values_to = "flight_count"
  ) %>%
  # ... then spread the two status values into their own count columns.
  pivot_wider(names_from = status, values_from = flight_count) %>%
  # Renaming on time column
  rename(ontime = `on time`) %>%
  # Adding two new columns
  mutate(
    total = delayed + ontime,
    proportion_delayed = delayed / total
  ) %>%
  # Keep the column order and row order that the earlier gather()/spread()
  # version produced, since later code indexes columns by position.
  select(airlines, destination, delayed, ontime, total, proportion_delayed) %>%
  arrange(airlines, destination) %>%
  # Keep a plain data.frame, as before.
  as.data.frame()
## Add a new column `total` and compute the proportion of delayed flights in another column, `proportion_delayed`.
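# NOTE: the following chunk was emitted as text by a malformed code fence. It
# repeats the rename/mutate steps already applied above, so it is kept only as
# a commented-out record (re-running rename() would fail because the
# "on time" column no longer exists).
# flight_data <- rename(flight_data, c("ontime" = "on time"))
# flight_data
# flight_data <- mutate(flight_data, total = delayed + ontime, proportion_delayed = delayed / total)
# flight_data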
# Fig-1: side-by-side bars of the delayed proportion at each destination,
# one bar per airline.
flight_data %>%
  ggplot(
    aes(x = destination, y = proportion_delayed, fill = airlines)
  ) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    x = "destination",
    y = "proportion delayed",
    title = "Fig-1"
  )
# Per-airline totals of delayed and on-time flights, summed over destinations.
# Columns are selected by name instead of by position, and across() replaces
# the deprecated summarize_each()/funs() combination.
flight1 <- select(flight_data, airlines, delayed, ontime)
flight2 <- flight1 %>%
  group_by(airlines) %>%
  summarise(across(c(delayed, ontime), sum))
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Airline-by-status matrix of total flight counts: one row per airline, one
# column per status.
performance_table <- cbind(
  delayed = flight2$delayed,
  ontime = flight2$ontime
)
# Row labels are hard-coded; this assumes flight2 has exactly two rows in the
# order ALASKA, AM WEST - TODO confirm against flight2$airlines.
rownames(performance_table) <- c("Alaska", "Am WEST")
# Percentages within each status column (margin = 2), rounded to two decimals.
# NOTE(review): with margin = 2 each figure is an airline's share of all
# delayed (or all on-time) flights, not that airline's own delay rate;
# margin = 1 would give per-airline rates - confirm which is intended.
round(prop.table(performance_table, margin = 2) * 100, digits = 2)
## delayed ontime
## Alaska 38.9 33.71
## Am WEST 61.1 66.29
# Grouped bar chart of the column-wise percentages from performance_table:
# one group per status, one bar per airline.
status_share <- prop.table(performance_table, margin = 2) * 100
# The same colour vector feeds both the bars and the legend so they cannot
# drift apart.
airline_colours <- c("blue", "red")
barplot(
  status_share,
  beside = TRUE,
  ylab = "%",
  ylim = c(0, 100),
  main = "Flight Performance",
  col = airline_colours
)
legend(
  "center",
  legend = c("ALASKA", "AM WEST"),
  fill = airline_colours,
  cex = 0.85
)
From Fig-1, it is seen that Am West's arrival delays occur more frequently than Alaska's at every destination airport. It is also seen that both airlines have their highest arrival delay rate at San Francisco airport and their lowest arrival delay rate at Phoenix airport.
From the performance table and the flight performance plot, it is seen that Alaska's performance is better than Am West's. Am West's arrival delay is about 22% more than Alaska's arrival delay. Also, Alaska is about 32% more likely to be on time than Am West. Therefore, Alaska is the better option to choose for travel here.