Load libraries

library(RCurl)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:RCurl':
## 
##     complete

library(stringr)
library(ggplot2)

Getting the flight data from GitHub

# Download the raw CSV text from GitHub and parse it into a data frame.
# The file is semicolon-delimited; empty cells and "NA" are read as NA.
csv_url <- paste0(
  "https://raw.githubusercontent.com/Raji030/607assignment04_data/",
  "main/data607_airlines_delayed_data.csv"
)
x <- getURL(csv_url)
# read.csv() already returns a data.frame, so no extra data.frame() wrapper
# is needed; TRUE is spelled out because T can be reassigned.
flight_data <- read.csv(
  text = x, header = TRUE, sep = ";", na.strings = c("", "NA")
)
flight_data

##         X     X.1 X.Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time           497     221       212           503    1841
## 2    <NA> delayed            62      12        20           102     305
## 3    <NA>    <NA>            NA      NA        NA            NA      NA
## 4 AM WEST on time           694    4840       383           320     201
## 5    <NA> delayed           117     415        65           129      61

Data cleaning

# Renaming columns
colnames(flight_data) <- c(
  "airlines", "status", "los angeles", "phoenix",
  "san diego", "san francisco", "seattle"
)

# Deleting the empty row: drop every row whose cells are all NA instead of
# relying on a hard-coded row position
flight_data <- flight_data[rowSums(!is.na(flight_data)) > 0, ]

# Replacing NA values in the airlines column with the value prior to it
flight_data <- fill(flight_data, airlines)

Reshaping clean data

# Reshape to one row per airline/destination pair with separate count
# columns per status, then derive the totals and the delay proportion.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
flight_data <- flight_data %>%
  # Stack the five destination columns into destination/flight_count pairs
  pivot_longer(
    cols = -c(airlines, status),
    names_to = "destination",
    values_to = "flight_count"
  ) %>%
  # One column per status; names_sort keeps "delayed" before "on time",
  # the same column order spread() produced
  pivot_wider(
    names_from = status,
    values_from = flight_count,
    names_sort = TRUE
  ) %>%
  # Renaming on time column
  rename(ontime = `on time`) %>%
  # Adding two new columns
  mutate(
    total = delayed + ontime,
    proportion_delayed = delayed / total
  ) %>%
  # Keep a plain data.frame, as the gather()/spread() version returned
  as.data.frame()

Add a new column `total` and compute the proportion of delayed flights in another column, `proportion_delayed`.

    flight_data <- rename(flight_data, c("ontime" = "on time"))
    flight_data
    flight_data <- mutate(flight_data, total = delayed + ontime, proportion_delayed = delayed / total)
    flight_data

Arrival delays analysis:

Graphical representation of each airline’s arrival delays by destination airport

# Fig-1: side-by-side bars of the delayed proportion for each destination,
# one bar per airline
ggplot(
  flight_data,
  aes(x = destination, y = proportion_delayed, fill = airlines)
) +
  geom_col(position = "dodge") +
  labs(x = "destination", y = "proportion delayed", title = "Fig-1")

Creating performance table where each cell value shows as a proportion of the column sum

# Keep only the airline name and the two count columns, selected by name so
# the step does not depend on column positions
flight1 <- flight_data[, c("airlines", "delayed", "ontime")]

# Total delayed and on-time flights per airline; across() replaces the
# deprecated summarize_each(funs(sum))
flight2 <- flight1 %>%
  group_by(airlines) %>%
  summarise(across(everything(), sum))

# 2 x 2 matrix of per-airline totals: rows are airlines, columns are the
# delayed and on-time counts. Dimension names are set in the matrix() call.
performance_table <- matrix(
  c(flight2$delayed, flight2$ontime),
  nrow = 2, ncol = 2, byrow = FALSE,
  dimnames = list(c("Alaska", "Am WEST"), c("delayed", "ontime"))
)

# Each cell as a percentage of its column sum, rounded to two decimals
round(prop.table(performance_table, margin = 2) * 100, digits = 2)

##         delayed ontime
## Alaska     38.9  33.71
## Am WEST    61.1  66.29

Graphical representation of airlines performance

# Grouped bar chart of each airline's percentage share of the delayed and
# on-time column totals
barplot(
  prop.table(performance_table, margin = 2) * 100,
  beside = TRUE, ylab = "%", ylim = c(0, 100),
  main = "Flight Performance", col = c("blue", "red")
)
legend(
  "center",
  legend = c("ALASKA", "AM WEST"), fill = c("blue", "red"), cex = 0.85
)

Conclusion

Fig-1 shows that, at every destination airport, Am West’s arrivals are delayed more often than Alaska’s. It also shows that both airlines have their highest arrival delay rate at San Francisco and their lowest at Phoenix.

The performance table and the flight performance plot show that Am West accounts for a larger share of the delayed flights than Alaska: about 61% versus 39%, a gap of about 22 percentage points. The two airlines’ shares of the on-time flights differ by about 32 percentage points (about 66% for Am West versus 34% for Alaska). Because these figures are shares of each column’s total, they also reflect how many flights each airline operates. Taken together with the per-airport delay rates in Fig-1, Alaska is the better option to choose for travel.

data607_assignment04

Mahmud Hasan Al Raji

2022-10-05