rm(list = ls())
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(tidyselect)
data<- read.csv("https://raw.githubusercontent.com/Meccamarshall/Data607/main/flight%20status%20data%20607.csv", sep = ",", header = TRUE)
data
## Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM West on time 694 4840 383 320 201
## 5 AM West delayed 117 415 65 129 61
Since there is a row with NA, I want remove that from my data set.
data<-drop_na(data,)
data
## Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
data$Airline[data$Airline==""]<-c("Alaska","AM West")
data
## Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 AM West on time 694 4840 383 320 201
## 4 AM West delayed 117 415 65 129 61
Since I need to compare the delayed time, I reformatted this to the long way that way it would be easier for me to view the dataset
data1<- data %>% gather(key = Destination, value= Count, c(3:7))
data1
## Airline Delay.Status Destination Count
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AM West on time Los.Angeles 694
## 4 AM West delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
## 7 AM West on time Phoenix 4840
## 8 AM West delayed Phoenix 415
## 9 Alaska on time San.Diego 212
## 10 Alaska delayed San.Diego 20
## 11 AM West on time San.Diego 383
## 12 AM West delayed San.Diego 65
## 13 Alaska on time San.Franciso 503
## 14 Alaska delayed San.Franciso 102
## 15 AM West on time San.Franciso 320
## 16 AM West delayed San.Franciso 129
## 17 Alaska on time Seattle 1841
## 18 Alaska delayed Seattle 305
## 19 AM West on time Seattle 201
## 20 AM West delayed Seattle 61
Since I am only interested in viewing the delays, I am hiding the on time data from the data frame.
data2<-filter(data1, Delay.Status=="delayed")
data2
## Airline Delay.Status Destination Count
## 1 Alaska delayed Los.Angeles 62
## 2 AM West delayed Los.Angeles 117
## 3 Alaska delayed Phoenix 12
## 4 AM West delayed Phoenix 415
## 5 Alaska delayed San.Diego 20
## 6 AM West delayed San.Diego 65
## 7 Alaska delayed San.Franciso 102
## 8 AM West delayed San.Franciso 129
## 9 Alaska delayed Seattle 305
## 10 AM West delayed Seattle 61
data2$Count = as.numeric(as.character(data2$Count))
data2
## Airline Delay.Status Destination Count
## 1 Alaska delayed Los.Angeles 62
## 2 AM West delayed Los.Angeles 117
## 3 Alaska delayed Phoenix 12
## 4 AM West delayed Phoenix 415
## 5 Alaska delayed San.Diego 20
## 6 AM West delayed San.Diego 65
## 7 Alaska delayed San.Franciso 102
## 8 AM West delayed San.Franciso 129
## 9 Alaska delayed Seattle 305
## 10 AM West delayed Seattle 61
summary(data2$Count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 61.25 83.50 128.80 126.00 415.00
summarise(data2, mean=mean(Count), sd=sd(Count))
## mean sd
## 1 128.8 130.1382
From this given data set, it can be seen that that there is an average of 129 flight delays with a standard deviation of approximately 130.
barplot(data2$Count)
library(ggplot2)
Alaska_delays<-filter(data2, Airline=="Alaska")
Alaska_delays
## Airline Delay.Status Destination Count
## 1 Alaska delayed Los.Angeles 62
## 2 Alaska delayed Phoenix 12
## 3 Alaska delayed San.Diego 20
## 4 Alaska delayed San.Franciso 102
## 5 Alaska delayed Seattle 305
ggplot(Alaska_delays,aes(Destination,Count)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity", col = "red") +
ylab("Count")
library(ggplot2)
AM_West_delays<-filter(data2, Airline=="AM West")
AM_West_delays
## Airline Delay.Status Destination Count
## 1 AM West delayed Los.Angeles 117
## 2 AM West delayed Phoenix 415
## 3 AM West delayed San.Diego 65
## 4 AM West delayed San.Franciso 129
## 5 AM West delayed Seattle 61
ggplot(AM_West_delays,aes(Destination,Count)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity", col = "red") +
ylab("Count")
##Conclusion
After carefully analysizing the data in the dataset used in this assignment, Alaska airlines seems to be a more reliable airline with minimal flight delays, compared to AM West airlines.