Data gathering, tidying and transformation

rm(list = ls())
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(tidyselect)

Loading Data into R

data<- read.csv("https://raw.githubusercontent.com/Meccamarshall/Data607/main/flight%20status%20data%20607.csv", sep = ",", header = TRUE)
data
##   Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  Alaska      on time         497     221       212          503    1841
## 2  Alaska      delayed          62      12        20          102     305
## 3                               NA      NA        NA           NA      NA
## 4 AM West      on time         694    4840       383          320     201
## 5 AM West      delayed         117     415        65          129      61

Removing the NA’s

Since there is a row with NA, I want remove that from my data set.

data<-drop_na(data,)
data
##   Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  Alaska      on time         497     221       212          503    1841
## 2  Alaska      delayed          62      12        20          102     305
## 3 AM West      on time         694    4840       383          320     201
## 4 AM West      delayed         117     415        65          129      61
data$Airline[data$Airline==""]<-c("Alaska","AM West")
data
##   Airline Delay.Status Los.Angeles Phoenix San.Diego San.Franciso Seattle
## 1  Alaska      on time         497     221       212          503    1841
## 2  Alaska      delayed          62      12        20          102     305
## 3 AM West      on time         694    4840       383          320     201
## 4 AM West      delayed         117     415        65          129      61

Transform data from wide format to long format

Since I need to compare the delayed time, I reformatted this to the long way that way it would be easier for me to view the dataset

data1<- data %>% gather(key = Destination, value= Count, c(3:7))
data1
##    Airline Delay.Status  Destination Count
## 1   Alaska      on time  Los.Angeles   497
## 2   Alaska      delayed  Los.Angeles    62
## 3  AM West      on time  Los.Angeles   694
## 4  AM West      delayed  Los.Angeles   117
## 5   Alaska      on time      Phoenix   221
## 6   Alaska      delayed      Phoenix    12
## 7  AM West      on time      Phoenix  4840
## 8  AM West      delayed      Phoenix   415
## 9   Alaska      on time    San.Diego   212
## 10  Alaska      delayed    San.Diego    20
## 11 AM West      on time    San.Diego   383
## 12 AM West      delayed    San.Diego    65
## 13  Alaska      on time San.Franciso   503
## 14  Alaska      delayed San.Franciso   102
## 15 AM West      on time San.Franciso   320
## 16 AM West      delayed San.Franciso   129
## 17  Alaska      on time      Seattle  1841
## 18  Alaska      delayed      Seattle   305
## 19 AM West      on time      Seattle   201
## 20 AM West      delayed      Seattle    61

Filtering by delays

Since I am only interested in viewing the delays, I am hiding the on time data from the data frame.

data2<-filter(data1, Delay.Status=="delayed")
data2
##    Airline Delay.Status  Destination Count
## 1   Alaska      delayed  Los.Angeles    62
## 2  AM West      delayed  Los.Angeles   117
## 3   Alaska      delayed      Phoenix    12
## 4  AM West      delayed      Phoenix   415
## 5   Alaska      delayed    San.Diego    20
## 6  AM West      delayed    San.Diego    65
## 7   Alaska      delayed San.Franciso   102
## 8  AM West      delayed San.Franciso   129
## 9   Alaska      delayed      Seattle   305
## 10 AM West      delayed      Seattle    61

Comparisons

data2$Count = as.numeric(as.character(data2$Count))
data2
##    Airline Delay.Status  Destination Count
## 1   Alaska      delayed  Los.Angeles    62
## 2  AM West      delayed  Los.Angeles   117
## 3   Alaska      delayed      Phoenix    12
## 4  AM West      delayed      Phoenix   415
## 5   Alaska      delayed    San.Diego    20
## 6  AM West      delayed    San.Diego    65
## 7   Alaska      delayed San.Franciso   102
## 8  AM West      delayed San.Franciso   129
## 9   Alaska      delayed      Seattle   305
## 10 AM West      delayed      Seattle    61
summary(data2$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   61.25   83.50  128.80  126.00  415.00
summarise(data2, mean=mean(Count), sd=sd(Count))
##    mean       sd
## 1 128.8 130.1382

From this given data set, it can be seen that that there is an average of 129 flight delays with a standard deviation of approximately 130.

barplot(data2$Count)

Viewing Alaska delays

library(ggplot2)
Alaska_delays<-filter(data2, Airline=="Alaska")
Alaska_delays
##   Airline Delay.Status  Destination Count
## 1  Alaska      delayed  Los.Angeles    62
## 2  Alaska      delayed      Phoenix    12
## 3  Alaska      delayed    San.Diego    20
## 4  Alaska      delayed San.Franciso   102
## 5  Alaska      delayed      Seattle   305
ggplot(Alaska_delays,aes(Destination,Count)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity", col = "red") + 
  ylab("Count")

Viewing AM West delays

library(ggplot2)
AM_West_delays<-filter(data2, Airline=="AM West")
AM_West_delays
##   Airline Delay.Status  Destination Count
## 1 AM West      delayed  Los.Angeles   117
## 2 AM West      delayed      Phoenix   415
## 3 AM West      delayed    San.Diego    65
## 4 AM West      delayed San.Franciso   129
## 5 AM West      delayed      Seattle    61
ggplot(AM_West_delays,aes(Destination,Count)) +
geom_bar(aes(fill = Airline), position = "dodge", stat = "identity", col = "red") + 
  ylab("Count")

##Conclusion

After carefully analysizing the data in the dataset used in this assignment, Alaska airlines seems to be a more reliable airline with minimal flight delays, compared to AM West airlines.