Import the CSV file with stringsAsFactors set to FALSE

# Read tidydata.csv (comma-separated) from the working directory, keeping
# text columns as character vectors rather than factors, then preview the
# first rows.
td <- read.csv("tidydata.csv", sep = ",", stringsAsFactors = FALSE)
head(td)

Set up the dplyr and tidyr libraries

##Load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

Remove any rows with NA

# Drop every row of td that contains at least one NA, then display what
# remains.
td1 <- td %>% na.omit()
td1

Update table with any missing Airline names

# Fill every blank airline name in td1$X with the closest non-blank name
# above it, rather than hard-coding one name per row. The hard-coded vector
# only worked for exactly four rows in a fixed order, and data-frame
# assignment would silently recycle it for other multiples of four.
# NOTE(review): assumes a blank X means "same airline as the row above" --
# confirm against tidydata.csv.
td1$X <- local({
  airline <- trimws(td1$X)
  named <- !is.na(airline) & nzchar(airline)
  # The first row has to carry a name; otherwise there is nothing to fill
  # downwards from (this also rejects an empty table).
  stopifnot(length(named) > 0, named[1])
  # cumsum(named) maps each row to the index of the latest named row so far.
  airline[named][cumsum(named)]
})

Gather the city columns (columns 3 to 7) into a single city column and a single value column

# Reshape columns 3 through 7 of td1 from wide to long: their former column
# names go into `city` and their cell contents into `value`.
td2 <- gather(td1, key = city, value = value, 3:7)
td2
Spread the X.1 column into 2 columns, one for the delayed values and one for the on-time values
# Widen td2: each distinct entry of X.1 becomes its own column, populated
# from `value`.
td3 <- td2 %>%
  spread(key = X.1, value = value)
td3

Rename columns with descriptive names

# Give two columns clearer names: X becomes airline, and "on time" becomes
# "on-time".
td3 <- td3 %>%
  rename(airline = X, `on-time` = `on time`)
td3

Analysis: Average delay by airline shows AM WEST had the highest average delay

# Mean of `delayed` for each airline, ordered by airline and then by the
# mean.
avgdlbyal <- td3 %>%
  group_by(airline) %>%
  summarise(delayedmean = mean(delayed)) %>%
  arrange(airline, delayedmean)
avgdlbyal

Analysis: Average delay by airline and city shows Phoenix and Seattle had the highest average delays

# Mean of `delayed` for each airline/city combination, ordered by airline
# and then by the mean.
# NOTE(review): if every airline/city pair occupies a single row of td3,
# this mean simply reproduces `delayed` -- confirm whether grouping by city
# alone was intended.
avgdlbyct <- td3 %>%
  group_by(airline, city) %>%
  summarise(delayedmean = mean(delayed)) %>%
  arrange(airline, delayedmean)
avgdlbyct

The graph confirms that the 2 cities with the highest rate are Seattle and Phoenix, and that AM WEST includes the city with the highest average delays

library(ggplot2)
# Scatter plot of `delayed` against airline; point size follows `delayed`
# and point colour follows `city`.
ggplot(td3, aes(x = airline, y = delayed)) +
  geom_point(aes(size = delayed, color = city))