library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

# Build the airline arrivals table in its untidy "wide" layout: one row per
# airline/status pair and one column per destination city. The third row is a
# blank spacer; because each count vector mixes numbers with " ", those
# columns are stored as character in `airdata`.
airdata <- data.frame(
  Airline       = c("ALASKA", "ALASKA", " ", "AMWEST", "AMWEST"),
  Status        = c("on time", "delayed", " ", "on time", "delayed"),
  Los_Angeles   = c(497, 62, " ", 694, 117),
  Phoenix       = c(221, 12, " ", 4840, 415),
  San_Diego     = c(212, 20, " ", 383, 65),
  San_Francisco = c(503, 102, " ", 320, 129),
  Seattle       = c(1841, 305, " ", 201, 61)
)

# Round-trip the table through a CSV file. write.csv() also writes the row
# names, which come back as the extra column `X`; the blank count cells are
# read back as NA, so the city columns of `airdf` are numeric.
write.csv(
  airdata,
  file = "C:/Users/ambra/Desktop/Data 607/W5/table.csv"
)

airdf <- read.csv(
  "C:/Users/ambra/Desktop/Data 607/W5/table.csv"
)

airdf
##   X Airline  Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 1  ALASKA on time         497     221       212           503    1841
## 2 2  ALASKA delayed          62      12        20           102     305
## 3 3                          NA      NA        NA            NA      NA
## 4 4  AMWEST on time         694    4840       383           320     201
## 5 5  AMWEST delayed         117     415        65           129      61

# The city column names above are really values of an "Airport" variable.
# Reshape to long form: one row per Airline / Status / Airport, with the
# flight count in `n`.
#   * drop_na() removes the blank spacer row, whose counts were read as NA.
#   * The city columns are selected by name instead of by position (4:8), so
#     the reshape no longer depends on the row-name column `X` being present.
#   * pivot_longer() replaces the superseded gather(). Rows come out ordered
#     by source row rather than by airport; the grouped summaries that
#     follow do not depend on row order.
airdf2 <- airdf %>%
  drop_na() %>%
  pivot_longer(
    cols = Los_Angeles:Seattle,
    names_to = "Airport",
    values_to = "n"
  )


# Overall delay rate per airline.
# Within each airline, every row's share of that airline's total flights is
# computed first (`ratio`). Keeping only the "delayed" rows and summing then
# gives the number of delayed flights and the delayed share of all of that
# airline's flights, rounded to two decimals.
delays <- airdf2 %>%
  group_by(Airline) %>%
  mutate(ratio = n / sum(n)) %>%
  filter(Status == "delayed") %>%
  summarise(
    total_delays = sum(n),
    freq_delays = round(sum(ratio), digits = 2)
  )

# Bar chart: share of delayed flights per airline.
# `freq_delays` is a proportion (0-1), so the y-axis ticks are formatted as
# percentages to agree with the axis title. The value labels are placed just
# inside the top of each bar (vjust = 1.5) instead of straddling the bar edge.
ggplot(delays, aes(x = Airline, y = freq_delays, fill = Airline)) +
  geom_col() +
  geom_text(
    aes(label = paste0(round(freq_delays * 100, 0), "%")),
    vjust = 1.5
  ) +
  scale_y_continuous(labels = function(p) paste0(p * 100, "%")) +
  ggtitle("Percentage of Delayed Flights by Airline") +
  xlab("Airline") +
  ylab("Percentage of Delays")

# Delay rate per airport and airline.
# Within each Airport/Airline pair, `ratio` is each row's share of that
# pair's flights; keeping the "delayed" rows and summing gives the delayed
# count and the delayed share (rounded to two decimals).
# summarise() only removes the last grouping level, so without ungroup() the
# result would stay grouped by Airport and later verbs would silently operate
# per airport.
airportdelays <- airdf2 %>%
  group_by(Airport, Airline) %>%
  mutate(ratio = n / sum(n)) %>%
  filter(Status == "delayed") %>%
  summarise(
    total_delays = sum(n),
    freq_delays = round(sum(ratio), 2)
  ) %>%
  ungroup()

# Grouped bar chart: share of delayed flights per airport, one bar per airline.
# Bars and text labels must use the same dodge width, otherwise the labels
# drift sideways from the bars they describe (the labels previously used
# width = 1 against the bars' default of 0.9). The y-axis ticks are formatted
# as percentages because `freq_delays` is a proportion (0-1), and labels sit
# just inside the top of each bar (vjust = 1.5).
ggplot(airportdelays, aes(x = Airport, y = freq_delays, fill = Airline)) +
  geom_col(position = position_dodge(width = 0.9)) +
  geom_text(
    aes(label = paste0(round(freq_delays * 100, 0), "%")),
    position = position_dodge(width = 0.9),
    vjust = 1.5
  ) +
  scale_y_continuous(labels = function(p) paste0(p * 100, "%")) +
  ggtitle("Percentage of Delayed Flights by Airport and Airline") +
  xlab("Airport") +
  ylab("Percentage of Delayed Flights")