## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Import Data

# import csv into R 
flights <- read.csv("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Assignments/Week%206/flights.csv")
head(flights)
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seatlle
## 1 ALASKA on time         497     221       212           503    1841
## 2        delayed          62      12        20           102     305
## 3                         NA      NA        NA            NA      NA
## 4 AMWEST on time         694    4840       383           320     201
## 5        delayed         117     415        65           129      61

Data Cleansing

I noticed that the 3rd row of the dataframe was empty. I also noticed that the 2nd and 5th observation were missing the airline data. So I then proceeded to remove the 3rd row and fill in the missing data.

#remove empty row
flights <- flights[c("1","2","4","5"),]
# Giving columns names
colnames(flights)[1] <- "Airline"
colnames(flights)[2] <- "Status"
# filling in missing data
flights[2,1] <-"ALASKA"
flights[4,1] <-"AMWEST"
head(flights)
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seatlle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 4  AMWEST on time         694    4840       383           320     201
## 5  AMWEST delayed         117     415        65           129      61

Data Wrangling

# transforming the data structure from wide to long
flights <- flights %>%  gather("City","n",3:7) %>% spread(Status,n)
head(flights)
##   Airline          City delayed on time
## 1  ALASKA   Los.Angeles      62     497
## 2  ALASKA       Phoenix      12     221
## 3  ALASKA     San.Diego      20     212
## 4  ALASKA San.Francisco     102     503
## 5  ALASKA       Seatlle     305    1841
## 6  AMWEST   Los.Angeles     117     694

Analysis

i <- flights %>% group_by(Airline) %>% summarise(dmean = mean(as.integer(delayed)))
# barplot of each airline's mean delays.
ggplot(data = i, aes(x=Airline, y= dmean, fill=Airline)) + geom_bar(stat = "identity")

i <- flights %>% group_by(Airline,City) %>% summarise(dmean = mean(as.integer(delayed)))
# barplot of delays grouped by airline and city.
ggplot(data = i, aes(x=Airline, y= dmean,fill = City)) + geom_bar(stat = "identity", position=position_dodge())