Step 1 - Importing the libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(stringr)

Step 2 - Importing the dataset

flightsdata <- read.csv("https://raw.githubusercontent.com/DevMeh/Assignment5/master/assign5.csv", sep = ",", header = TRUE)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'https://
## raw.githubusercontent.com/DevMeh/Assignment5/master/assign5.csv'
head(flightsdata)
##   Airline  Status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Step 3 - Tidying up the data

# Seperate the data by creating a city column and splitting on status
tidyflights <- rename(flightsdata, Airline = Airline)
tidyflights <- tidyflights %>% 
  gather(city, number, 3:length(flightsdata)) %>%  
  spread(Status, number)                   

# Remove underscore from city name
tidyflights$city <- str_replace_all(tidyflights$city, "\\_", " ")

# Renaming on time as on_time 
tidyflights <- rename(tidyflights, on_time = `on time`)
#View the dataframe
head(tidyflights)
##   Airline          city delayed on_time
## 1  ALASKA   Los Angeles      62     497
## 2  ALASKA       Phoenix      12     221
## 3  ALASKA     San Diego      20     212
## 4  ALASKA San Francisco     102     503
## 5  ALASKA       Seattle     305    1841
## 6 AM WEST   Los Angeles     117     694

Step 4 - Creating calcuated columns

tidyflights <- tidyflights %>% 
  mutate(total_flights=on_time+ delayed, pct_delay = round(100*(delayed/total_flights),2),pct_ontime=round(100*(on_time/total_flights),2)) %>% 
  arrange(desc(total_flights))
tidyflights
##    Airline          city delayed on_time total_flights pct_delay
## 1  AM WEST       Phoenix     415    4840          5255      7.90
## 2   ALASKA       Seattle     305    1841          2146     14.21
## 3  AM WEST   Los Angeles     117     694           811     14.43
## 4   ALASKA San Francisco     102     503           605     16.86
## 5   ALASKA   Los Angeles      62     497           559     11.09
## 6  AM WEST San Francisco     129     320           449     28.73
## 7  AM WEST     San Diego      65     383           448     14.51
## 8  AM WEST       Seattle      61     201           262     23.28
## 9   ALASKA       Phoenix      12     221           233      5.15
## 10  ALASKA     San Diego      20     212           232      8.62
##    pct_ontime
## 1       92.10
## 2       85.79
## 3       85.57
## 4       83.14
## 5       88.91
## 6       71.27
## 7       85.49
## 8       76.72
## 9       94.85
## 10      91.38

Step 5 - Analysis

delaybycity <- tidyflights %>% 
  group_by(city) %>% 
  summarise(delay_pct = mean(pct_delay)) %>% 
  arrange(desc(delay_pct))
delaybycity
## # A tibble: 5 x 2
##   city          delay_pct
##   <chr>             <dbl>
## 1 San Francisco     22.8 
## 2 Seattle           18.7 
## 3 Los Angeles       12.8 
## 4 San Diego         11.6 
## 5 Phoenix            6.52
ontimebyairline <- tidyflights %>% 
  group_by(Airline) %>% 
  summarise(ontime_pct = mean(pct_ontime)) %>% 
  arrange(desc(ontime_pct))
ontimebyairline
## # A tibble: 2 x 2
##   Airline ontime_pct
##   <fct>        <dbl>
## 1 ALASKA        88.8
## 2 AM WEST       82.2
delaybyairline <- tidyflights %>% 
  group_by(city,Airline) %>% 
  summarise(delay_pct = mean(pct_delay)) %>% 
  arrange(city,delay_pct)
delaybyairline
## # A tibble: 10 x 3
## # Groups:   city [5]
##    city          Airline delay_pct
##    <chr>         <fct>       <dbl>
##  1 Los Angeles   ALASKA      11.1 
##  2 Los Angeles   AM WEST     14.4 
##  3 Phoenix       ALASKA       5.15
##  4 Phoenix       AM WEST      7.9 
##  5 San Diego     ALASKA       8.62
##  6 San Diego     AM WEST     14.5 
##  7 San Francisco ALASKA      16.9 
##  8 San Francisco AM WEST     28.7 
##  9 Seattle       ALASKA      14.2 
## 10 Seattle       AM WEST     23.3

Step 6 - Visualization

ggplot(data=tidyflights, aes(x=city, y=pct_delay,fill=Airline))+
  geom_bar(stat="identity", position=position_dodge(), colour="black") +
    geom_text(aes(label=pct_delay), vjust=.5, hjust=1.25,position= position_dodge(width=0.9),  color="black") +
        ggtitle("Flights Delayed by Airline by City") +
    xlab("City") + ylab("% of Flights Delayed") +coord_flip()