Load data set (CSV) into R and load required libraries

arrivals_raw <- data.frame(read.csv("https://raw.githubusercontent.com/humbertohpgit/MSDS1stSem/master/Arrival_Delays.csv", header = TRUE, stringsAsFactors=FALSE))
arrivals_raw
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503  1,841 
## 2         delayed          62      12        20           102     305
## 3                          NA                NA            NA        
## 4 AM WEST on time         694  4,840        383           320     201
## 5         delayed         117     415        65           129      61
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)

Data Wrangling

Pivoting columns to rows and Renaming columns

arrivals1 <- gather(arrivals_raw,"City", "Arrivals", 3:7)
names(arrivals1) <- c("Airline", "Type", "City", "Arrivals")

Filtering Nulls/NAs

arrivals2 <- filter(arrivals1, !is.na(Arrivals), Arrivals != "")

Removing punctuation/spaces and Recasting data types

arrivals2$Arrivals <- str_remove(arrivals2$Arrivals, ",")
arrivals2$Arrivals <- str_trim(arrivals2$Arrivals)
arrivals2$Arrivals <- as.numeric(arrivals2$Arrivals)
arrivals2$City <- str_replace_all(arrivals2$City, pattern = "\\.", replacement = " ")

Fill in missing values

arrivals2[arrivals2$Airline == "",1] <- NA
arrivals3 <- arrivals2 %>% fill(Airline)

Final tidy data set

tbl_df(arrivals3)
## # A tibble: 20 x 4
##    Airline Type    City          Arrivals
##    <chr>   <chr>   <chr>            <dbl>
##  1 ALASKA  on time Los Angeles        497
##  2 ALASKA  delayed Los Angeles         62
##  3 AM WEST on time Los Angeles        694
##  4 AM WEST delayed Los Angeles        117
##  5 ALASKA  on time Phoenix            221
##  6 ALASKA  delayed Phoenix             12
##  7 AM WEST on time Phoenix           4840
##  8 AM WEST delayed Phoenix            415
##  9 ALASKA  on time San Diego          212
## 10 ALASKA  delayed San Diego           20
## 11 AM WEST on time San Diego          383
## 12 AM WEST delayed San Diego           65
## 13 ALASKA  on time San Francisco      503
## 14 ALASKA  delayed San Francisco      102
## 15 AM WEST on time San Francisco      320
## 16 AM WEST delayed San Francisco      129
## 17 ALASKA  on time Seattle           1841
## 18 ALASKA  delayed Seattle            305
## 19 AM WEST on time Seattle            201
## 20 AM WEST delayed Seattle             61

Analysis comparing arrival delays for the two airlines

library(ggplot2)
arrivals_chart <- summarise(group_by(arrivals3, Airline, Type), Arrivals = sum(Arrivals))
arrivals_chart
## # A tibble: 4 x 3
## # Groups:   Airline [?]
##   Airline Type    Arrivals
##   <chr>   <chr>      <dbl>
## 1 ALASKA  delayed      501
## 2 ALASKA  on time     3274
## 3 AM WEST delayed      787
## 4 AM WEST on time     6438
ggplot(arrivals_chart, aes(x=Type, y=Arrivals, fill = Airline)) + geom_col(position = "dodge") + labs(y = "Arrivals", x = "Type", title = "Arrival Delays") + theme_bw()  ##+ stat_identity()

Arrival delays for AM West are greater than Alaska by 286. 787 vs 501 respectively