Load data set (CSV) into R and load required libraries
arrivals_raw <- data.frame(read.csv("https://raw.githubusercontent.com/humbertohpgit/MSDS1stSem/master/Arrival_Delays.csv", header = TRUE, stringsAsFactors=FALSE))
arrivals_raw
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA
## 4 AM WEST on time 694 4,840 383 320 201
## 5 delayed 117 415 65 129 61
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
Data Wrangling
Pivoting columns to rows and Renaming columns
arrivals1 <- gather(arrivals_raw,"City", "Arrivals", 3:7)
names(arrivals1) <- c("Airline", "Type", "City", "Arrivals")
Filtering Nulls/NAs
arrivals2 <- filter(arrivals1, !is.na(Arrivals), Arrivals != "")
Removing punctuation/spaces and Recasting data types
arrivals2$Arrivals <- str_remove(arrivals2$Arrivals, ",")
arrivals2$Arrivals <- str_trim(arrivals2$Arrivals)
arrivals2$Arrivals <- as.numeric(arrivals2$Arrivals)
arrivals2$City <- str_replace_all(arrivals2$City, pattern = "\\.", replacement = " ")
Fill in missing values
arrivals2[arrivals2$Airline == "",1] <- NA
arrivals3 <- arrivals2 %>% fill(Airline)
Final tidy data set
tbl_df(arrivals3)
## # A tibble: 20 x 4
## Airline Type City Arrivals
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San Diego 212
## 10 ALASKA delayed San Diego 20
## 11 AM WEST on time San Diego 383
## 12 AM WEST delayed San Diego 65
## 13 ALASKA on time San Francisco 503
## 14 ALASKA delayed San Francisco 102
## 15 AM WEST on time San Francisco 320
## 16 AM WEST delayed San Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
Analysis comparing arrival delays for the two airlines
library(ggplot2)
arrivals_chart <- summarise(group_by(arrivals3, Airline, Type), Arrivals = sum(Arrivals))
arrivals_chart
## # A tibble: 4 x 3
## # Groups: Airline [?]
## Airline Type Arrivals
## <chr> <chr> <dbl>
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
ggplot(arrivals_chart, aes(x=Type, y=Arrivals, fill = Airline)) + geom_col(position = "dodge") + labs(y = "Arrivals", x = "Type", title = "Arrival Delays") + theme_bw() ##+ stat_identity()

Arrival delays for AM West are greater than Alaska by 286. 787 vs 501 respectively