Data Source: Numbersense, Kaiser Fung, McGraw Hill, 2013
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
1.1 Load CSV file Load CSV file from desktop and validate it using Head.
uD <- read.csv(paste0("C:/Users/Kleber/Documents/MSDS2019/DATA607/Week5/arrivalDelays.csv"), header=T)
kable(uD) %>%
kable_styling() %>%
scroll_box()| AirLine | Status | Los.Angeles | Phoenix | San.Diego | San.Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | onTime | 497 | 221 | 212 | 503 | 1841 |
| ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
| AM WEST | onTime | 694 | 4840 | 383 | 320 | 201 |
| AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
2.1 Tidy Data - Untidy dataset by gathering the destination Column
uD <- uD %>% gather(Destination, n, Los.Angeles:Seattle)
kable(uD) %>%
kable_styling() %>%
scroll_box(height = "195px")| AirLine | Status | Destination | n |
|---|---|---|---|
| ALASKA | onTime | Los.Angeles | 497 |
| ALASKA | delayed | Los.Angeles | 62 |
| AM WEST | onTime | Los.Angeles | 694 |
| AM WEST | delayed | Los.Angeles | 117 |
| ALASKA | onTime | Phoenix | 221 |
| ALASKA | delayed | Phoenix | 12 |
| AM WEST | onTime | Phoenix | 4840 |
| AM WEST | delayed | Phoenix | 415 |
| ALASKA | onTime | San.Diego | 212 |
| ALASKA | delayed | San.Diego | 20 |
| AM WEST | onTime | San.Diego | 383 |
| AM WEST | delayed | San.Diego | 65 |
| ALASKA | onTime | San.Francisco | 503 |
| ALASKA | delayed | San.Francisco | 102 |
| AM WEST | onTime | San.Francisco | 320 |
| AM WEST | delayed | San.Francisco | 129 |
| ALASKA | onTime | Seattle | 1841 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | onTime | Seattle | 201 |
| AM WEST | delayed | Seattle | 61 |
2.2 Tidy Data - Create column names from Categorical Status to make the untidy dataset to tidy dataset variales.
tD <- uD %>% spread(Status, n)
kable(tD) %>%
kable_styling() %>%
scroll_box(height = "195px")| AirLine | Destination | delayed | onTime |
|---|---|---|---|
| ALASKA | Los.Angeles | 62 | 497 |
| ALASKA | Phoenix | 12 | 221 |
| ALASKA | San.Diego | 20 | 212 |
| ALASKA | San.Francisco | 102 | 503 |
| ALASKA | Seattle | 305 | 1841 |
| AM WEST | Los.Angeles | 117 | 694 |
| AM WEST | Phoenix | 415 | 4840 |
| AM WEST | San.Diego | 65 | 383 |
| AM WEST | San.Francisco | 129 | 320 |
| AM WEST | Seattle | 61 | 201 |
3.1 Probability - Add onTimeProbability and DelayedProbability columns to tidy dataset
tD$onTimeProbability <- round((tD$onTime / (tD$delayed + tD$onTime)), digits = 3)
tD$DelayedProbability <- round((tD$delayed / (tD$delayed + tD$onTime)), digits = 3)
kable(tD) %>%
kable_styling() %>%
scroll_box(height = "195px")| AirLine | Destination | delayed | onTime | onTimeProbability | DelayedProbability |
|---|---|---|---|---|---|
| ALASKA | Los.Angeles | 62 | 497 | 0.889 | 0.111 |
| ALASKA | Phoenix | 12 | 221 | 0.948 | 0.052 |
| ALASKA | San.Diego | 20 | 212 | 0.914 | 0.086 |
| ALASKA | San.Francisco | 102 | 503 | 0.831 | 0.169 |
| ALASKA | Seattle | 305 | 1841 | 0.858 | 0.142 |
| AM WEST | Los.Angeles | 117 | 694 | 0.856 | 0.144 |
| AM WEST | Phoenix | 415 | 4840 | 0.921 | 0.079 |
| AM WEST | San.Diego | 65 | 383 | 0.855 | 0.145 |
| AM WEST | San.Francisco | 129 | 320 | 0.713 | 0.287 |
| AM WEST | Seattle | 61 | 201 | 0.767 | 0.233 |
3.2 Summarize Data - Below we have summarized data on the basis of Probability.
summary(tD)## AirLine Destination delayed onTime
## ALASKA :5 Length:10 Min. : 12.00 Min. : 201.0
## AM WEST:5 Class :character 1st Qu.: 61.25 1st Qu.: 245.8
## Mode :character Median : 83.50 Median : 440.0
## Mean :128.80 Mean : 971.2
## 3rd Qu.:126.00 3rd Qu.: 646.2
## Max. :415.00 Max. :4840.0
## onTimeProbability DelayedProbability
## Min. :0.7130 Min. :0.05200
## 1st Qu.:0.8370 1st Qu.:0.09225
## Median :0.8570 Median :0.14300
## Mean :0.8552 Mean :0.14480
## 3rd Qu.:0.9077 3rd Qu.:0.16300
## Max. :0.9480 Max. :0.28700
Charts - The charts below show that Phoenix has best on time probability for both AirLines and San Francisco has least on time probability for both Airlines.
sc <- ggplot(tD, aes(x = Destination , y = onTimeProbability)) + geom_point(aes(color = onTimeProbability, size = onTimeProbability, shape = factor(AirLine))) + scale_colour_gradient(low = "#F0F8FF")
ggplotly(sc)ggplot(tD, aes(x = Destination, y = onTimeProbability, color = DelayedProbability)) +
geom_line() + scale_colour_gradient(low = "#db9833") +
expand_limits(x = 0)ggplot(tD, aes(x = Destination, y = onTimeProbability)) +
geom_col(aes(colour = DelayedProbability)) +
scale_colour_gradientn(colours = terrain.colors(10))Please email to: kleber.perez@live.com for any suggestion.