607 W5 Assignment
Question
Loading Package
| library(tidyverse) |
| library(lubridate) |
| library(stringr) |
| library(data.table) |
| library(DT) |
Solution
- Read File CSV
- Remove All NA
- Fill the Airport Name for Empty row (cell)
- Convert Wide dataset to Long Data set
- Perform analysis to compare the arrival delays for the two airlines.
Read File
# Read in the coal dataset
ssa <- suppressWarnings(read_csv("https://github.com/Rajwantmishra/msds/raw/master/week5/w5dataset.csv"))
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_number(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_number()
## )
ssa
ssa<- ssa[!is.na(ssa$X2),]
# method 2
#ssa<- filter(ssa,!is.na(X2))
data.table(ssa)
Data Wrangling
Using Mutate, spread, lag , rename
# DATA
ssa
#-------------------------------------------------- Method 1
# fill data from 1 row above the currnt cell.
# Using Shift to
#ssa <-mutate(t,X1 = ifelse( is.na(X1),shift(X1, 1L, type="lag"),X1))
#-------------------------------------------------- Method 2
# #Using Mutate Dplyr funtion
# ssa<- mutate(t,X1 = ifelse( is.na(X1),lag(X1),X1))
#-------------------------------------------------- Method 3 (Gather + Spread + Mutate)
ssaDF <- mutate(ssa,X1= ifelse( is.na(X1),lag(X1),X1)) %>%
gather(Status,value ,-c(X1,X2)) %>%
spread(key = X2,value = "value") %>%
dplyr::rename( AirPort = X1,FlyingFrom = Status)
## Warning: package 'bindrcpp' was built under R version 3.5.2
datatable(ssaDF)
Analysis
Perform analysis to compare the arrival delays for the two airlines.
# Perform analysis to compare the arrival delays for the two
ssaStat<- group_by(ssaDF,AirPort) %>%
summarise(MEAN = mean(delayed), Median = median(delayed), sd = sd(delayed))
ggplot(ssaDF, aes(x=delayed , fill=AirPort)) +
geom_histogram(aes(y=..density..),color = "white", position="dodge")+
geom_density(alpha=.2, fill="#FF6666")+
ggtitle("Delay by Airport") +
ylab("Density") + xlab("Delay") +
theme(legend.position="top")+
facet_wrap(~AirPort)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram showing with density plot how datapoints are located for each airports delay.
ggplot(ssaDF, aes(y = delayed, x = FlyingFrom)) +
geom_bar(stat = "identity",position="dodge")+
facet_wrap(~ AirPort)
# Data points shows delay by Origin of flights to respective airport.
# Side by Side By Airport
statMean <- ggplot(ssaDF, aes(y = delayed, x = FlyingFrom,fill= AirPort)) +
geom_bar(stat = "identity",position="dodge")+
geom_hline(ssaStat,linetype = 2,
color=c("red","blue"),
yintercept = ssaStat$MEAN)+
annotate("text",label = sprintf("Mean %s, %.2f",ssaStat$AirPort,ssaStat$MEAN), x= 2, y= ssaStat$MEAN+10,color=c("red","blue"))+
ggtitle("Delay Side by Side By Airport/Flyingfrom Mean")
# Side by Side By Airport
statMedian <- ggplot(ssaDF, aes(y = delayed, x = FlyingFrom,fill= AirPort)) +
geom_bar(stat = "identity",position="dodge")+
geom_hline(ssaStat,linetype = 1,
color=c("red","blue"),
yintercept = ssaStat$Median)+
annotate("text",label = sprintf("Median %s, %.2f",ssaStat$AirPort,ssaStat$Median), x= 2, y= ssaStat$Median+10,color=c("red","blue")) +
ggtitle("Delay Side by Side By Airport/Flyingfrom Median")
statMedian
statMean
Further Data Question
So far we have not taken the airports traffic in considartion.
ggplot(ssaDF, aes(y = delayed, x = AirPort,fill= AirPort)) +
geom_boxplot()+
ggtitle("Delay By Airport") +
ylab("Density") + xlab("Delay")
If we consider overall traffic at the Airport, we note that the Airport AM WEST is very crowded and overall fligts is almost 2 time as that of ALASKA. Shown below:
# Group data by Airport and get summary by Delay and Total Flights and Also find % of Delay
group_by(ssaDF,AirPort) %>% summarise(sumDelay = sum(delayed),
sumAll = sum(delayed+`on time`)) %>%
mutate(DelayPercent = round((sumDelay/sumAll)*100,2))
It’s Surprising to see “AM WEST” have better % in overall trafic data here. Lets create scatterplots to detect the datapoints for Airport Delay by Orgianting Destinaiton (FLyingfrom)
### Its Surprising to see "AM WEST" have better % in overall trafic data here.
# Lets create scatterplots to detect the datapoints for Airport Delay
ggplot(ssaDF, aes(y = delayed, x = FlyingFrom,color= AirPort)) +
geom_point()+
geom_segment(xend= 2,yend=410, x= 1, y=350, color="red", arrow = arrow(length = unit(0.5,"cm")))+
geom_segment(xend= 4.9,yend=309, x= 4, y=250, color="blue", arrow = arrow(length = unit(0.5,"cm")))+
ggtitle("Delay By Airport/Flyingfrom")
We can see that Seattle and Phoenix data have very big differences, lets drop them for further check.
# After removing Seattle and Phoenix data as they seems like outlier
filter(ssaDF,!(FlyingFrom == "Phoenix" | FlyingFrom == "Seattle" )) %>%
group_by(AirPort) %>% summarise(sumDelay = sum(delayed),
sumAll = sum(delayed+`on time`)) %>%
mutate(DelayPercent = round((sumDelay/sumAll)*100,2))
Again after removing the two major high and Low data points from both the Airports, we are back with high percentage for AM WEST in dealy.
Conlcusion
We can say that Dealy frequecy is high with AM WEST, even though the overall trafic is also more than the ALASKA (Note the traffic after dropping the 2 data points).
With given information it can be said the AM WEST and ALASKA have a bright SPOT to consider from each others operation. ALASKA is doing better in terms of controlling the over all delay.