This assignment is to read the arrival delays for two airlines across five destination from a csv file. The data needs to be transformed using tidyr methods spread() and gather() in order to analyse the delay rate between two airlines.
library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(ggplot2)
flightsData <- data.frame(read_csv("https://raw.githubusercontent.com/charlsjoseph/CUNY-Data607/master/week5/FlightData.csv"))
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_integer(),
## Phoenix = col_integer(),
## `San Diego` = col_integer(),
## `San Francisco` = col_integer(),
## Seattle = col_integer()
## )
head(flightsData)
## X1 X2 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
Added flightname and the status into column. flightname is missing on the alternate row.
flightsData[1] <- rep(flightsData[which(!is.na(flightsData[1])), 1], each = 2)
colnames(flightsData)[1:2] <- c("FlightName" , "status")
head(flightsData)
## FlightName status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Gather() puts column values into row data. desitination, count are the new columns
flightsData1 <- flightsData %>% gather(desitination, count, 3:7)
head(flightsData1)
## FlightName status desitination count
## 1 ALASKA on time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
Spread() puts row value into column. values on status is the new columns
flightsData2 <- flightsData1 %>% spread(status, count)
colnames(flightsData2) <- str_replace(colnames(flightsData2), " ", "_")
head(flightsData2)
## FlightName desitination delayed on_time
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
delay Rate is calculated by the delayed / (delayed + on_time
flightsData3 <- flightsData2 %>% mutate(delay_rate = delayed / (delayed + on_time))
head(flightsData3)
## FlightName desitination delayed on_time delay_rate
## 1 ALASKA Los.Angeles 62 497 0.11091234
## 2 ALASKA Phoenix 12 221 0.05150215
## 3 ALASKA San.Diego 20 212 0.08620690
## 4 ALASKA San.Francisco 102 503 0.16859504
## 5 ALASKA Seattle 305 1841 0.14212488
## 6 AM WEST Los.Angeles 117 694 0.14426634
Plot the chart with X axis as flights and y as Delay Rate. From the chart, it is evident that the “Am WEST” is having higher Delay Rate for all destination locations.
ggplot(flightsData3, aes(desitination, delay_rate)) +
geom_bar(aes(fill = FlightName), position = "dodge", stat = "identity") +
ylab("Delay Rate")