knitr::include_graphics('https://raw.githubusercontent.com/henryvalentine/MSDS2019/master/Classes/DATA%20607/Home%20works/Week5/departures.png')The chart above describes arrival delays for two airlines across five destinations. Your task is to:
All tidying and transformation activities will be performed using appropriate functions from the tidyr and dplyr packages
First, the above information was manually captured as comma separated values and stored in a .csv file Let’s read the .csv file to see how the data looks
untidyData <- read.csv("airlineDepartures.csv", header = TRUE, stringsAsFactors = FALSE) # Read the .csv file
untidyData## X X.1 Los.Ageles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
In its form right now, the data is completely untidy and filled with unavailable data.
Let’s rename the first two columns to airline and flightStatus respectively
names(untidyData)[1:2] <- c('airline', 'flightStatus')
untidyData## airline flightStatus Los.Ageles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
Next, Fill in the row spaces on the delay rows on the flightStatus column with the appropriate airline
untidyData[2, 1] <- 'ALASKA'
untidyData[5, 1] <- 'AM WEST'
untidyData## airline flightStatus Los.Ageles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
Let’s remove the third row since it is completely empty
df <- untidyData[-3, ]
df## airline flightStatus Los.Ageles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
Now the data looks a bit in order in a wide form which can be presented to managers and other users, but it is not completely suitable for analysis and visualisation
On observation, columns 3 to 7 need to be values under a single column say destination using the gather function from tidyr
df <- gather(df, c(Los.Ageles:Seattle), key = 'destination', value = 'tally')
df## airline flightStatus destination tally
## 1 ALASKA on time Los.Ageles 497
## 2 ALASKA delayed Los.Ageles 62
## 3 AM WEST on time Los.Ageles 694
## 4 AM WEST delayed Los.Ageles 117
## 5 ALASKA on time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AM WEST on time Phoenix 4840
## 8 AM WEST delayed Phoenix 415
## 9 ALASKA on time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AM WEST on time San.Diego 383
## 12 AM WEST delayed San.Diego 65
## 13 ALASKA on time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AM WEST on time San.Francisco 320
## 16 AM WEST delayed San.Francisco 129
## 17 ALASKA on time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AM WEST on time Seattle 201
## 20 AM WEST delayed Seattle 61
df2 <- df # This will be used in this format for analysis laterWe are gradually getting to the desired format. Next, the values for the flightStatus column can be spread out into two columns namely on time and delayed
df <- spread(df, 'flightStatus', 'tally')
df## airline destination delayed on time
## 1 ALASKA Los.Ageles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Ageles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
datatable(df, class = 'cell-border stripe', options = list(
initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color': '#616161', 'color': '#fff', 'text-align': 'center !important'});",
"$(this.api().table().body()).css({'color': '#000', 'text-align': 'center !important'});",
"}")
))The data is cleaned up now and ready for downstream analysis and visualisations. But before then, it will be btter to remove the empty space in the on time variable for easy referencing
names(df) <- make.names(names(df))
df## airline destination delayed on.time
## 1 ALASKA Los.Ageles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Ageles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
Let’s compare the total number of flights(whether delayed or on time) for both airlines to all the cities using df2
grpdf <- df2 %>%
group_by(airline)%>%summarize(TotalFlights = sum(tally))
grpdf## # A tibble: 2 x 2
## airline TotalFlights
## <chr> <int>
## 1 ALASKA 3775
## 2 AM WEST 7225
To compare the total number of delayed and on time flights for both airlines, we will use df as the shape is suitable for it
d <- df %>% group_by(airline)%>%summarise_at(c("delayed", "on.time"), sum)
d## # A tibble: 2 x 3
## airline delayed on.time
## <chr> <int> <int>
## 1 ALASKA 501 3274
## 2 AM WEST 787 6438
on time despite having fewer flights than AM West.Let’s see what their delay rates look ike
delayRateAlaska <- d[1, 'delayed'] / (d[1, 'delayed'] + d[1, 'on.time'])
paste('The dalay rate for ALASKA is: ', round(delayRateAlaska$delayed * 100, 2), '%', sep = '')## [1] "The dalay rate for ALASKA is: 13.27%"
delayRateAmWest <- d[2, 'delayed'] / (d[2, 'delayed'] + d[2, 'on.time'])
paste('The dalay rate for AM WEST is: ', round(delayRateAmWest$delayed * 100, 2), '%', sep = '')## [1] "The dalay rate for AM WEST is: 10.89%"
Let’s see how this can compare against each other in visualisation
ggplot(filter(df2, flightStatus=="delayed"),aes(destination,tally,color=airline))+geom_point(size=5)+ggtitle("All Delayed Flights")+ylab("Delays")Let’s see how these stack up against each other on a Bar plot when we sum the total flights per airline per city
r<-df2 %>% spread(flightStatus,tally)%>%mutate(numFlights = rowSums(.[,3:4]), freq= .[,3]/numFlights)
r## airline destination delayed on time numFlights freq
## 1 ALASKA Los.Ageles 62 497 559 0.11091234
## 2 ALASKA Phoenix 12 221 233 0.05150215
## 3 ALASKA San.Diego 20 212 232 0.08620690
## 4 ALASKA San.Francisco 102 503 605 0.16859504
## 5 ALASKA Seattle 305 1841 2146 0.14212488
## 6 AM WEST Los.Ageles 117 694 811 0.14426634
## 7 AM WEST Phoenix 415 4840 5255 0.07897241
## 8 AM WEST San.Diego 65 383 448 0.14508929
## 9 AM WEST San.Francisco 129 320 449 0.28730512
## 10 AM WEST Seattle 61 201 262 0.23282443
ggplot(r,aes(x=destination,y=freq))+geom_bar(aes(fill=airline),stat="identity",position="dodge")+ylab("Rate of delays")+ggtitle("All Delayed Flights per city")