library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
URL <- "https://raw.githubusercontent.com/okhaimova/DATA-607/master/Week5/W5data.csv"
delays <- read.csv(URL)
# could alternatively do na.omit(read.csv(URL)) to remove NA values
#Untidy Data
delays
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
First, I deleted the row with just NA values. Then I replaced the blank cells with NA values so that they can be filled with the appropriate name. Then, I grouped the city columns into a destination column and made a count column. Afterwards, I renamed the columns and separated the on time and delayed into two different columns.
# deleting the 3rd row with NA values
delays <- delays[-c(3), ]
# replacing blank cells with NA
delays[delays == ""] <- NA
# fill the NA cells in the X column and then change to a longer table
# renaming variables and separating the status variable
delays <- delays %>%
fill(X) %>%
pivot_longer(Los.Angeles:Seattle, names_to = "destination", values_to = "count") %>%
rename("airline" = "X", "status" = "X.1") %>%
pivot_wider(names_from = status, values_from = count)
# removing the period from the destination names
delays$destination <- str_replace(delays$destination, "\\.","\\ ")
I made an extra column that show the proportion of flights that were on time for each city and airline. I then found how many flights were on time and delayed for each airline.
delays <- delays %>%
mutate(ontimeprop = round(`on time` / (delayed + `on time`) * 100, 2))
delays
## # A tibble: 10 x 5
## airline destination `on time` delayed ontimeprop
## <chr> <chr> <int> <int> <dbl>
## 1 ALASKA Los Angeles 497 62 88.9
## 2 ALASKA Phoenix 221 12 94.8
## 3 ALASKA San Diego 212 20 91.4
## 4 ALASKA San Francisco 503 102 83.1
## 5 ALASKA Seattle 1841 305 85.8
## 6 AM WEST Los Angeles 694 117 85.6
## 7 AM WEST Phoenix 4840 415 92.1
## 8 AM WEST San Diego 383 65 85.5
## 9 AM WEST San Francisco 320 129 71.3
## 10 AM WEST Seattle 201 61 76.7
summarystatairline <- delays %>%
group_by(airline) %>%
summarise(totalontime = sum(`on time`),
totaldelayed = sum (delayed),
proportion = round (totalontime / (totaldelayed + totalontime) * 100, 2))
## `summarise()` ungrouping output (override with `.groups` argument)
summarystatairline
## # A tibble: 2 x 4
## airline totalontime totaldelayed proportion
## <chr> <int> <int> <dbl>
## 1 ALASKA 3274 501 86.7
## 2 AM WEST 6438 787 89.1
summarystatdest <- delays %>%
group_by(destination) %>%
summarise(totalontime = sum(`on time`),
totaldelayed = sum (delayed),
proportion = round (totalontime / (totaldelayed + totalontime) * 100, 2))
## `summarise()` ungrouping output (override with `.groups` argument)
summarystatdest
## # A tibble: 5 x 4
## destination totalontime totaldelayed proportion
## <chr> <int> <int> <dbl>
## 1 Los Angeles 1191 179 86.9
## 2 Phoenix 5061 427 92.2
## 3 San Diego 595 85 87.5
## 4 San Francisco 823 231 78.1
## 5 Seattle 2042 366 84.8
ggplot(summarystatairline, aes(x = airline, y = proportion, fill = airline)) +
geom_bar(stat = "identity") +
ggtitle("Overall On time Comparison")
ggplot(delays, aes(x = airline, y = `on time`, fill = airline)) +
geom_bar(stat = "identity") +
ggtitle("On time Comparison between Cities") +
ylab("On Time Count") +
facet_wrap(~destination)
ggplot(delays, aes(x = airline, y = ontimeprop, fill = airline)) +
geom_bar(stat = "identity") +
ggtitle("On time Comparison between Cities Percentage") +
ylab("Percentage On Time") +
facet_wrap(~destination)
AM West seems to have the higher proportion of on time flights. However, when comparing the values side by side for each city, Alaskan Airlines seems to make more on-time flights proportionally, but AM West makes more flights. Alaskan Airline flights to Phoenix have the highest proportion of on time flights and flights to San Francisco via AM West have the lowest proportion of on time flights. Overall, Phoenix had the highest on-time flights ratio and San Francisco had the lowest.