library(tidyr)
library(dplyr,warn.conflicts = FALSE)
options(dplyr.summarise.inform = FALSE)
library(stringr)
library(ggplot2)
create csv in excel and add to github
import csv from github
mydata <- read.csv("https://raw.githubusercontent.com/arinolan/assignment-4/main/airline%20data.csv")
mydata[2,1] <- mydata[1,1]
mydata[5,1] <- mydata[4,1]
mydata[,2] <- sapply(mydata[,2], str_replace, " ", ".")
mydata
## X X.1 los.angeles phoenix san.diego san.francisco seattle
## 1 alaska on.time 497 221 212 503 1841
## 2 alaska delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 am west on.time 694 4840 383 320 201
## 5 am west delayed 117 415 65 129 61
clean_mydata <- mydata %>%
na.omit() %>%
rename(airline = X, status = X.1) %>%
gather("dest.city", "n", 3:7) %>%
spread(status, "n") %>%
mutate(total.arrivals = delayed + on.time, on.time.percent.rate = on.time/total.arrivals*100) %>%
arrange(desc(total.arrivals))
clean_mydata[,2] <- sapply(clean_mydata[,2], str_replace, "\\.", " ")
clean_mydata
## airline dest.city delayed on.time total.arrivals on.time.percent.rate
## 1 am west phoenix 415 4840 5255 92.10276
## 2 alaska seattle 305 1841 2146 85.78751
## 3 am west los angeles 117 694 811 85.57337
## 4 alaska san francisco 102 503 605 83.14050
## 5 alaska los angeles 62 497 559 88.90877
## 6 am west san francisco 129 320 449 71.26949
## 7 am west san diego 65 383 448 85.49107
## 8 am west seattle 61 201 262 76.71756
## 9 alaska phoenix 12 221 233 94.84979
## 10 alaska san diego 20 212 232 91.37931
best and worst arrival by destination
best_dest <- clean_mydata %>%
filter(on.time.percent.rate == max(on.time.percent.rate))
best_dest
## airline dest.city delayed on.time total.arrivals on.time.percent.rate
## 1 alaska phoenix 12 221 233 94.84979
worst_dest <- clean_mydata %>%
filter(on.time.percent.rate == min(on.time.percent.rate))
worst_dest
## airline dest.city delayed on.time total.arrivals on.time.percent.rate
## 1 am west san francisco 129 320 449 71.26949
most and least timely arrival
timely <- clean_mydata %>%
group_by(airline) %>%
summarise(on.time.airline.perecent.rate = sum(on.time) / sum(total.arrivals)*100) %>%
filter(on.time.airline.perecent.rate == max(on.time.airline.perecent.rate))
timely
## # A tibble: 1 × 2
## airline on.time.airline.perecent.rate
## <chr> <dbl>
## 1 am west 89.1
least_timely <- clean_mydata %>%
group_by(airline) %>%
summarise(on.time.airline.perecent.rate = sum(on.time) / sum(total.arrivals)*100) %>%
filter(on.time.airline.perecent.rate == min(on.time.airline.perecent.rate))
least_timely
## # A tibble: 1 × 2
## airline on.time.airline.perecent.rate
## <chr> <dbl>
## 1 alaska 86.7
highest to lowest rank
ranked <- clean_mydata %>%
group_by(dest.city) %>%
summarise(on.time.city.percent.rate = sum(on.time) / sum(total.arrivals)*100) %>%
mutate(ranking.on.time = min_rank(desc(on.time.city.percent.rate))) %>%
arrange(ranking.on.time)
ranked
## # A tibble: 5 × 3
## dest.city on.time.city.percent.rate ranking.on.time
## <chr> <dbl> <int>
## 1 phoenix 92.2 1
## 2 san diego 87.5 2
## 3 los angeles 86.9 3
## 4 seattle 84.8 4
## 5 san francisco 78.1 5