#load libraries
library(tidyr)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
data <- read.csv("https://raw.githubusercontent.com/marjete/flights.607/main/flights.607.csv")
data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
data[2,1] <- data[1,1] #add alaska and amwest to the row where its blank
data[5,1] <- data[4,1]
data[,2] <- sapply(data[,2], str_replace, " ", ".")
data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on.time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AMWEST on.time 694 4840 383 320 201
## 5 AMWEST delayed 117 415 65 129 61
data <- data %>% drop_na #removes row with na
data
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on.time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on.time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
data <- data %>%
rename("company" = 1, "status" = 2) #column rename
data
## company status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on.time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AMWEST on.time 694 4840 383 320 201
## 4 AMWEST delayed 117 415 65 129 61
data <- data %>%
gather(destination, flight.count, 3:7) #change wide structure to long
data
## company status destination flight.count
## 1 ALASKA on.time Los.Angeles 497
## 2 ALASKA delayed Los.Angeles 62
## 3 AMWEST on.time Los.Angeles 694
## 4 AMWEST delayed Los.Angeles 117
## 5 ALASKA on.time Phoenix 221
## 6 ALASKA delayed Phoenix 12
## 7 AMWEST on.time Phoenix 4840
## 8 AMWEST delayed Phoenix 415
## 9 ALASKA on.time San.Diego 212
## 10 ALASKA delayed San.Diego 20
## 11 AMWEST on.time San.Diego 383
## 12 AMWEST delayed San.Diego 65
## 13 ALASKA on.time San.Francisco 503
## 14 ALASKA delayed San.Francisco 102
## 15 AMWEST on.time San.Francisco 320
## 16 AMWEST delayed San.Francisco 129
## 17 ALASKA on.time Seattle 1841
## 18 ALASKA delayed Seattle 305
## 19 AMWEST on.time Seattle 201
## 20 AMWEST delayed Seattle 61
```r
ddply(data, "company", transform, perct = flight.count / sum(flight.count)) %>%
filter(status == "delayed") %>%
group_by(company) %>%
dplyr::summarise(total_perct = sum(perct))
## # A tibble: 2 × 2
## company total_perct
## <chr> <dbl>
## 1 ALASKA 0.133
## 2 AMWEST 0.109
Compare the arrival delays for the two airlines; in a general comparision: 13% of Alaska airlines are typically delayed whereas, about 11% of Amwest flight are delayed. This doesnt seem like much of a difference so next I will look more closely per desitnation. Graphing the data might be helpful but finding the exact % of on delayed flights per destination and company would be best.
flightstatus <- spread(data, status, flight.count)
flightstatus <- flightstatus %>%
mutate(total = rowSums(across(where(is.numeric))))
flightstatus
## company destination delayed on.time total
## 1 ALASKA Los.Angeles 62 497 559
## 2 ALASKA Phoenix 12 221 233
## 3 ALASKA San.Diego 20 212 232
## 4 ALASKA San.Francisco 102 503 605
## 5 ALASKA Seattle 305 1841 2146
## 6 AMWEST Los.Angeles 117 694 811
## 7 AMWEST Phoenix 415 4840 5255
## 8 AMWEST San.Diego 65 383 448
## 9 AMWEST San.Francisco 129 320 449
## 10 AMWEST Seattle 61 201 262
summary <- flightstatus %>%
group_by (company, destination) %>%
summarise(total_delayed = sum(delayed), total_ontime = sum(on.time))%>%
mutate(total = rowSums(across(where(is.numeric))),
percent_delayed = round(total_delayed / total*100, 2),
percent_ontime = round (total_ontime / total*100, 2))
## `summarise()` has grouped output by 'company'. You can override using the
## `.groups` argument.
summary
## # A tibble: 10 × 7
## # Groups: company [2]
## company destination total_delayed total_ontime total percent_dela…¹ perce…²
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 ALASKA Los.Angeles 62 497 559 11.1 88.9
## 2 ALASKA Phoenix 12 221 233 5.15 94.8
## 3 ALASKA San.Diego 20 212 232 8.62 91.4
## 4 ALASKA San.Francisco 102 503 605 16.9 83.1
## 5 ALASKA Seattle 305 1841 2146 14.2 85.8
## 6 AMWEST Los.Angeles 117 694 811 14.4 85.6
## 7 AMWEST Phoenix 415 4840 5255 7.9 92.1
## 8 AMWEST San.Diego 65 383 448 14.5 85.5
## 9 AMWEST San.Francisco 129 320 449 28.7 71.3
## 10 AMWEST Seattle 61 201 262 23.3 76.7
## # … with abbreviated variable names ¹percent_delayed, ²percent_ontime
Conclusion: When comparing airline companies arrival delays per destination, the data above reflects that 28.7% of flights to San Francisco with AMWest were delayed. The next highest amount of delays are also with Amwest to Seattle. Flights to Phoenix have the highest rates of being on time, with only 5% delay with Alaska airlines and ~8% delay with AMWest.