#first let's read in the data
flights <- read.csv("https://raw.githubusercontent.com/evanmclaughlin/ECM607/master/flights.csv", na.strings = c("", "NA"),header = TRUE, sep = ",")
flights <- tbl_df(flights)
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#now let's take a look
flights
## # A tibble: 5 x 7
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#Remove empty row and tidy data a bit using a loop
flights <- flights[-c(3),]
flights
## # A tibble: 4 x 7
## Airline Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## <chr> <chr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 <NA> delayed 117 415 65 129 61
#create new dataframe
flights_df <- gather(flights, City, Number, 3:7)
## fill in blanks in new dataframe with for loop
for(i in 2:nrow(flights_df)){
if(is.na(flights_df$Airline[i])){
flights_df$Airline[i] <- flights_df$Airline[i-1]
}
}
# we'll go ahead and split status into two columns to ease calculations
flight_spread <- spread(flights_df, Status, Number, 3:7)
## Warning in if (!is.na(fill)) {: the condition has length > 1 and only the first
## element will be used
flight_spread = rename (flight_spread, "on_time" = "on time")
flight_spread
## # A tibble: 10 x 4
## Airline City delayed on_time
## <chr> <chr> <int> <int>
## 1 ALASKA Los.Angeles 62 497
## 2 ALASKA Phoenix 12 221
## 3 ALASKA San.Diego 20 212
## 4 ALASKA San.Francisco 102 503
## 5 ALASKA Seattle 305 1841
## 6 AM WEST Los.Angeles 117 694
## 7 AM WEST Phoenix 415 4840
## 8 AM WEST San.Diego 65 383
## 9 AM WEST San.Francisco 129 320
## 10 AM WEST Seattle 61 201
#Alaska first
ak <- filter(flight_spread, Airline == "ALASKA")
ak_ot <- sum(ak$on_time) / (sum(ak$delayed) + sum(ak$on_time))
round(ak_ot, digits = 2)
## [1] 0.87
#AM West next
aw <- filter(flight_spread, Airline == "AM WEST")
aw_ot <- sum(aw$on_time) / (sum(aw$delayed) + sum(aw$on_time))
round(aw_ot, digits = 2)
## [1] 0.89
#At first blush, AM West has a slightly better percentage of on time flights in the aggregate, but it’s worth looking drilling down into any problematic destinations
n <- 1
for (n in c(1:10)){
flight_spread$perc_ot[n] <- round(flight_spread$on_time[n] / (flight_spread$on_time[n] + flight_spread$delayed[n]), digits = 2)
n <- n+1
}
## Warning: Unknown or uninitialised column: `perc_ot`.
# let's iterate through the df and calculate on time arrivals for each destination city
flight_spread
## # A tibble: 10 x 5
## Airline City delayed on_time perc_ot
## <chr> <chr> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 62 497 0.89
## 2 ALASKA Phoenix 12 221 0.95
## 3 ALASKA San.Diego 20 212 0.91
## 4 ALASKA San.Francisco 102 503 0.83
## 5 ALASKA Seattle 305 1841 0.86
## 6 AM WEST Los.Angeles 117 694 0.86
## 7 AM WEST Phoenix 415 4840 0.92
## 8 AM WEST San.Diego 65 383 0.85
## 9 AM WEST San.Francisco 129 320 0.71
## 10 AM WEST Seattle 61 201 0.77
ggbarplot(flight_spread , x= "Airline", y= "perc_ot", color = "City" , position = position_dodge())