arrival_stats <- readr::read_csv("https://raw.githubusercontent.com/metis-macys-66898/data_607_sp2020/master/arrival_stats.csv")
## Warning: Missing column names filled in: 'X1' [1], 'X2' [2]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## X2 = col_character(),
## `Los Angeles` = col_double(),
## Phoenix = col_number(),
## `San Diego` = col_double(),
## `San Francisco` = col_double(),
## Seattle = col_number()
## )
arrival_stats
## # A tibble: 5 x 7
## X1 X2 `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALASKA on time 497 221 212 503 1841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
#class(arrival_stats)
kable(arrival_stats, format = 'markdown')
| X1 | X2 | Los Angeles | Phoenix | San Diego | San Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| NA | delayed | 62 | 12 | 20 | 102 | 305 |
| NA | NA | NA | NA | NA | NA | NA |
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| NA | delayed | 117 | 415 | 65 | 129 | 61 |
I renamed the variables X1, X2 to Carrier and FlightStatus respectively. Removed an empty record at row 3. Populated Carrier for empty rows using the fill method from tidyr. Transforming the data.frame to a longer form by using pivot_longer. End resultant dataframe is called arrival_stats_transformed.
arrival_stats <- arrival_stats %>% rename(Carrier = X1, FlightStatus = X2) %>% filter(!is.na(FlightStatus))
#.direction = "down" (default)
arrival_stats <- tidyr::fill(arrival_stats, Carrier)
arrival_stats_transformed <- arrival_stats %>% pivot_longer(cols = c(-Carrier, -FlightStatus), names_to = "destination", values_to = "arrivals")
arrival_stats_transformed
## # A tibble: 20 x 4
## Carrier FlightStatus destination arrivals
## <chr> <chr> <chr> <dbl>
## 1 ALASKA on time Los Angeles 497
## 2 ALASKA on time Phoenix 221
## 3 ALASKA on time San Diego 212
## 4 ALASKA on time San Francisco 503
## 5 ALASKA on time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Phoenix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on time Los Angeles 694
## 12 AM WEST on time Phoenix 4840
## 13 AM WEST on time San Diego 383
## 14 AM WEST on time San Francisco 320
## 15 AM WEST on time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Phoenix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
summary_df <- arrival_stats_transformed %>% group_by (Carrier, FlightStatus) %>% summarize_at ('arrivals', sum)
summary_df
## # A tibble: 4 x 3
## # Groups: Carrier [2]
## Carrier FlightStatus arrivals
## <chr> <chr> <dbl>
## 1 ALASKA delayed 501
## 2 ALASKA on time 3274
## 3 AM WEST delayed 787
## 4 AM WEST on time 6438
# since filter is a 2 argument function, we use .x and .y (https://community.rstudio.com/t/function-argument-naming-conventions-x-vs-x/7764)
# setNames() - sets the names on an object and returns the object
airline_names <- unique(arrival_stats_transformed$Carrier)
airline_totals <- purrr::map_dbl(airline_names, ~ sum(filter(arrival_stats_transformed, Carrier == .x)[['arrivals']])) %>% setNames(airline_names)
airline_totals
## ALASKA AM WEST
## 3775 7225
# the following will create a column called proportion where it is defined by arrivals divided by the total of the carrier from airline_totals
summary_df <- summary_df %>% mutate(proportion = arrivals / airline_totals[Carrier])
summary_df
## # A tibble: 4 x 4
## # Groups: Carrier [2]
## Carrier FlightStatus arrivals proportion
## <chr> <chr> <dbl> <dbl>
## 1 ALASKA delayed 501 0.133
## 2 ALASKA on time 3274 0.867
## 3 AM WEST delayed 787 0.109
## 4 AM WEST on time 6438 0.891
plt1 <- ggplot(data=summary_df, aes(x=Carrier, y=proportion, fill=Carrier)) + geom_bar(stat='identity', position="dodge") + ggtitle("Visualizing Proprotion of Delays between the 2 Carriers") + ylab("% of Delayed Flights")
plt1
summary_df1 <- arrival_stats_transformed %>% group_by (destination, FlightStatus) %>% summarize_at ('arrivals', sum)
summary_df1
## # A tibble: 10 x 3
## # Groups: destination [5]
## destination FlightStatus arrivals
## <chr> <chr> <dbl>
## 1 Los Angeles delayed 179
## 2 Los Angeles on time 1191
## 3 Phoenix delayed 427
## 4 Phoenix on time 5061
## 5 San Diego delayed 85
## 6 San Diego on time 595
## 7 San Francisco delayed 231
## 8 San Francisco on time 823
## 9 Seattle delayed 366
## 10 Seattle on time 2042
airport_names <- unique(arrival_stats_transformed$destination)
airport_totals <- purrr::map_dbl(airport_names, ~ sum(filter(arrival_stats_transformed, destination == .x)[['arrivals']])) %>% setNames(airport_names)
airport_totals
## Los Angeles Phoenix San Diego San Francisco Seattle
## 1370 5488 680 1054 2408
# the following will create a column called proportion where it is defined by arrivals divided by the total of each destination from airport_totals
summary_df1 <- summary_df1 %>% mutate(proportion = arrivals / airport_totals[destination])
summary_df1
## # A tibble: 10 x 4
## # Groups: destination [5]
## destination FlightStatus arrivals proportion
## <chr> <chr> <dbl> <dbl>
## 1 Los Angeles delayed 179 0.131
## 2 Los Angeles on time 1191 0.869
## 3 Phoenix delayed 427 0.0778
## 4 Phoenix on time 5061 0.922
## 5 San Diego delayed 85 0.125
## 6 San Diego on time 595 0.875
## 7 San Francisco delayed 231 0.219
## 8 San Francisco on time 823 0.781
## 9 Seattle delayed 366 0.152
## 10 Seattle on time 2042 0.848
plt2 <- ggplot(data=summary_df1, aes(x=destination, y=proportion, fill=FlightStatus)) + geom_bar(stat='identity', position="dodge") + ggtitle("Visualizing Proprotion of Delays across Airports") + ylab("% of Delayed Flights")
plt2