Load the libraries and view the “flights” dataset
library(tidyverse)
library(nycflights13)
library(psych)
# Per-column summary of the flights table (quartiles and means for numeric
# columns, length/class for character columns, NA counts where present).
flights %>% summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
Prepare dataframe for plotting
library(tidyverse)
# Rank every carrier by total distance flown, largest first.
# The first ten rows of this ranking are the "top 10" airlines used below.
flights %>%
  group_by(carrier) %>%
  summarize(total = sum(distance)) %>%
  arrange(-total)
## # A tibble: 16 x 2
## carrier total
## <chr> <dbl>
## 1 UA 89705524
## 2 DL 59507317
## 3 B6 58384137
## 4 AA 43864584
## 5 EV 30498951
## 6 MQ 15033955
## 7 VX 12902327
## 8 WN 12229203
## 9 US 11365778
## 10 9E 9788152
## 11 FL 2167344
## 12 AS 1715028
## 13 HA 1704186
## 14 F9 1109700
## 15 YV 225395
## 16 OO 16026
# Build the input for alluvial_ts(): one row per (carrier, month) holding the
# total distance flown, restricted to the ten carriers with the largest totals
# in the ranking printed above.
top_carriers <- c("UA", "DL", "B6", "AA", "EV", "MQ", "VX", "WN", "US", "9E")

my_df <- flights %>%
  select(carrier, month, distance) %>% # alluvial needs category, time-variable, value
  # %in% tests set membership for each row. `==` would recycle the 10-element
  # vector along the column and silently discard most matching flights.
  filter(carrier %in% top_carriers) %>%
  group_by(month, carrier) %>%
  summarize(total_dist = sum(distance)) %>%
  ungroup() %>% # drop the leftover month grouping so my_df is a plain tibble
  select(carrier, month, total_dist)
## Warning in carrier == c("UA", "DL", "B6", "AA", "EV", "MQ", "VX", "WN", : longer
## object length is not a multiple of shorter object length
# Print my_df with incomplete rows removed. The result is not assigned, so
# my_df itself is left unchanged.
drop_na(my_df)
## # A tibble: 120 x 3
## # Groups: month [12]
## carrier month total_dist
## <chr> <int> <dbl>
## 1 9E 1 78685
## 2 AA 1 342364
## 3 B6 1 502188
## 4 DL 1 462127
## 5 EV 1 241320
## 6 MQ 1 126603
## 7 UA 1 676284
## 8 US 1 76927
## 9 VX 1 75272
## 10 WN 1 104466
## # … with 110 more rows
Which airlines flew the most miles in 2013?
#library(plotly)
library(alluvial)
## Warning: package 'alluvial' was built under R version 4.0.2
# Alluvial time-series chart of monthly distance per carrier.
# my_df supplies the columns in the order alluvial_ts() expects:
# category (carrier), time variable (month), value (total_dist).
alluvial_ts(
  my_df,
  wave = .3,
  ygap = 5,
  grid = TRUE,
  xlab = "Month",
  ylab = "Distance Traveled",
  border = NA,
  axis.cex = .8,
  # FALSE spelled out: `F` is an ordinary variable that can be reassigned.
  # NOTE(review): with leg.mode off the legend presumably uses leg.max rather
  # than the data maximum -- confirm against the alluvial_ts() documentation.
  leg.mode = FALSE,
  leg.max = 250000,
  leg.y = .96,
  leg.cex = .7,
  title = "Airline Miles Traveled\nTop 10 in 2013"
)
