NYC Flights Homework

Load the libraries and view the “flights” dataset

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Open the flights table in the data viewer for a quick look at the raw rows.
view(flights)

Now create one data visualization with this dataset

Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).

# Show every column's name and type before deciding what to plot.
flights %>% str()
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Keep only the variables needed for the plot: arrival delay, carrier and month.
delays_by_day <- flights %>%
  select(arr_delay, carrier, month)

# Save a CSV copy of the full flights table. The destination folder is passed
# to write_csv() directly instead of calling setwd(), so the session's working
# directory is left untouched while the file still lands in the same place.
output_dir <- "/Users/maryglantz/Desktop/DATA 110"
write_csv(flights, file.path(output_dir, "flights.csv"))

Clean up the data so it works with the alluvial_ts() function

library(alluvial)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Tally the flights per carrier, which also shows how many carriers there are.
delays_by_day %>%
  count(carrier)
## # A tibble: 16 x 2
##    carrier     n
##    <chr>   <int>
##  1 9E      18460
##  2 AA      32729
##  3 AS        714
##  4 B6      54635
##  5 DL      48110
##  6 EV      54173
##  7 F9        685
##  8 FL       3260
##  9 HA        342
## 10 MQ      26397
## 11 OO         32
## 12 UA      58665
## 13 US      20536
## 14 VX       5162
## 15 WN      12275
## 16 YV        601
# Confirm the three selected columns and their types.
delays_by_day %>% str()
## tibble [336,776 × 3] (S3: tbl_df/tbl/data.frame)
##  $ arr_delay: num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier  : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ month    : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
# Re-order the columns so the time variable (month) is the second column:
# carrier first, month second, arrival delay last.
delays_by_day2 <- delays_by_day %>%
  select(carrier, month, arr_delay)
delays_by_day2 %>% str()
## tibble [336,776 × 3] (S3: tbl_df/tbl/data.frame)
##  $ carrier  : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ month    : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ arr_delay: num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
# Count the missing values across all columns.
delays_by_day2 %>%
  is.na() %>%
  sum()
## [1] 9430
# Drop every row that has a missing value in any column.
delays_by_day2_clean <- delays_by_day2 %>% na.omit()

# List the carrier codes that remain, in order of first appearance.
delays_by_day2_clean %>%
  pull(carrier) %>%
  unique()
##  [1] "UA" "AA" "B6" "DL" "EV" "MQ" "US" "WN" "VX" "FL" "AS" "9E" "F9" "HA" "YV"
## [16] "OO"
# Attach to every flight row the total arrival delay of its carrier in its
# month, stored in `total_by_month`.
#
# One grouped mutate() over (carrier, month) replaces sixteen copy-pasted
# filter/group_by/mutate blocks that hard-coded each carrier code, so nothing
# here needs editing if the set of carriers changes.
carrier_order <- unique(delays_by_day2_clean$carrier)

monthly_carrier_delays <- delays_by_day2_clean %>%
  group_by(carrier, month) %>%
  mutate(total_by_month = sum(arr_delay)) %>%
  ungroup() %>%
  # Stack the rows carrier by carrier, in order of first appearance in the
  # data; within a carrier the original row order is kept.
  arrange(match(carrier, carrier_order)) %>%
  # Leave the result grouped by month, which is what the later steps print.
  group_by(month)

monthly_carrier_delays
## # A tibble: 327,346 x 4
## # Groups:   month [12]
##    carrier month arr_delay total_by_month
##    <chr>   <int>     <dbl>          <dbl>
##  1 UA          1        11          14576
##  2 UA          1        20          14576
##  3 UA          1        12          14576
##  4 UA          1         7          14576
##  5 UA          1       -14          14576
##  6 UA          1        -8          14576
##  7 UA          1       -17          14576
##  8 UA          1        14          14576
##  9 UA          1         1          14576
## 10 UA          1        29          14576
## # … with 327,336 more rows

Reduce the data to one row per carrier and month, keeping just the carrier, the month, and the total for that month, and put it in a new data frame. (Repeated attempts to run alluvial_ts() on the full data above left the computer working for a very long time.)

# Remove the per-flight delay column, leaving carrier, month and the total.
monthly_carrier_delays <- monthly_carrier_delays %>%
  select(-arr_delay)
monthly_carrier_delays %>% str()
## tibble [327,346 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ carrier       : chr [1:327346] "UA" "UA" "UA" "UA" ...
##  $ month         : int [1:327346] 1 1 1 1 1 1 1 1 1 1 ...
##  $ total_by_month: num [1:327346] 14576 14576 14576 14576 14576 ...
##  - attr(*, "groups")= tibble [12 × 2] (S3: tbl_df/tbl/data.frame)
##   ..$ month: int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ .rows: list<int> [1:12] 
##   .. ..$ : int [1:26398] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ : int [1:23611] 19272 19273 19274 19275 19276 19277 19278 19279 19280 19281 ...
##   .. ..$ : int [1:27902] 23429 23430 23431 23432 23433 23434 23435 23436 23437 23438 ...
##   .. ..$ : int [1:27564] 28338 28339 28340 28341 28342 28343 28344 28345 28346 28347 ...
##   .. ..$ : int [1:28128] 33316 33317 33318 33319 33320 33321 33322 33323 33324 33325 ...
##   .. ..$ : int [1:27075] 38206 38207 38208 38209 38210 38211 38212 38213 38214 38215 ...
##   .. ..$ : int [1:28293] 43091 43092 43093 43094 43095 43096 43097 43098 43099 43100 ...
##   .. ..$ : int [1:28756] 48062 48063 48064 48065 48066 48067 48068 48069 48070 48071 ...
##   .. ..$ : int [1:27010] 53147 53148 53149 53150 53151 53152 53153 53154 53155 53156 ...
##   .. ..$ : int [1:28618] 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 ...
##   .. ..$ : int [1:26971] 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 ...
##   .. ..$ : int [1:27020] 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 ...
##   .. ..@ ptype: int(0) 
##   ..- attr(*, ".drop")= logi TRUE
# Collapse the duplicated rows, keeping the first occurrence of each.
simplified <- monthly_carrier_delays %>% distinct()
view(simplified)

# Print the collapsed table grouped by month.
group_by(simplified, month)
## # A tibble: 185 x 3
## # Groups:   month [12]
##    carrier month total_by_month
##    <chr>   <int>          <dbl>
##  1 UA          1          14576
##  2 UA         10          -7683
##  3 UA         11          -6287
##  4 UA         12          67488
##  5 UA          2            809
##  6 UA          3           7624
##  7 UA          4          43346
##  8 UA          5         -11913
##  9 UA          6          62503
## 10 UA          7          53097
## # … with 175 more rows

Graph the carriers by month and delay

set.seed(39)  # fixed seed so the shuffled palette is the same on every run

# Build the 16-colour palette: sixteen shuffled hues combined with shuffled
# saturation and brightness levels (the ten levels of each are recycled).
hues <- sample(1:16 / 16)
saturations <- sample(3:12) / 15
brightness <- sample(3:12) / 15
cols <- hsv(h = hues, s = saturations, v = brightness)

# Draw the alluvial time-series plot of monthly delay totals per carrier.
alluvial_ts(
  simplified,
  wave = .3,
  ygap = 5,
  col = cols,
  plotdir = "centred",
  alpha = .9,
  grid = TRUE,
  grid.lwd = 5,
  leg.mode = TRUE,
  xmargin = 0.2,
  lab.cex = .7,
  xlab = "Month",
  ylab = "Air Carrier",
  border = NA,
  axis.cex = .8,
  leg.cex = .7,
  leg.col = "white",
  title = "Flight Delays by Carrier and Month in NYC 2013\n"
)

Requirements for the plot:

  1. Include at least one dplyr command (filter, sort, summarize, group_by, select, mutate, ….)
  2. Include labels for the x- and y-axes
  3. Include a title
  4. Your plot must incorporate at least 2 colors
  5. Include a legend that indicates what the colors represent

Start early so that if you do have trouble, you can email me with questions