NYC Flights Homework

Load the libraries and view the “flights” dataset

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
# Interactive viewer call, left commented out.
#view(flights)
# Print per-column descriptive statistics (n, mean, sd, median, ...) for
# flights. The date-time column time_hour comes back as NaN/Inf in the table
# below, which matches the min/max warnings printed first.
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA
# Preview the first six rows of flights.
head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Now create one data visualization with this dataset

Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).

Requirements for the plot:

  1. Include at least one dplyr command (filter, arrange, summarize, group_by, select, mutate, …)
  2. Include labels for the x- and y-axes
  3. Include a title
  4. Your plot must incorporate at least 2 colors
  5. Include a legend that indicates what the colors represent
  6. Write a brief paragraph that describes the visualization you have created and at least one aspect of the plot that you would like to highlight.

Start early so that if you do have trouble, you can email me with questions

# Show the structure of flights: dimensions plus the type and first values
# of every column.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Per-column summary: quartiles, mean and NA counts for numeric columns,
# length/class/mode for character columns.
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

Using the filter function, target flights to LAX in the first half of 2013

# Flights whose destination is LAX during January-June 2013.
# Comma-separated conditions inside filter() are combined with AND.
monthly_flights <- filter(
  flights,
  year == 2013,
  month <= 6,
  dest == "LAX"
)
monthly_flights
## # A tibble: 7,632 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      558            600        -2      924            917
##  2  2013     1     1      628            630        -2     1016            947
##  3  2013     1     1      658            700        -2     1027           1025
##  4  2013     1     1      702            700         2     1058           1014
##  5  2013     1     1      743            730        13     1107           1100
##  6  2013     1     1      828            823         5     1150           1143
##  7  2013     1     1      829            830        -1     1152           1200
##  8  2013     1     1      856            900        -4     1226           1220
##  9  2013     1     1      859            900        -1     1223           1225
## 10  2013     1     1      921            900        21     1237           1227
## # ... with 7,622 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Preview the first six rows of the LAX first-half-of-2013 subset.
head(monthly_flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      558            600        -2      924            917
## 2  2013     1     1      628            630        -2     1016            947
## 3  2013     1     1      658            700        -2     1027           1025
## 4  2013     1     1      702            700         2     1058           1014
## 5  2013     1     1      743            730        13     1107           1100
## 6  2013     1     1      828            823         5     1150           1143
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights to LAX that have a recorded departure time.
#
# The previous condition `dest == "LAX" & dep_time & month` used the integer
# columns dep_time and month as logical values. Neither column contains 0 in
# this dataset, so the only effect was to drop rows whose dep_time is NA.
# That intent is now written explicitly; the selected rows are the same.
#
# NOTE(review): despite its name, this table is not restricted to flights
# with dep_delay > 0 -- confirm whether only delayed departures were intended.
delayed_flights <- flights %>%
  filter(dest == "LAX", !is.na(dep_time))
delayed_flights
## # A tibble: 16,076 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      558            600        -2      924            917
##  2  2013     1     1      628            630        -2     1016            947
##  3  2013     1     1      658            700        -2     1027           1025
##  4  2013     1     1      702            700         2     1058           1014
##  5  2013     1     1      743            730        13     1107           1100
##  6  2013     1     1      828            823         5     1150           1143
##  7  2013     1     1      829            830        -1     1152           1200
##  8  2013     1     1      856            900        -4     1226           1220
##  9  2013     1     1      859            900        -1     1223           1225
## 10  2013     1     1      921            900        21     1237           1227
## # ... with 16,066 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Recode the month variable from integers to month names

# Recode month from its integer code (1-12) to a month-name label using one
# vectorised lookup instead of twelve masked assignments (which turned the
# column into character after the first assignment and then relied on
# implicit number-to-string coercion for the remaining comparisons).
#
# NOTE(review): "Febuary" and "Decemeber" are misspelled. The spellings are
# kept unchanged because the filter further down matches on "Febuary";
# correct both places together.
month_labels <- c(
  "January", "Febuary", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "Decemeber"
)
# Only recode while month still holds integer codes, so re-running this
# chunk leaves already-recoded labels untouched.
if (is.numeric(delayed_flights$month)) {
  delayed_flights$month <- month_labels[delayed_flights$month]
}
head(delayed_flights)
## # A tibble: 6 x 19
##    year month     day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <chr>   <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013 January     1      558            600        -2      924            917
## 2  2013 January     1      628            630        -2     1016            947
## 3  2013 January     1      658            700        -2     1027           1025
## 4  2013 January     1      702            700         2     1058           1014
## 5  2013 January     1      743            730        13     1107           1100
## 6  2013 January     1      828            823         5     1150           1143
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Target LAX delays with each month shown

# Keep the January-May rows, group them by year and month, and sort the
# whole table by departure time (arrange() ignores the grouping by default).
monthly_flight_delays <- delayed_flights %>%
  filter(month %in% c("January", "Febuary", "March", "April", "May")) %>%
  group_by(year, month) %>%
  arrange(dep_time)
head(monthly_flight_delays)
## # A tibble: 6 x 19
## # Groups:   year, month [2]
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <chr> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013 April    22        2           2032       210      319              3
## 2  2013 May      23        7           2001       246      256           2320
## 3  2013 May       1        9           1655       434      308           2020
## 4  2013 May       8       12           2025       227      241           2333
## 5  2013 May      16       13           2145       148      255            105
## 6  2013 April    18       14           2145       149      253            105
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Bar chart of the number of delayed departures to LAX per month.
#
# The earlier version mapped the raw dep_delay minutes of every flight to y
# with stat = "identity", which overplotted thousands of bars per month and
# did not match the "Number of Flight delays" axis label. Here the delayed
# flights (dep_delay > 0) are counted per month first, so bar height is the
# count the label and title describe.
monthly_flights_plot <- monthly_flight_delays %>%
  ungroup() %>%
  filter(dep_delay > 0) %>%
  count(month, name = "n_delayed") %>%
  # reorder() sorts the bars from the fewest to the most delayed flights.
  ggplot(aes(x = reorder(month, n_delayed), y = n_delayed, fill = month)) +
  geom_col() +
  ggtitle("LAX Monthly flight delays in 2013") +
  xlab("Month") +
  ylab("Number of Flight delays") +
  labs(fill = "Flight delay months")
monthly_flights_plot

Summary

The plot above is a bar plot (geom_bar) that visualizes the number of flight delays from least to greatest, with each month shown. The tallest bar is the month with the largest number of delayed flights. One thing I liked about this bar plot is the dark blue to blue colorway. One thing I would like more practice on is targeting specific parts of the data.