NYC Flights Homework

Load the libraries and view the “flights” dataset

library(treemap)
library(RColorBrewer)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
view(flights)
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA

Now create one data visualization with this dataset

Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bargraphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials)

Requirements for the plot:

  1. Include at least one dplyr command (filter, sort, summarize, group_by, select, mutate, ….)
  2. Include labels for the x- and y-axes
  3. Include a title
  4. Your plot must incorporate at least 2 colors
  5. Include a legend that indicates what the colors represent
  6. Write a brief paragraph that describes the visualization you have created and at least one aspect of the plot that you would like to highlight.

Start early so that if you do have trouble, you can email me with questions

# display the structure of the pre - build dataset
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
#clean the dataset
names(flights) <- tolower(names(flights))
names(flights) <- gsub(" ","",names(flights))
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Does not work
boxpl <- flights %>%
  ggplot() + 
  geom_bar(aes(x=arr_delay,fill=carrier),position = "dodge")+scale_y_continuous(limits=c(0,500))+scale_x_continuous(limits=c(0,200))
boxpl
## Warning: Removed 201163 rows containing non-finite values (stat_count).
## Warning: position_dodge requires non-overlapping x intervals
## Warning: Removed 55 rows containing missing values (geom_bar).

# do some adjustment to restruct some essential data which meet the requirement
flights2 <- flights %>% 
  select( year, carrier,dep_delay,arr_delay) %>%
  group_by(year,carrier)
head(flights2)
## # A tibble: 6 x 4
## # Groups:   year, carrier [4]
##    year carrier dep_delay arr_delay
##   <int> <chr>       <dbl>     <dbl>
## 1  2013 UA              2        11
## 2  2013 UA              4        20
## 3  2013 AA              2        33
## 4  2013 B6             -1       -18
## 5  2013 DL             -6       -25
## 6  2013 UA             -4        12
a = flights2 %>%
  filter(dep_delay>0 & arr_delay<=0)%>%
  arrange((arr_delay))
a
## # A tibble: 35,442 x 4
## # Groups:   year, carrier [16]
##     year carrier dep_delay arr_delay
##    <int> <chr>       <dbl>     <dbl>
##  1  2013 DL              1       -65
##  2  2013 DL              9       -63
##  3  2013 UA              3       -63
##  4  2013 HA              4       -62
##  5  2013 UA              6       -62
##  6  2013 UA              4       -62
##  7  2013 AA              5       -62
##  8  2013 DL             17       -62
##  9  2013 VX              1       -60
## 10  2013 UA              2       -60
## # … with 35,432 more rows
b = a %>%
  group_by(carrier)%>%
  summarise(total = n())

b
## # A tibble: 16 x 2
##    carrier total
##    <chr>   <int>
##  1 9E       1925
##  2 AA       3437
##  3 AS        100
##  4 B6       4936
##  5 DL       5060
##  6 EV       3793
##  7 F9         84
##  8 FL        261
##  9 HA         32
## 10 MQ       1022
## 11 OO          1
## 12 UA      10519
## 13 US        997
## 14 VX        999
## 15 WN       2242
## 16 YV         34
ggplot(data = b) + 
  geom_bar(mapping = aes(x = carrier, y = total,fill = carrier), stat = "identity",position = "dodge")+
  ylab('frequency')+
  xlab('Name of the company')+
  ggtitle("The aircraft arrived on time but depart late")