library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Open the raw table in the data viewer for a first look.
view(flights)
# Descriptive statistics for every column except `time_hour`. That column is a
# date-time, and in the recorded run describe() could only report NaN/Inf for
# it while emitting "no non-missing arguments to min/max" warnings.
# Rows marked with `*` in the output are non-numeric columns that describe()
# summarises after converting them to numeric codes.
describe(dplyr::select(flights, -time_hour))
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).
Start early so that, if you run into trouble, you can email me with questions.
library(tidyverse)
# Per-column summary (quartiles, mean, NA counts) of the flights table.
flights %>% summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
# List the distinct origin airport codes, in order of first appearance.
flights %>% pull(origin) %>% unique()
## [1] "EWR" "LGA" "JFK"
view(flights)
# Replace the numeric month (1-12) with its English name using a single
# vectorised lookup into base R's `month.name` constant, instead of twelve
# separate assignments that depend on the column being silently coerced from
# integer to character after the first one.
# The is.numeric() guard makes the chunk safe to re-run: once the column holds
# names, indexing `month.name` with it again would yield NA.
# The column stays character, so it sorts alphabetically by default; use
# factor(month, levels = month.name) wherever calendar order matters.
if (is.numeric(flights$month)) {
  flights$month <- month.name[flights$month]
}
# One tibble per New York departure airport.
LGA <- flights %>% filter(origin == "LGA")
EWR <- flights %>% filter(origin == "EWR")
JFK <- flights %>% filter(origin == "JFK")
# Print the American Airlines (carrier code "AA") flights that left LGA.
filter(LGA, carrier == "AA")
## # A tibble: 15,459 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 January 1 558 600 -2 753 745
## 2 2013 January 1 559 600 -1 941 910
## 3 2013 January 1 623 610 13 920 915
## 4 2013 January 1 629 630 -1 824 810
## 5 2013 January 1 635 635 0 1028 940
## 6 2013 January 1 656 700 -4 854 850
## 7 2013 January 1 659 700 -1 1008 1015
## 8 2013 January 1 724 730 -6 1111 1040
## 9 2013 January 1 739 745 -6 918 930
## 10 2013 January 1 753 755 -2 1056 1110
## # ... with 15,449 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Box plot of air time (minutes) by month for all flights departing LGA.
# The piped tibble is the plot's only data source; the original also passed
# `data = LGA`, which made ggplot() ignore whatever arrived through the pipe.
LGA %>%
  # `month` holds full month names. Fixing the factor levels to calendar order
  # stops ggplot from sorting them alphabetically (April, August, ...), which
  # previously put the Jan..Dec tick labels on the wrong boxes.
  ggplot(aes(x = factor(month, levels = month.name), y = air_time)) +
  geom_boxplot(color = "blue", fill = "pink") +
  labs(x = "Month", y = "Air Time (Minutes)", title = "Air Time LGA") +
  # Named labels are matched to the levels by name rather than by position.
  scale_x_discrete(labels = setNames(month.abb, month.name))
## Warning: Removed 3522 rows containing non-finite values (stat_boxplot).
# Box plot of air time (minutes) by month for all flights departing EWR.
EWR <- flights[flights$origin == "EWR", ]
# NOTE(review): the original piped `filter(carrier == "AA")` into ggplot() but
# also passed `data = EWR`, so the filter was discarded and every carrier was
# plotted. That behaviour is kept here; if an AA-only plot was intended, pipe
# `EWR %>% filter(carrier == "AA")` into ggplot() instead.
EWR %>%
  # y was mapped to `arr_time` (a clock time) although the axis label and
  # title describe air time; it now uses `air_time`.
  # Calendar-ordered factor levels keep the boxes in Jan..Dec order instead of
  # the alphabetical order a plain factor() of month names would give.
  ggplot(aes(x = factor(month, levels = month.name), y = air_time)) +
  geom_boxplot(color = "black", fill = "gray") +
  labs(x = "Month", y = "Air Time (Minutes)", title = "Air Times for EWR") +
  # Named labels are matched to the levels by name rather than by position.
  scale_x_discrete(labels = setNames(month.abb, month.name))
## Warning: Removed 3390 rows containing non-finite values (stat_boxplot).
# Black lines: whiskers (range), median line, and box edges (Q1 and Q3).
# Gray fill: the interquartile range (the box between Q1 and Q3).
# Flights that departed from JFK.
JFK <- flights %>% filter(origin == "JFK")
# Print the American Airlines (carrier code "AA") flights among them.
filter(JFK, carrier == "AA")
## # A tibble: 13,783 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 January 1 542 540 2 923 850
## 2 2013 January 1 628 630 -2 1137 1140
## 3 2013 January 1 656 659 -3 949 959
## 4 2013 January 1 712 715 -3 1023 1035
## 5 2013 January 1 743 730 13 1107 1100
## 6 2013 January 1 745 745 0 1135 1125
## 7 2013 January 1 803 810 -7 903 925
## 8 2013 January 1 826 715 71 1136 1045
## 9 2013 January 1 840 845 -5 1311 1350
## 10 2013 January 1 856 900 -4 1226 1220
## # ... with 13,773 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Box plot of air time (minutes) by month for all flights departing JFK.
# NOTE(review): the original piped `filter(carrier == "AA")` into ggplot() but
# also passed `data = JFK`, so the filter was discarded and every carrier was
# plotted. That behaviour is kept here; if an AA-only plot was intended, pipe
# `JFK %>% filter(carrier == "AA")` into ggplot() instead.
JFK %>%
  # Calendar-ordered factor levels keep the boxes in Jan..Dec order instead of
  # the alphabetical order a plain factor() of month names would give, which
  # previously put the tick labels on the wrong boxes.
  ggplot(aes(x = factor(month, levels = month.name), y = air_time)) +
  geom_boxplot(color = "blue", fill = "red") +
  labs(x = "Month", y = "Air Time (Minutes)", title = "Air Times for JFK") +
  # Named labels are matched to the levels by name rather than by position.
  scale_x_discrete(labels = setNames(month.abb, month.name))
## Warning: Removed 2200 rows containing non-finite values (stat_boxplot).