library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
#view(flights)
# Descriptive statistics for every column except time_hour. That column is a
# date-time (POSIXct); describe() yields no usable statistics for it and only
# adds "no non-missing arguments to min/max" warnings plus a row of NaN/Inf.
describe(dplyr::select(flights, -time_hour))
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
# Preview the first six rows of the flights table.
flights %>%
  head(n = 6)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).
Start early so that if you do have trouble, you can email me with questions.
# Show the type and the leading values of every column in flights.
flights %>%
  str()
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Per-column summaries of flights (quartiles and means for numeric columns,
# length/class for character columns, NA counts where values are missing).
flights %>%
  summary()
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
Use the filter function to select flights to LAX (dest == "LAX") in the first half of 2013.
# Flights whose destination is LAX, restricted to months 1-6 of 2013.
# Comma-separated conditions inside filter() are combined with AND.
monthly_flights <- filter(
  flights,
  dest == "LAX",
  month <= 6,
  year == 2013
)
# Print the filtered table.
monthly_flights
## # A tibble: 7,632 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 558 600 -2 924 917
## 2 2013 1 1 628 630 -2 1016 947
## 3 2013 1 1 658 700 -2 1027 1025
## 4 2013 1 1 702 700 2 1058 1014
## 5 2013 1 1 743 730 13 1107 1100
## 6 2013 1 1 828 823 5 1150 1143
## 7 2013 1 1 829 830 -1 1152 1200
## 8 2013 1 1 856 900 -4 1226 1220
## 9 2013 1 1 859 900 -1 1223 1225
## 10 2013 1 1 921 900 21 1237 1227
## # ... with 7,622 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Preview the first six LAX-bound flights from the first half of the year.
monthly_flights %>%
  head(n = 6)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 558 600 -2 924 917
## 2 2013 1 1 628 630 -2 1016 947
## 3 2013 1 1 658 700 -2 1027 1025
## 4 2013 1 1 702 700 2 1058 1014
## 5 2013 1 1 743 730 13 1107 1100
## 6 2013 1 1 828 823 5 1150 1143
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights to LAX that have a recorded departure time.
# Rows with a missing dep_time are dropped explicitly with !is.na() instead of
# using the integer columns dep_time and month as logical conditions; both are
# always non-zero in this dataset (minimum 1), so the NA check is the only
# part of that condition that ever removed rows.
# NOTE(review): despite the name, this keeps early and on-time departures
# (dep_delay <= 0) as well as delayed ones.
delayed_flights <- flights %>%
  filter(dest == "LAX", !is.na(dep_time))
# Print the filtered table.
delayed_flights
## # A tibble: 16,076 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 558 600 -2 924 917
## 2 2013 1 1 628 630 -2 1016 947
## 3 2013 1 1 658 700 -2 1027 1025
## 4 2013 1 1 702 700 2 1058 1014
## 5 2013 1 1 743 730 13 1107 1100
## 6 2013 1 1 828 823 5 1150 1143
## 7 2013 1 1 829 830 -1 1152 1200
## 8 2013 1 1 856 900 -4 1226 1220
## 9 2013 1 1 859 900 -1 1223 1225
## 10 2013 1 1 921 900 21 1237 1227
## # ... with 16,066 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Replace the month number (1-12) with its English name. month.name is R's
# built-in vector of month names, so indexing it by the month number recodes
# every row in one vectorized step and guarantees correct spellings.
# The column stays a character vector.
delayed_flights <- delayed_flights %>%
  mutate(month = month.name[month])
head(delayed_flights)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 January 1 558 600 -2 924 917
## 2 2013 January 1 628 630 -2 1016 947
## 3 2013 January 1 658 700 -2 1027 1025
## 4 2013 January 1 702 700 2 1058 1014
## 5 2013 January 1 743 730 13 1107 1100
## 6 2013 January 1 828 823 5 1150 1143
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep January through May only (June onwards is excluded), group by year and
# month, and sort by departure time. arrange() ignores the grouping unless
# .by_group = TRUE, so rows are ordered by dep_time across all months.
monthly_flight_delays <- delayed_flights %>%
  filter(month %in% month.name[1:5]) %>%
  group_by(year, month) %>%
  arrange(dep_time)
head(monthly_flight_delays)
## # A tibble: 6 x 19
## # Groups: year, month [2]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 April 22 2 2032 210 319 3
## 2 2013 May 23 7 2001 246 256 2320
## 3 2013 May 1 9 1655 434 308 2020
## 4 2013 May 8 12 2025 227 241 2333
## 5 2013 May 16 13 2145 148 255 105
## 6 2013 April 18 14 2145 149 253 105
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Bar chart of the number of delayed departures per month for flights to LAX.
# A flight counts as delayed when dep_delay > 0 (it left after its scheduled
# departure time). The flights are counted per month before plotting: mapping
# the raw dep_delay of every flight to y with stat = "identity" draws one bar
# per flight on top of each other, so the visible bar height is the largest
# single delay rather than the number of delays named on the y axis.
monthly_flights_plot <- monthly_flight_delays %>%
  ungroup() %>%                          # drop grouping so count() is by month only
  filter(dep_delay > 0) %>%
  count(month, name = "n_delayed") %>%
  ggplot() +
  geom_col(
    # reorder() sorts the bars from the fewest to the most delayed flights
    mapping = aes(x = reorder(month, n_delayed), y = n_delayed, fill = month)
  ) +
  ggtitle("LAX Monthly flight delays in 2013") +
  xlab("month") +
  ylab("Number of Flight delays") +
  labs(fill = "Flight delay months")
# Render the plot.
monthly_flights_plot
The plot above is a bar plot (geom_bar) which visualizes the number of flight delays from least to greatest, with each month shown. The largest bar is the month with the greatest number of delayed flights. One thing I liked about this bar plot is the dark blue to blue colorway. One thing I would like more practice on is targeting specific parts of the data.