library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Open the flights table in the data viewer for a first look.
tibble::view(flights)

# Per-column summary statistics (n, mean, sd, median, ...) from psych.
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).
Start early so that if you do have trouble, you can email me with questions.
# Recode the numeric month column (1-12) to full English month names.
# A single vectorised lookup into the built-in `month.name` constant replaces
# twelve masked assignments that relied on the column silently turning from
# integer into character after the first one. The result is still a character
# column. The guard makes re-running this chunk a no-op instead of producing NAs.
if (is.numeric(flights$month)) {
  flights$month <- month.name[flights$month]
}
# Keep only the rows whose origin is "LGA" (LaGuardia).
LGA <- flights[flights[["origin"]] == "LGA", ]

# Preview the American Airlines (carrier "AA") rows; the result is printed,
# not stored.
filter(LGA, carrier == "AA")
## # A tibble: 15,459 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 January 1 558 600 -2 753 745
## 2 2013 January 1 559 600 -1 941 910
## 3 2013 January 1 623 610 13 920 915
## 4 2013 January 1 629 630 -1 824 810
## 5 2013 January 1 635 635 0 1028 940
## 6 2013 January 1 656 700 -4 854 850
## 7 2013 January 1 659 700 -1 1008 1015
## 8 2013 January 1 724 730 -6 1111 1040
## 9 2013 January 1 739 745 -6 918 930
## 10 2013 January 1 753 755 -2 1056 1110
## # … with 15,449 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Boxplot of air time by month for American Airlines departures from LGA.
#
# The filtered tibble is piped straight into ggplot() as its data. Naming
# `data = LGA` again inside ggplot() would push the piped rows into `...`,
# silently discarding the carrier filter and plotting every LGA carrier.
LGA %>%
  filter(carrier == "AA") %>%
  # Pin the factor levels to calendar order; a bare factor() on month names
  # sorts them alphabetically (April, August, December, ...).
  ggplot(aes(x = factor(month, levels = month.name), y = air_time)) +
  geom_boxplot(color = "red", fill = "blue") +
  labs(
    x = "Month", y = "Airtime (Minutes)",
    title = "American Airlines Airtime from LaGuardia Airport in 2013 by Month"
  ) +
  # Named labels are matched to levels by name, so each box gets the right
  # abbreviation even if a month has no rows and is dropped from the axis.
  scale_x_discrete(labels = setNames(month.abb, month.name))
## Warning: Removed 3522 rows containing non-finite values (stat_boxplot).
In reviewing the nycflights13 data, I decided that instead of looking at the broader focus of the dataset, I would look at a specific airline. I created a boxplot and filtered the dataset to look at American Airlines airtime from LaGuardia Airport in 2013 by month. My goal was to see whether airtime changed depending on the time of year for the company, especially when you consider holidays. In reviewing the boxplot, it is important to note that in the month of December there were two large outliers. March, April, and May had average airtimes slightly higher than the rest of the months. However, when comparing all months, it appeared that airtime stayed consistent throughout the year, averaging around 120 minutes.