library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
#install.packages("psych")
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Open the flights data in the data viewer for a quick visual inspection.
view(flights)
# Print psych's per-column summary statistics (n, mean, sd, median, ...).
# NOTE(review): time_hour produces no usable statistics in the output below
# (NaN/Inf plus min/max warnings) -- presumably because it is a date-time
# column; confirm before relying on that row.
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
Your assignment is to create one plot to visualize one aspect of this dataset. The plot may be any type we have covered so far in this class (bar graphs, scatterplots, boxplots, histograms, treemaps, heatmaps, streamgraphs, or alluvials).
Start early so that if you do have trouble, you can email me with questions.
library(treemap)
library(tidyverse)
library(RColorBrewer)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Keep only the flights that have both a distance and a departure delay
# recorded (comma-separated filter conditions are combined with AND).
flights_nona <- filter(flights, !is.na(distance), !is.na(dep_delay))
# Average departure delay for each origin airport, computed over flights
# that have a recorded departure delay. The summary column is left unnamed
# here, so dplyr labels it `mean(dep_delay)`.
new <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(origin) %>%
  summarise(mean(dep_delay))
new
## # A tibble: 3 × 2
## origin `mean(dep_delay)`
## <chr> <dbl>
## 1 EWR 15.1
## 2 JFK 12.1
## 3 LGA 10.3
# Give the second (summary) column a readable name, then print the table.
new <- rename(new, Avg_Dep_Delay = 2)
new
## # A tibble: 3 × 2
## origin Avg_Dep_Delay
## <chr> <dbl>
## 1 EWR 15.1
## 2 JFK 12.1
## 3 LGA 10.3
# Bar chart: one bar per origin airport, bar height = average departure delay.
ggplot(new, aes(x = origin, y = Avg_Dep_Delay, fill = origin)) +
  geom_col(position = "dodge") +
  labs(title = "Average Departure Delay By Airport Origin")
# Largest departure delay observed in each month, computed over flights
# that have a recorded departure delay. The summary column is named directly
# in summarise(), so no positional rename is needed afterwards.
month <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(month) %>%
  summarise(Max_Dep_Delay = max(dep_delay))
month
## # A tibble: 12 × 2
## month Max_Dep_Delay
## <int> <dbl>
## 1 1 1301
## 2 2 853
## 3 3 911
## 4 4 960
## 5 5 878
## 6 6 1137
## 7 7 1005
## 8 8 520
## 9 9 1014
## 10 10 702
## 11 11 798
## 12 12 896
# Bar chart: one bar per month, bar height and fill colour = maximum
# departure delay. The x-axis breaks are set to the month values present in
# the summary table so every month gets its own tick.
ggplot(month, aes(x = month, y = Max_Dep_Delay, fill = Max_Dep_Delay)) +
  geom_col(position = "dodge") +
  scale_x_continuous(breaks = unique(month$month)) +
  labs(title = "Max Time of Departure Delay by Month")
After completing Unit 5, it is apparent that we are starting to get into the really complex parts of data visualization. I really wanted to try using a treemap or an alluvial plot for this assignment, but I had a hard time getting the chunks to run without errors. Specifically, I wanted to look at each origin and see the average departure delay per month, in order to examine the relationship between month and departure delay and between origin and departure delay. Instead, I created one graph of average departure delay by origin and one graph of maximum departure delay by month. In the first graph, EWR has the highest average departure delay and LGA has the lowest. In the second graph, January has the highest maximum departure delay and August has the lowest. With this information, a person who hates delays can keep LGA in mind when purchasing their next departure ticket and aim to travel in August.