# Check the working directory
getwd()
## [1] "C:/Users/libcl/OneDrive/Documents/DATA110"
# Add weather data to join (the required packages are loaded first)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Open both nycflights13 tables in the data viewer for a visual check.
tibble::view(flights)
tibble::view(weather)
# Clean the data before joining: replace missing values with 0 in both
# flights$dep_delay and weather$wind_gust.
# The former as.integer() cast on dep_delay was dropped: assigning the double 0
# into the column coerced it straight back to numeric, so the cast had no
# effect on the result.
# NOTE(review): a missing dep_delay presumably marks a flight that never
# departed, and a missing wind_gust presumably means no gust was recorded;
# counting both as 0 feeds into the plots and summaries below -- confirm this
# is intended.
flights$dep_delay[is.na(flights$dep_delay)] <- 0
weather$wind_gust[is.na(weather$wind_gust)] <- 0
# Attach the matching weather observation (same origin airport and hour) to
# every flight.
# The join keys are spelled out so the result cannot change silently if the two
# tables ever gain another column with a shared name; this also suppresses the
# "Joining, by = ..." message that the implicit natural join printed.
# inner_join() keeps only the flights that have a matching weather row.
df2 <- flights %>%
  inner_join(
    weather,
    by = c("year", "month", "day", "origin", "hour", "time_hour")
  )

# Inspect the structure of the joined table.
str(df2)
## tibble [335,220 x 28] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:335220] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:335220] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:335220] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:335220] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:335220] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:335220] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:335220] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:335220] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:335220] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:335220] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:335220] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:335220] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:335220] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:335220] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:335220] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:335220] 1400 1416 1089 1576 762 ...
## $ hour : num [1:335220] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:335220] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:335220], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
## $ temp : num [1:335220] 39 39.9 39 39 39.9 ...
## $ dewp : num [1:335220] 28 25 27 27 25 ...
## $ humid : num [1:335220] 64.4 54.8 61.6 61.6 54.8 ...
## $ wind_dir : num [1:335220] 260 250 260 260 260 260 240 260 260 260 ...
## $ wind_speed : num [1:335220] 12.7 15 15 15 16.1 ...
## $ wind_gust : num [1:335220] 0 21.9 0 0 23 ...
## $ precip : num [1:335220] 0 0 0 0 0 0 0 0 0 0 ...
## $ pressure : num [1:335220] 1012 1011 1012 1012 1012 ...
## $ visib : num [1:335220] 10 10 10 10 10 10 10 10 10 10 ...
# Scatter plot of departure delay against wind gust for flights whose delay is
# non-negative, coloured by origin airport, with a single overall linear fit.
df2 %>%
  filter(dep_delay >= 0) %>%
  ggplot(aes(x = wind_gust, y = dep_delay)) +
  geom_point(aes(color = origin)) +
  geom_smooth(method = "lm") +
  labs(
    x = "Wind Gusts(mph)",
    y = "Flight delay (minutes)",
    title = "Departure Delay - NYC Airports"
  )
## `geom_smooth()` using formula 'y ~ x'
# Median departure delay over all joined flights.
median(df2[["dep_delay"]])
## [1] -1
# Mean wind gust over all joined flights.
mean(df2[["wind_gust"]])
## [1] 6.053964
# Histogram of wind gust values above 35 mph across the joined flights, in
# 5 mph bins, with the origin airports drawn side by side.
# NOTE(review): the title mentions delay, but nothing here filters on or plots
# dep_delay -- the bars count all flights in each gust bin; confirm the wording.
df2 %>%
  filter(wind_gust > 35) %>%
  ggplot(aes(x = wind_gust, fill = origin)) +
  geom_histogram(position = "dodge", binwidth = 5) +
  labs(
    x = "Wind Gust (mph)",
    y = "frequency",
    title = "Delay Frequency: NYC Airports
with Wind Gust > 35mph"
  )