library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Both `flights` and `df` start out as the raw nycflights13 flights table;
# `df` is the one that gets extended with new columns below.
flights <- nycflights13::flights
df <- flights
df |> head()
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Add a complete date column built from the year, month, and day columns.
# Build a Date column from the integer year/month/day columns.
# make_date() assembles the date numerically, so there is no need to paste the
# parts into a "Y-M-D" string and parse it back. `.before = year` keeps `date`
# in the same position unite() would put it (immediately before `year`), and
# the year/month/day columns are retained.
df <- df %>%
  mutate(date = make_date(year, month, day), .before = year)
# Per-month means of departure delay, distance and arrival delay.
# Missing values are ignored when averaging; each output column is named
# "<source column>_mean".
month_aggregate <- df |>
  group_by(month) |>
  summarise(
    across(
      c(dep_delay, distance, arr_delay),
      \(x) mean(x, na.rm = TRUE),
      .names = "{.col}_mean"
    )
  )
month_aggregate
## # A tibble: 12 × 4
## month dep_delay_mean distance_mean arr_delay_mean
## <int> <dbl> <dbl> <dbl>
## 1 1 10.0 1007. 6.13
## 2 2 10.8 1001. 5.61
## 3 3 13.2 1012. 5.81
## 4 4 13.9 1039. 11.2
## 5 5 13.0 1041. 3.52
## 6 6 20.8 1057. 16.5
## 7 7 21.7 1059. 16.7
## 8 8 12.6 1062. 6.04
## 9 9 6.72 1041. -4.02
## 10 10 6.24 1039. -0.167
## 11 11 5.44 1050. 0.461
## 12 12 16.6 1065. 14.9
# Mean arrival delay by calendar month.
# `month` is an integer 1-12, so the x-axis breaks are pinned to whole months
# instead of relying on the default continuous breaks, which can fall on
# fractional values that do not correspond to any month.
month_aggregate |>
  ggplot(aes(x = month, y = arr_delay_mean)) +
  geom_line() +
  scale_x_continuous(breaks = 1:12) +
  labs(x = "Month", y = "Mean arrival delay")
# Keep only the two delay columns for the delay-focused exploration.
df_delay <- select(df, dep_delay, arr_delay)

# Show the flights with the largest departure delays first.
arrange(df_delay, desc(dep_delay))
## # A tibble: 336,776 × 2
## dep_delay arr_delay
## <dbl> <dbl>
## 1 1301 1272
## 2 1137 1127
## 3 1126 1109
## 4 1014 1007
## 5 1005 989
## 6 960 931
## 7 911 915
## 8 899 850
## 9 898 895
## 10 896 878
## # ℹ 336,766 more rows
# Histogram of departure delay using six bins.
# The y-axis counts are formatted as whole numbers; scales::label_number() is
# the current replacement for the superseded scales::number_format() and takes
# the same `accuracy` argument.
df_delay |>
  ggplot(aes(x = dep_delay)) +
  geom_histogram(bins = 6) +
  scale_y_continuous(labels = scales::label_number(accuracy = 1))
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Correlation matrix of the two delay columns, using for each pair only the
# rows where both values are present.
df_delay |>
  cor(use = "pairwise.complete.obs")
## dep_delay arr_delay
## dep_delay 1.0000000 0.9148028
## arr_delay 0.9148028 1.0000000
# Departure delay against arrival delay, one point per flight.
ggplot(df_delay, aes(x = dep_delay, y = arr_delay)) +
  geom_point()
## Warning: Removed 9430 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Express each flight date as a plain number: days elapsed since 1970-01-01.
df <- df %>%
  mutate(days_after_epoch = as.numeric(date - ymd("1970-01-01")))

# Distribution of flights over that numeric day axis (default binning).
df %>%
  ggplot(aes(x = days_after_epoch)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Inspect the rows that have no recorded departure delay.
df %>%
  filter(is.na(dep_delay))
## # A tibble: 8,255 × 21
## date year month day dep_time sched_dep_time dep_delay arr_time
## <date> <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013-01-01 2013 1 1 NA 1630 NA NA
## 2 2013-01-01 2013 1 1 NA 1935 NA NA
## 3 2013-01-01 2013 1 1 NA 1500 NA NA
## 4 2013-01-01 2013 1 1 NA 600 NA NA
## 5 2013-01-02 2013 1 2 NA 1540 NA NA
## 6 2013-01-02 2013 1 2 NA 1620 NA NA
## 7 2013-01-02 2013 1 2 NA 1355 NA NA
## 8 2013-01-02 2013 1 2 NA 1420 NA NA
## 9 2013-01-02 2013 1 2 NA 1321 NA NA
## 10 2013-01-02 2013 1 2 NA 1545 NA NA
## # ℹ 8,245 more rows
## # ℹ 13 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # days_after_epoch <dbl>
# Correlation between the numeric day index and departure delay.
# dep_delay contains missing values, and cor()'s default handling propagates
# them so the off-diagonal entries come out as NA. Restrict each pair to rows
# where both values are present, as done for the dep_delay/arr_delay
# correlation.
df %>%
  select(days_after_epoch, dep_delay) %>%
  cor(use = "pairwise.complete.obs")
## days_after_epoch dep_delay
## days_after_epoch 1 NA
## dep_delay NA 1
# Smoothed departure delay over the year, fitted on a random sample of
# 10,000 flights. slice_sample() is the current replacement for the
# superseded sample_n().
# A bare geom_smooth() on a sample this size also selects a GAM with this same
# formula, so stacking it on top of geom_smooth(method = "gam") draws the same
# curve twice; a single layer with the method and formula spelled out is
# sufficient.
df %>%
  slice_sample(n = 10000) %>%
  ggplot(aes(x = date, y = dep_delay)) +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"))
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 227 rows containing non-finite outside the scale range
## (`stat_smooth()`).