What happens if you parse a string that contains invalid dates?

# ymd() cannot parse "bananas": that element becomes NA and a warning is raised,
# while the valid string is still parsed.
ymd(c("2010-10-10", "bananas"))
## Warning: 1 failed to parse.
## [1] "2010-10-10" NA

Use the appropriate lubridate function to parse each of the following dates:

# Sample date strings in assorted layouts for the parsing exercise.
# Month name, day, year.
d1 <- "January 1, 2010"
# Year, abbreviated month name, day.
d2 <- "2015-Mar-07"
# Day, abbreviated month name, year.
d3 <- "06-Jun-2017"
# Month name, day, year in parentheses (a vector of two strings).
d4 <- c("August 19 (2015)", "July 1 (2015)")
# Numeric month/day/two-digit year.
d5 <- "12/30/14" # Dec 30, 2014


# Choose the parser whose letters (y, m, d) match the order in which the
# components appear in each string.
# d1: month, day, year.
mdy(d1)
## [1] "2010-01-01"
# d2: year, month, day.
ymd(d2)
## [1] "2015-03-07"
# d3: day, month, year.
dmy(d3)
## [1] "2017-06-06"
# d4: month, day, year; the parentheses around the year do not prevent parsing.
mdy(d4)
## [1] "2015-08-19" "2015-07-01"
# d5: month, day, two-digit year.
mdy(d5)
## [1] "2014-12-30"

See what happens if you don't call arrange() in the example above.

# Build a date-time from calendar parts plus a clock time stored as an HHMM
# number (e.g. 517 is split into hour 5 and minute 17).
#
# year, month, day: calendar components, passed straight to make_datetime().
# time: clock time as HHMM; the hour is time %/% 100, the minute is time %% 100.
# tz: time zone handed to make_datetime(); defaults to "EST".
# Returns the result of make_datetime(), with seconds fixed at 0.
# NOTE(review): callers in this file pass a whole column as tz; a POSIXct
# vector carries a single tzone attribute, so confirm that per-row zones
# behave as intended.
make_datetime_100 <- function(year, month, day, time, tz = "EST") {
  hh <- time %/% 100
  mm <- time %% 100
  make_datetime(
    year = year, month = month, day = day,
    hour = hh, min = mm, sec = 0, tz = tz
  )
}
# Lookup table with one row per airport: FAA code and time zone name.
airports1 <- dplyr::select(airports, faa, tzone)

# Attach each destination airport's time zone as `dest_tzone`, then print a
# column overview; glimpse() passes the data through to the assignment.
flights1 <- flights %>%
  left_join(airports1, by = c("dest" = "faa")) %>%
  rename(dest_tzone = tzone) %>%
  glimpse()
## Rows: 336,776
## Columns: 20
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
## $ dest_tzone     <chr> "America/Chicago", "America/Chicago", "America/New_York…
# Attach each origin airport's time zone as `origin_tzone`, overwriting
# flights1 with the widened table.
flights1 <- flights1 %>%
  left_join(airports1, by = c("origin" = "faa")) %>%
  rename(origin_tzone = tzone)
# Turn the four HHMM clock columns into date-times and keep only the columns
# used in the questions that follow. Rows with a missing actual departure or
# arrival time are dropped first. Departure columns are built with
# origin_tzone as tz, arrival columns with dest_tzone.
# NOTE(review): tz is given a whole column here; confirm that
# make_datetime() honours per-row zones as intended.
flights_dt <- flights1 %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    dep_time = make_datetime_100(
      year, month, day, dep_time,
      tz = origin_tzone
    ),
    sched_dep_time = make_datetime_100(
      year, month, day, sched_dep_time,
      tz = origin_tzone
    ),
    arr_time = make_datetime_100(
      year, month, day, arr_time,
      tz = dest_tzone
    ),
    sched_arr_time = make_datetime_100(
      year, month, day, sched_arr_time,
      tz = dest_tzone
    )
  ) %>%
  dplyr::select(
    origin, dest,
    ends_with("delay"), ends_with("time"),
    air_time, hour, origin_tzone, dest_tzone
  )

# Keep flights whose departure is later than midnight on 2013-06-01 in the
# system time zone. The rows are not sorted, so the first rows printed need
# not be the earliest ones.
filter(flights_dt, dep_time > ymd("2013-06-01", tz = Sys.timezone()))
## # A tibble: 194,177 × 12
##    origin dest  dep_delay arr_delay dep_time            sched_dep_time     
##    <chr>  <chr>     <dbl>     <dbl> <dttm>              <dttm>             
##  1 EWR    CLT         -13       -34 2013-10-01 04:47:00 2013-10-01 05:00:00
##  2 EWR    IAH           5       -22 2013-10-01 05:22:00 2013-10-01 05:17:00
##  3 JFK    MIA          -9       -46 2013-10-01 05:36:00 2013-10-01 05:45:00
##  4 LGA    IAH          -6       -26 2013-10-01 05:39:00 2013-10-01 05:45:00
##  5 JFK    SJU          -6       -16 2013-10-01 05:39:00 2013-10-01 05:45:00
##  6 JFK    BQN          -6       -20 2013-10-01 05:44:00 2013-10-01 05:50:00
##  7 JFK    IAD         -11       -23 2013-10-01 05:49:00 2013-10-01 06:00:00
##  8 LGA    PHL         -10       -12 2013-10-01 05:50:00 2013-10-01 06:00:00
##  9 LGA    DCA         -10       -10 2013-10-01 05:50:00 2013-10-01 06:00:00
## 10 EWR    ORD          -9        -3 2013-10-01 05:51:00 2013-10-01 06:00:00
## # ℹ 194,167 more rows
## # ℹ 6 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>,
## #   hour <dbl>, origin_tzone <chr>, dest_tzone <chr>

Find out which departure hour had the worst arrival delay, and which had the longest/shortest air time.

# Mean arrival delay per departure `hour`, restricted to flights that did not
# arrive early (arr_delay >= 0).
# After summarise() there is exactly one row per hour, so geom_col() draws the
# bars directly. stat_summary() would summarise each single value again and
# emit a "No summary function supplied" message.
flights_dt %>%
  filter(arr_delay >= 0) %>%
  group_by(hour) %>%
  summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = hour, y = mean_arr_delay)) +
  geom_col()

Answer: Flights departing at 7 pm had the worst arrival delay on average (among flights that did not arrive early).

# Mean air time per departure `hour`, restricted to flights that did not
# arrive early (arr_delay >= 0).
# After summarise() there is exactly one row per hour, so geom_col() draws the
# bars directly. stat_summary() would summarise each single value again and
# emit a "No summary function supplied" message.
flights_dt %>%
  filter(arr_delay >= 0) %>%
  group_by(hour) %>%
  summarise(mean_air_time = mean(air_time, na.rm = TRUE)) %>%
  ggplot(aes(x = hour, y = mean_air_time)) +
  geom_col()

Answer: Flights departing at 5 am had the longest air time on average, and flights departing at 10 pm had the shortest.

Run round_date(dt2, "week"). What do you get? Can you explain the result?

# A timestamp on 2023-04-05 at 3:12:34 pm, in the system time zone.
dt2 <- ymd_hms("2023-04-05 03:12:34pm", tz = Sys.timezone())
# Round to the nearest week boundary.
round_date(dt2, unit = "week")
## [1] "2023-04-09 EDT"

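Answer: The result is the first day of the following week (Sunday, 2023-04-09). round_date() rounds to the nearest week boundary, and with weeks starting on Sunday the midpoint of the week is Wednesday at noon. Because dt2 (Wednesday at about 3 pm) falls after that midpoint, it rounds up to the start of the next week rather than down to 2023-04-02.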