Exercise -12

Author

P K Parida

Logical vector

This chapter deals with logical vectors and two useful functions, if_else() and case_when().

library(tidyverse)

Warning: package 'ggplot2' was built under R version 4.3.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)

Warning: package 'nycflights13' was built under R version 4.3.3

Exercise

12.2.4 Exercises

How does dplyr::near() work? Type near to see the source code. Is sqrt(2)^2 near 2?

# Referring to the function by name, without calling it, prints its source.
dplyr::near

function (x, y, tol = .Machine$double.eps^0.5) 
{
    abs(x - y) < tol
}
<bytecode: 0x000001df1c00dd98>
<environment: namespace:dplyr>

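Instead of checking for exact equality, near() checks that two numbers are within a certain tolerance, tol. By default the tolerance is the square root of .Machine$double.eps (about 1.5e-8). .Machine$double.eps is the machine epsilon: the smallest positive floating-point number x such that 1 + x != 1 (about 2.2e-16); it is not the smallest number the computer can represent. As shown below, sqrt(2)^2 is not exactly equal to 2, but it is near 2.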

# The result prints as 2, but the exact-equality check that follows is FALSE,
# so the stored value is not exactly 2.
sqrt(2)^2

[1] 2

# Exact comparison with `==`: fails because of floating-point rounding error.
sqrt(2)^2 == 2

[1] FALSE

# near() tests abs(x - y) < tol, so the tiny rounding error is tolerated.
near(sqrt(2)^2, 2)

[1] TRUE

Q. 2. Use mutate(), is.na(), and count() together to describe how the missing values in dep_time, sched_dep_time and dep_delay are connected.

# Flag each flight by whether dep_time is missing, then tally the flags.
# count() only uses the flagged column, so no select() is needed first.
flights %>%
  mutate(dep_time1 = is.na(dep_time)) %>%
  count(dep_time1)

# A tibble: 2 × 2
  dep_time1      n
  <lgl>      <int>
1 FALSE     328521
2 TRUE        8255

# Flag each flight by whether sched_dep_time is missing, then tally the flags.
# count() only uses the flagged column, so no select() is needed first.
flights %>%
  mutate(sched_dep_time1 = is.na(sched_dep_time)) %>%
  count(sched_dep_time1)

# A tibble: 1 × 2
  sched_dep_time1      n
  <lgl>            <int>
1 FALSE           336776

# Flag each flight by whether dep_delay is missing, then tally the flags.
# count() only uses the flagged column, so no select() is needed first.
flights %>%
  mutate(dep_delay1 = is.na(dep_delay)) %>%
  count(dep_delay1)

# A tibble: 2 × 2
  dep_delay1      n
  <lgl>       <int>
1 FALSE      328521
2 TRUE         8255

Exercise-2

Exercise 12.3.4

Find all flights where arr_delay is missing but dep_delay is not. Find all flights where neither arr_time nor sched_arr_time are missing, but arr_delay is.

# Flights with a recorded dep_delay but a missing arr_delay.
# is.na() is the idiomatic missing-value test; `x %in% NA` does not match NaN.
# Both conditions go in one filter(): comma-separated conditions are ANDed.
flights %>%
  filter(!is.na(dep_delay), is.na(arr_delay))

# A tibble: 1,175 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1     1525           1530        -5     1934           1805
 2  2013     1     1     1528           1459        29     2002           1647
 3  2013     1     1     1740           1745        -5     2158           2020
 4  2013     1     1     1807           1738        29     2251           2103
 5  2013     1     1     1939           1840        59       29           2151
 6  2013     1     1     1952           1930        22     2358           2207
 7  2013     1     1     2016           1930        46       NA           2220
 8  2013     1     2      905            822        43     1313           1045
 9  2013     1     2     1125            925       120     1445           1146
10  2013     1     2     1848           1840         8     2333           2151
# ℹ 1,165 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Find all flights where neither arr_time nor sched_arr_time are missing, but arr_delay is

# Flights where arr_time and sched_arr_time are both recorded but arr_delay
# is missing. is.na() is the idiomatic missing-value test; `x %in% NA` does
# not match NaN. Comma-separated filter() conditions are ANDed.
flights %>%
  filter(!is.na(arr_time), !is.na(sched_arr_time), is.na(arr_delay))

# A tibble: 717 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1     1525           1530        -5     1934           1805
 2  2013     1     1     1528           1459        29     2002           1647
 3  2013     1     1     1740           1745        -5     2158           2020
 4  2013     1     1     1807           1738        29     2251           2103
 5  2013     1     1     1939           1840        59       29           2151
 6  2013     1     1     1952           1930        22     2358           2207
 7  2013     1     2      905            822        43     1313           1045
 8  2013     1     2     1125            925       120     1445           1146
 9  2013     1     2     1848           1840         8     2333           2151
10  2013     1     2     1849           1724        85     2235           1938
# ℹ 707 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

How many flights have a missing dep_time? What other variables are missing in these rows? What might these rows represent?

# Flights with a missing dep_time.
# is.na() is the idiomatic missing-value test; `x %in% NA` does not match NaN.
flights %>%
  filter(is.na(dep_time))

# A tibble: 8,255 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1       NA           1630        NA       NA           1815
 2  2013     1     1       NA           1935        NA       NA           2240
 3  2013     1     1       NA           1500        NA       NA           1825
 4  2013     1     1       NA            600        NA       NA            901
 5  2013     1     2       NA           1540        NA       NA           1747
 6  2013     1     2       NA           1620        NA       NA           1746
 7  2013     1     2       NA           1355        NA       NA           1459
 8  2013     1     2       NA           1420        NA       NA           1644
 9  2013     1     2       NA           1321        NA       NA           1536
10  2013     1     2       NA           1545        NA       NA           1910
# ℹ 8,245 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Assuming that a missing dep_time implies that a flight is cancelled, look at the number of cancelled flights per day. Is there a pattern? Is there a connection between the proportion of cancelled flights and the average delay of non-cancelled flights?

# Per-day counts of flights with a missing dep_time (treated as cancelled),
# flights with a recorded dep_time, and all flights.
# The grouping column is computed directly inside group_by().
flights %>%
  group_by(dep_date = lubridate::make_datetime(year, month, day)) %>%
  summarise(
    cancelled = sum(is.na(dep_time)),
    non_cancelled = sum(!is.na(dep_time)),
    n = n()
  )

# A tibble: 365 × 4
   dep_date            cancelled non_cancelled     n
   <dttm>                  <int>         <int> <int>
 1 2013-01-01 00:00:00         4           838   842
 2 2013-01-02 00:00:00         8           935   943
 3 2013-01-03 00:00:00        10           904   914
 4 2013-01-04 00:00:00         6           909   915
 5 2013-01-05 00:00:00         3           717   720
 6 2013-01-06 00:00:00         1           831   832
 7 2013-01-07 00:00:00         3           930   933
 8 2013-01-08 00:00:00         4           895   899
 9 2013-01-09 00:00:00         5           897   902
10 2013-01-10 00:00:00         3           929   932
# ℹ 355 more rows