Code Along 11

library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library("lubridate")

df <- tibble::tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

df$a <- (df$a - min(df$a, na.rm = TRUE)) / 
  (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) / 
  (max(df$b, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) / 
  (max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) / 
  (max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))

(df$a - min(df$a, na.rm = TRUE)) /
  (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))

##  [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
##  [8] 0.3553321 0.1823770 0.4247616

x <- df$a
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))

##  [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
##  [8] 0.3553321 0.1823770 0.4247616

#>  [1] 0.2892677 0.7509271 0.0000000 0.6781686 0.8530656 1.0000000 0.1716402
#>  [8] 0.6107464 0.6116181 0.6008793

rng <- range(x, na.rm = TRUE)
(x - rng[1]) / (rng[2] - rng[1])

##  [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
##  [8] 0.3553321 0.1823770 0.4247616

#>  [1] 0.2892677 0.7509271 0.0000000 0.6781686 0.8530656 1.0000000 0.1716402
#>  [8] 0.6107464 0.6116181 0.6008793

rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
rescale01(c(0, 5, 10))

## [1] 0.0 0.5 1.0

#> [1] 0.0 0.5 1.0

rescale01(c(-10, 0, 10))

## [1] 0.0 0.5 1.0

#> [1] 0.0 0.5 1.0
rescale01(c(1, 2, 3, NA, 5))

## [1] 0.00 0.25 0.50   NA 1.00

#> [1] 0.00 0.25 0.50   NA 1.00

df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)

x <- c(1:10, Inf)
rescale01(x)

##  [1]   0   0   0   0   0   0   0   0   0   0 NaN

#>  [1]   0   0   0   0   0   0   0   0   0   0 NaN

rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE, finite = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
rescale01(x)

##  [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
##  [8] 0.7777778 0.8888889 1.0000000       Inf

#>  [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
#>  [8] 0.7777778 0.8888889 1.0000000       Inf

##Exercises 1. Why is TRUE not a parameter to rescale01()? What would happen if x contained a single missing value, and na.rm was FALSE? If x contains a single missing value and na.rm = FALSE, then this function stills return a non-missing value. The option finite = TRUE to range() will drop all non-finite elements, and NA is a non-finite element.

2.In the second variant of rescale01(), infinite values are left unchanged. Rewrite rescale01() so that -Inf is mapped to 0, and Inf is mapped to 1.

rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE, finite = TRUE)
  y <- (x - rng[1]) / (rng[2] - rng[1])
  y[y == -Inf] <- 0
  y[y == Inf] <- 1
  y
}

rescale01(c(Inf, -Inf, 0:5, NA))

## [1] 1.0 0.0 0.0 0.2 0.4 0.6 0.8 1.0  NA

#> [1] 1.0 0.0 0.0 0.2 0.4 0.6 0.8 1.0  NA

Practice turning the following code snippets into functions. Think about what each function does. What would you call it? How many arguments does it need? Can you rewrite it to be more expressive or less duplicative?

mean(is.na(x))

## [1] 0

x / sum(x, na.rm = TRUE)

##  [1]   0   0   0   0   0   0   0   0   0   0 NaN

sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)

## [1] NaN

coef_variation <- function(x, na.rm = FALSE) {
  sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)
}
coef_variation(1:5)

## [1] 0.5270463

#> [1] 0.527
coef_variation(c(1:5, NA))

## [1] NA

#> [1] NA
coef_variation(c(1:5, NA), na.rm = TRUE)

## [1] 0.5270463

#> [1] 0.527

4.write your own functions to compute the variance and skewness of a numeric vector. Variance is defined as

variance <- function(x, na.rm = TRUE) {
  n <- length(x)
  m <- mean(x, na.rm = TRUE)
  sq_err <- (x - m)^2
  sum(sq_err) / (n - 1)
}

var(1:10)

## [1] 9.166667

#> [1] 9.17
variance(1:10)

## [1] 9.166667

#> [1] 9.17

5.Write both_na(), a function that takes two vectors of the same length and returns the number of positions that have an NA in both vectors.

both_na <- function(x, y) {
  sum(is.na(x) & is.na(y))
}
both_na(
  c(NA, NA, 1, 2),
  c(NA, 1, NA, 2)
)

## [1] 1

#> [1] 1
both_na(
  c(NA, NA, 1, 2, NA, NA, 1),
  c(NA, 1, NA, 2, NA, NA, 1)
)

## [1] 3

#> [1] 3

6.What do the following functions do? Why are they useful even though they are so short?

is_directory <- function(x) file.info(x)$isdir
is_readable <- function(x) file.access(x, 4) == 0

The function is_directory() checks whether the path in x is a directory. The function is_readable() checks whether the path in x is readable, meaning that the file exists and the user has permission to open it. These functions are useful even though they are short because their names make it much clearer what the code is doing. 7.Read the complete lyrics to “Little Bunny Foo Foo”. There’s a lot of duplication in this song. Extend the initial piping example to recreate the complete song, and use functions to reduce the duplication. threat <- function(chances) { give_chances( from = Good_Fairy, to = foo_foo, number = chances, condition = “Don’t behave”, consequence = turn_into_goon ) }

lyric <- function() { foo_foo %>% hop(through = forest) %>% scoop(up = field_mouse) %>% bop(on = head)

down_came(Good_Fairy) said( Good_Fairy, c( “Little bunny Foo Foo”, “I don’t want to see you”, “Scooping up the field mice”, “And bopping them on the head.” ) ) }

lyric() threat(3) lyric() threat(2) lyric() threat(1) lyric() turn_into_goon(Good_Fairy, foo_foo)

Functions are for humans and computers

# Never do this!
col_mins <- function(x, y) {}
rowMaxes <- function(y, x) {}

# Don't do this!
T <- FALSE
c <- 10
mean <- function(x) sum(x)

# Load data --------------------------------------

# Plot data --------------------------------------

Exercises

Read the source code for each of the following three functions, puzzle out what they do, and then brainstorm better names.

f1 <- function(string, prefix) {
  substr(string, 1, nchar(prefix)) == prefix
}
f2 <- function(x) {
  if (length(x) <= 1) return(NULL)
  x[-length(x)]
}
f3 <- function(x, y) {
  rep(y, length.out = length(x))
}

The function f1 tests whether each element of the character vector nchar starts with the string prefix. A better name for f1 is has_prefix()

The function f2 drops the last element of the vector x.

A better name for f2 is drop_last().

The function f3 repeats y once for each element of x. 2. Compare and contrast rnorm() and MASS::mvrnorm(). How could you make them more consistent?

rnorm() samples from the univariate normal distribution, while MASS::mvrnorm samples from the multivariate normal distribution. The main arguments in rnorm() are n, mean, sd. The main arguments is MASS::mvrnorm are n, mu, Sigma. To be consistent they should have the same names. However, this is difficult. In general, it is better to be consistent with more widely used functions, e.g. rmvnorm() should follow the conventions of rnorm(). However, while mean is correct in the multivariate case, sd does not make sense in the multivariate case. However, both functions are internally consistent. It would not be good practice to have mu and sd as arguments or mean and Sigma as arguments.

3.Make a case for why norm_r(), norm_d() etc would be better than rnorm(), dnorm(). Make a case for the opposite. If named norm_r() and norm_d(), the naming convention groups functions by their distribution.

If named rnorm(), and dnorm(), the naming convention groups functions by the action they perform.

##Conditional execution

has_name <- function(x) {
  nms <- names(x)
  if (is.null(nms)) {
    rep(FALSE, length(x))
  } else {
    !is.na(nms) & nms != ""
  }
}

identical(0L, 0)

## [1] FALSE

#> [1] FALSE

x <- sqrt(2) ^ 2
x

## [1] 2

#> [1] 2
x == 2

## [1] FALSE

#> [1] FALSE
x - 2

## [1] 4.440892e-16

#> [1] 4.440892e-16

y <- 10
x <- if (y < 20) "Too low" else "Too high"

if (y < 20) {
  x <- "Too low" 
} else {
  x <- "Too high"
}

Exercises

1.What’s the difference between if and ifelse()? Carefully read the help and construct three examples that illustrate the key differences.

The keyword if tests a single condition, while ifelse() tests each element.

2.Write a greeting function that says “good morning”, “good afternoon”, or “good evening”, depending on the time of day. (Hint: use a time argument that defaults to lubridate::now(). That will make it easier to test your function.)

greet <- function(time = lubridate::now()) {
  hr <- lubridate::hour(time)
  # I don't know what to do about times after midnight,
  # are they evening or morning?
  if (hr < 12) {
    print("good morning")
  } else if (hr < 17) {
    print("good afternoon")
  } else {
    print("good evening")
  }
}
greet()

## [1] "good afternoon"

#> [1] "good morning"
greet(ymd_h("2017-01-08:05"))

## [1] "good morning"

#> [1] "good morning"
greet(ymd_h("2017-01-08:13"))

## [1] "good afternoon"

#> [1] "good afternoon"
greet(ymd_h("2017-01-08:20"))

## [1] "good evening"

#> [1] "good evening"

Implement a fizzbuzz() function. It takes a single number as input. If the number is divisible by three, it returns “fizz”. If it’s divisible by five it returns “buzz”. If it’s divisible by three and five, it returns “fizzbuzz”. Otherwise, it returns the number. Make sure you first write working code before you create the function.

1:10 %% 3 == 0

##  [1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE

#>  [1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE

!(1:10 %% 3)

##  [1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE

#>  [1] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE

fizzbuzz <- function(x) {
  # these two lines check that x is a valid input
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  if (!(x %% 3) && !(x %% 5)) {
    "fizzbuzz"
  } else if (!(x %% 3)) {
    "fizz"
  } else if (!(x %% 5)) {
    "buzz"
  } else {
    # ensure that the function returns a character vector
    as.character(x)
  }
}
fizzbuzz(6)

## [1] "fizz"

#> [1] "fizz"
fizzbuzz(10)

## [1] "buzz"

#> [1] "buzz"
fizzbuzz(15)

## [1] "fizzbuzz"

#> [1] "fizzbuzz"
fizzbuzz(2)

## [1] "2"

#> [1] "2"

fizzbuzz2 <- function(x) {
  # these two lines check that x is a valid input
  stopifnot(length(x) == 1)
  stopifnot(is.numeric(x))
  if (!(x %% 3)) {
    if (!(x %% 5)) {
      "fizzbuzz"
    } else {
      "fizz"
    }
  } else if (!(x %% 5)) {
    "buzz"
  } else {
    # ensure that the function returns a character vector
    as.character(x)
  }
}
fizzbuzz2(6)

## [1] "fizz"

#> [1] "fizz"
fizzbuzz2(10)

## [1] "buzz"

#> [1] "buzz"
fizzbuzz2(15)

## [1] "fizzbuzz"

#> [1] "fizzbuzz"
fizzbuzz2(2)

## [1] "2"

#> [1] "2"

fizzbuzz_vec <- function(x) {
  case_when(!(x %% 3) & !(x %% 5) ~ "fizzbuzz",
            !(x %% 3) ~ "fizz",
            !(x %% 5) ~ "buzz",
            TRUE ~ as.character(x)
            )
}
fizzbuzz_vec(c(0, 1, 2, 3, 5, 9, 10, 12, 15))

## [1] "fizzbuzz" "1"        "2"        "fizz"     "buzz"     "fizz"     "buzz"    
## [8] "fizz"     "fizzbuzz"

#> [1] "fizzbuzz" "1"        "2"        "fizz"     "buzz"     "fizz"     "buzz"    
#> [8] "fizz"     "fizzbuzz"

fizzbuzz_vec2 <- function(x) {
  y <- as.character(x)
  # put the individual cases first - any elements divisible by both 3 and 5
  # will be overwritten with fizzbuzz later
  y[!(x %% 3)] <- "fizz"
  y[!(x %% 3)] <- "buzz"
  y[!(x %% 3) & !(x %% 5)] <- "fizzbuzz"
  y
}
fizzbuzz_vec2(c(0, 1, 2, 3, 5, 9, 10, 12, 15))

## [1] "fizzbuzz" "1"        "2"        "buzz"     "5"        "buzz"     "10"      
## [8] "buzz"     "fizzbuzz"

#> [1] "fizzbuzz" "1"        "2"        "buzz"     "5"        "buzz"     "10"      
#> [8] "buzz"     "fizzbuzz"

How could you use cut() to simplify this set of nested if-else statements?

Two advantages of using cut is that it works on vectors, whereas if only works on a single value

5.What happens if you use switch() with numeric values? In switch(n, …), if n is numeric, it will return the nth argument from …. This means that if n = 1, switch() will return the first argument in …, if n = 2, the second, and so on.

6.What does this switch() call do? What happens if x is “e”?

switch(x, 
  a = ,
  b = "ab",
  c = ,
  d = "cd"
)

The switch() function returns the first non-missing argument value for the first name it matches. Thus, when switch() encounters an argument with a missing value, like a = ,, it will return the value of the next argument with a non missing value, which in this case is b = “ab”. If object in switch(object=) is not equal to the names of any of its arguments, switch() will return either the last (unnamed) argument if one is present or NULL. Since “e” is not one of the named arguments in switch() (a, b, c, d), and no other unnamed default value is present, this code will return NULL.

switch(x,
  a = "ab",
  b = "ab",
  c = "cd",
  d = "cd",
  NULL # value to return if x not matched
)

## NULL

Function arguments

# Compute confidence interval around mean using normal approximation
mean_ci <- function(x, conf = 0.95) {
  se <- sd(x) / sqrt(length(x))
  alpha <- 1 - conf
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

x <- runif(100)
mean_ci(x)

## [1] 49.56310 49.67737

#> [1] 0.4976111 0.6099594
mean_ci(x, conf = 0.99)

## [1] 49.54515 49.69532

#> [1] 0.4799599 0.6276105

wt_mean <- function(x, w) {
  sum(x * w) / sum(w)
}
wt_var <- function(x, w) {
  mu <- wt_mean(x, w)
  sum(w * (x - mu) ^ 2) / sum(w)
}
wt_sd <- function(x, w) {
  sqrt(wt_var(x, w))
}

wt_mean(1:6, 1:3)

## [1] 7.666667

#> [1] 7.666667

wt_mean <- function(x, w) {
  if (length(x) != length(w)) {
    stop("`x` and `w` must be the same length", call. = FALSE)
  }
  sum(w * x) / sum(w)
}

wt_mean <- function(x, w, na.rm = FALSE) {
  if (!is.logical(na.rm)) {
    stop("`na.rm` must be logical")
  }
  if (length(na.rm) != 1) {
    stop("`na.rm` must be length 1")
  }
  if (length(x) != length(w)) {
    stop("`x` and `w` must be the same length", call. = FALSE)
  }
  
  if (na.rm) {
    miss <- is.na(x) | is.na(w)
    x <- x[!miss]
    w <- w[!miss]
  }
  sum(w * x) / sum(w)
}

Exercises

1.What does commas(letters, collapse = “-”) do? Why?

2.It’d be nice if you could supply multiple characters to the pad argument, e.g. rule(“Title”, pad = “-+”). Why doesn’t this currently work? How could you fix it?

What does the trim argument to mean() do? When might you use it?
The default value for the method argument to cor() is c(“pearson”, “kendall”, “spearman”). What does that mean? What value is used by default?

Code Along 11

Aaric Adams

2024-11-26

Functions are for humans and computers

Exercises

Exercises

Function arguments

Exercises