library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("lubridate")
df <- tibble::tibble(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
df$a <- (df$a - min(df$a, na.rm = TRUE)) /
(max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) /
(max(df$b, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) /
(max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) /
(max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
(df$a - min(df$a, na.rm = TRUE)) /
(max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
## [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
## [8] 0.3553321 0.1823770 0.4247616
x <- df$a
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
## [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
## [8] 0.3553321 0.1823770 0.4247616
#> [1] 0.2892677 0.7509271 0.0000000 0.6781686 0.8530656 1.0000000 0.1716402
#> [8] 0.6107464 0.6116181 0.6008793
rng <- range(x, na.rm = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
## [1] 0.4434324 1.0000000 0.6694723 0.1702379 0.1660593 0.0000000 0.6158430
## [8] 0.3553321 0.1823770 0.4247616
#> [1] 0.2892677 0.7509271 0.0000000 0.6781686 0.8530656 1.0000000 0.1716402
#> [8] 0.6107464 0.6116181 0.6008793
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale01(c(0, 5, 10))
## [1] 0.0 0.5 1.0
#> [1] 0.0 0.5 1.0
rescale01(c(-10, 0, 10))
## [1] 0.0 0.5 1.0
#> [1] 0.0 0.5 1.0
rescale01(c(1, 2, 3, NA, 5))
## [1] 0.00 0.25 0.50 NA 1.00
#> [1] 0.00 0.25 0.50 NA 1.00
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)
x <- c(1:10, Inf)
rescale01(x)
## [1] 0 0 0 0 0 0 0 0 0 0 NaN
#> [1] 0 0 0 0 0 0 0 0 0 0 NaN
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE, finite = TRUE)
(x - rng[1]) / (rng[2] - rng[1])
}
rescale01(x)
## [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
## [8] 0.7777778 0.8888889 1.0000000 Inf
#> [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
#> [8] 0.7777778 0.8888889 1.0000000 Inf
##Exercises 1. Why is TRUE not a parameter to rescale01()? What would happen if x contained a single missing value, and na.rm was FALSE? If x contains a single missing value and na.rm = FALSE, then this function stills return a non-missing value. The option finite = TRUE to range() will drop all non-finite elements, and NA is a non-finite element.
2.In the second variant of rescale01(), infinite values are left unchanged. Rewrite rescale01() so that -Inf is mapped to 0, and Inf is mapped to 1.
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE, finite = TRUE)
y <- (x - rng[1]) / (rng[2] - rng[1])
y[y == -Inf] <- 0
y[y == Inf] <- 1
y
}
rescale01(c(Inf, -Inf, 0:5, NA))
## [1] 1.0 0.0 0.0 0.2 0.4 0.6 0.8 1.0 NA
#> [1] 1.0 0.0 0.0 0.2 0.4 0.6 0.8 1.0 NA
mean(is.na(x))
## [1] 0
x / sum(x, na.rm = TRUE)
## [1] 0 0 0 0 0 0 0 0 0 0 NaN
sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
## [1] NaN
coef_variation <- function(x, na.rm = FALSE) {
sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)
}
coef_variation(1:5)
## [1] 0.5270463
#> [1] 0.527
coef_variation(c(1:5, NA))
## [1] NA
#> [1] NA
coef_variation(c(1:5, NA), na.rm = TRUE)
## [1] 0.5270463
#> [1] 0.527
4.write your own functions to compute the variance and skewness of a numeric vector. Variance is defined as
variance <- function(x, na.rm = TRUE) {
n <- length(x)
m <- mean(x, na.rm = TRUE)
sq_err <- (x - m)^2
sum(sq_err) / (n - 1)
}
var(1:10)
## [1] 9.166667
#> [1] 9.17
variance(1:10)
## [1] 9.166667
#> [1] 9.17
5.Write both_na(), a function that takes two vectors of the same length and returns the number of positions that have an NA in both vectors.
both_na <- function(x, y) {
sum(is.na(x) & is.na(y))
}
both_na(
c(NA, NA, 1, 2),
c(NA, 1, NA, 2)
)
## [1] 1
#> [1] 1
both_na(
c(NA, NA, 1, 2, NA, NA, 1),
c(NA, 1, NA, 2, NA, NA, 1)
)
## [1] 3
#> [1] 3
6.What do the following functions do? Why are they useful even though they are so short?
is_directory <- function(x) file.info(x)$isdir
is_readable <- function(x) file.access(x, 4) == 0
The function is_directory() checks whether the path in x is a directory. The function is_readable() checks whether the path in x is readable, meaning that the file exists and the user has permission to open it. These functions are useful even though they are short because their names make it much clearer what the code is doing. 7.Read the complete lyrics to “Little Bunny Foo Foo”. There’s a lot of duplication in this song. Extend the initial piping example to recreate the complete song, and use functions to reduce the duplication. threat <- function(chances) { give_chances( from = Good_Fairy, to = foo_foo, number = chances, condition = “Don’t behave”, consequence = turn_into_goon ) }
lyric <- function() { foo_foo %>% hop(through = forest) %>% scoop(up = field_mouse) %>% bop(on = head)
down_came(Good_Fairy) said( Good_Fairy, c( “Little bunny Foo Foo”, “I don’t want to see you”, “Scooping up the field mice”, “And bopping them on the head.” ) ) }
lyric() threat(3) lyric() threat(2) lyric() threat(1) lyric() turn_into_goon(Good_Fairy, foo_foo)
# Never do this!
col_mins <- function(x, y) {}
rowMaxes <- function(y, x) {}
# Don't do this!
T <- FALSE
c <- 10
mean <- function(x) sum(x)
# Load data --------------------------------------
# Plot data --------------------------------------
f1 <- function(string, prefix) {
substr(string, 1, nchar(prefix)) == prefix
}
f2 <- function(x) {
if (length(x) <= 1) return(NULL)
x[-length(x)]
}
f3 <- function(x, y) {
rep(y, length.out = length(x))
}
The function f1 tests whether each element of the character vector nchar starts with the string prefix. A better name for f1 is has_prefix()
The function f2 drops the last element of the vector x.
A better name for f2 is drop_last().
The function f3 repeats y once for each element of x. 2. Compare and contrast rnorm() and MASS::mvrnorm(). How could you make them more consistent?
rnorm() samples from the univariate normal distribution, while MASS::mvrnorm samples from the multivariate normal distribution. The main arguments in rnorm() are n, mean, sd. The main arguments is MASS::mvrnorm are n, mu, Sigma. To be consistent they should have the same names. However, this is difficult. In general, it is better to be consistent with more widely used functions, e.g. rmvnorm() should follow the conventions of rnorm(). However, while mean is correct in the multivariate case, sd does not make sense in the multivariate case. However, both functions are internally consistent. It would not be good practice to have mu and sd as arguments or mean and Sigma as arguments.
3.Make a case for why norm_r(), norm_d() etc would be better than rnorm(), dnorm(). Make a case for the opposite. If named norm_r() and norm_d(), the naming convention groups functions by their distribution.
If named rnorm(), and dnorm(), the naming convention groups functions by the action they perform.
##Conditional execution
has_name <- function(x) {
nms <- names(x)
if (is.null(nms)) {
rep(FALSE, length(x))
} else {
!is.na(nms) & nms != ""
}
}
identical(0L, 0)
## [1] FALSE
#> [1] FALSE
x <- sqrt(2) ^ 2
x
## [1] 2
#> [1] 2
x == 2
## [1] FALSE
#> [1] FALSE
x - 2
## [1] 4.440892e-16
#> [1] 4.440892e-16
y <- 10
x <- if (y < 20) "Too low" else "Too high"
if (y < 20) {
x <- "Too low"
} else {
x <- "Too high"
}
1.What’s the difference between if and ifelse()? Carefully read the help and construct three examples that illustrate the key differences.
The keyword if tests a single condition, while ifelse() tests each element.
2.Write a greeting function that says “good morning”, “good afternoon”, or “good evening”, depending on the time of day. (Hint: use a time argument that defaults to lubridate::now(). That will make it easier to test your function.)
greet <- function(time = lubridate::now()) {
hr <- lubridate::hour(time)
# I don't know what to do about times after midnight,
# are they evening or morning?
if (hr < 12) {
print("good morning")
} else if (hr < 17) {
print("good afternoon")
} else {
print("good evening")
}
}
greet()
## [1] "good afternoon"
#> [1] "good morning"
greet(ymd_h("2017-01-08:05"))
## [1] "good morning"
#> [1] "good morning"
greet(ymd_h("2017-01-08:13"))
## [1] "good afternoon"
#> [1] "good afternoon"
greet(ymd_h("2017-01-08:20"))
## [1] "good evening"
#> [1] "good evening"
1:10 %% 3 == 0
## [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
#> [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
!(1:10 %% 3)
## [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
#> [1] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
fizzbuzz <- function(x) {
# these two lines check that x is a valid input
stopifnot(length(x) == 1)
stopifnot(is.numeric(x))
if (!(x %% 3) && !(x %% 5)) {
"fizzbuzz"
} else if (!(x %% 3)) {
"fizz"
} else if (!(x %% 5)) {
"buzz"
} else {
# ensure that the function returns a character vector
as.character(x)
}
}
fizzbuzz(6)
## [1] "fizz"
#> [1] "fizz"
fizzbuzz(10)
## [1] "buzz"
#> [1] "buzz"
fizzbuzz(15)
## [1] "fizzbuzz"
#> [1] "fizzbuzz"
fizzbuzz(2)
## [1] "2"
#> [1] "2"
fizzbuzz2 <- function(x) {
# these two lines check that x is a valid input
stopifnot(length(x) == 1)
stopifnot(is.numeric(x))
if (!(x %% 3)) {
if (!(x %% 5)) {
"fizzbuzz"
} else {
"fizz"
}
} else if (!(x %% 5)) {
"buzz"
} else {
# ensure that the function returns a character vector
as.character(x)
}
}
fizzbuzz2(6)
## [1] "fizz"
#> [1] "fizz"
fizzbuzz2(10)
## [1] "buzz"
#> [1] "buzz"
fizzbuzz2(15)
## [1] "fizzbuzz"
#> [1] "fizzbuzz"
fizzbuzz2(2)
## [1] "2"
#> [1] "2"
fizzbuzz_vec <- function(x) {
case_when(!(x %% 3) & !(x %% 5) ~ "fizzbuzz",
!(x %% 3) ~ "fizz",
!(x %% 5) ~ "buzz",
TRUE ~ as.character(x)
)
}
fizzbuzz_vec(c(0, 1, 2, 3, 5, 9, 10, 12, 15))
## [1] "fizzbuzz" "1" "2" "fizz" "buzz" "fizz" "buzz"
## [8] "fizz" "fizzbuzz"
#> [1] "fizzbuzz" "1" "2" "fizz" "buzz" "fizz" "buzz"
#> [8] "fizz" "fizzbuzz"
fizzbuzz_vec2 <- function(x) {
y <- as.character(x)
# put the individual cases first - any elements divisible by both 3 and 5
# will be overwritten with fizzbuzz later
y[!(x %% 3)] <- "fizz"
y[!(x %% 3)] <- "buzz"
y[!(x %% 3) & !(x %% 5)] <- "fizzbuzz"
y
}
fizzbuzz_vec2(c(0, 1, 2, 3, 5, 9, 10, 12, 15))
## [1] "fizzbuzz" "1" "2" "buzz" "5" "buzz" "10"
## [8] "buzz" "fizzbuzz"
#> [1] "fizzbuzz" "1" "2" "buzz" "5" "buzz" "10"
#> [8] "buzz" "fizzbuzz"
Two advantages of using cut is that it works on vectors, whereas if only works on a single value
5.What happens if you use switch() with numeric values? In switch(n, …), if n is numeric, it will return the nth argument from …. This means that if n = 1, switch() will return the first argument in …, if n = 2, the second, and so on.
6.What does this switch() call do? What happens if x is “e”?
switch(x,
a = ,
b = "ab",
c = ,
d = "cd"
)
The switch() function returns the first non-missing argument value for the first name it matches. Thus, when switch() encounters an argument with a missing value, like a = ,, it will return the value of the next argument with a non missing value, which in this case is b = “ab”. If object in switch(object=) is not equal to the names of any of its arguments, switch() will return either the last (unnamed) argument if one is present or NULL. Since “e” is not one of the named arguments in switch() (a, b, c, d), and no other unnamed default value is present, this code will return NULL.
switch(x,
a = "ab",
b = "ab",
c = "cd",
d = "cd",
NULL # value to return if x not matched
)
## NULL
# Compute confidence interval around mean using normal approximation
mean_ci <- function(x, conf = 0.95) {
se <- sd(x) / sqrt(length(x))
alpha <- 1 - conf
mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
x <- runif(100)
mean_ci(x)
## [1] 49.56310 49.67737
#> [1] 0.4976111 0.6099594
mean_ci(x, conf = 0.99)
## [1] 49.54515 49.69532
#> [1] 0.4799599 0.6276105
wt_mean <- function(x, w) {
sum(x * w) / sum(w)
}
wt_var <- function(x, w) {
mu <- wt_mean(x, w)
sum(w * (x - mu) ^ 2) / sum(w)
}
wt_sd <- function(x, w) {
sqrt(wt_var(x, w))
}
wt_mean(1:6, 1:3)
## [1] 7.666667
#> [1] 7.666667
wt_mean <- function(x, w) {
if (length(x) != length(w)) {
stop("`x` and `w` must be the same length", call. = FALSE)
}
sum(w * x) / sum(w)
}
wt_mean <- function(x, w, na.rm = FALSE) {
if (!is.logical(na.rm)) {
stop("`na.rm` must be logical")
}
if (length(na.rm) != 1) {
stop("`na.rm` must be length 1")
}
if (length(x) != length(w)) {
stop("`x` and `w` must be the same length", call. = FALSE)
}
if (na.rm) {
miss <- is.na(x) | is.na(w)
x <- x[!miss]
w <- w[!miss]
}
sum(w * x) / sum(w)
}
1.What does commas(letters, collapse = “-”) do? Why?
2.It’d be nice if you could supply multiple characters to the pad argument, e.g. rule(“Title”, pad = “-+”). Why doesn’t this currently work? How could you fix it?
What does the trim argument to mean() do? When might you use it?
The default value for the method argument to cor() is c(“pearson”, “kendall”, “spearman”). What does that mean? What value is used by default?