Chapter 1 : A quick refresher

Exploring lists

# tricky_list has a regression model stored in it.
# Goal: drill down through the nested list and pull out the slope estimate
# corresponding to the wt variable.

# List the top-level element names to guess where the regression model is stored
names(tricky_list)

# Inspect the "model" element: names() lists its components,
# str() shows their structure
names(tricky_list[["model"]])
str(tricky_list[["model"]])

# Extract the coefficients element of the model
tricky_list[["model"]][["coefficients"]]

# Extract the wt element of the coefficients (the slope estimate for wt)
tricky_list[["model"]][["coefficients"]][["wt"]]
# Print every element of the list, one per line.
# seq_along(a) is used rather than 1:length(a): for an empty list the latter
# expands to c(1, 0) and the loop body would run on invalid indices.
a <- list(2, 3, 5, 7, 11, 13)
for (i in seq_along(a)) {
  print(a[[i]])
}
## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13

A safer way to create the sequence

# Example data: four columns (a-d), each holding 10 standard-normal draws
df <- data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))
df
##              a          b           c          d
## 1   1.27207119  0.2330034  0.16066405  0.3086231
## 2  -0.67788805 -1.3324355 -0.77956676 -0.9367044
## 3  -1.57230799 -0.4823942 -0.03195792  1.5708032
## 4  -0.92337911  0.8126277  0.51724567  0.7787041
## 5  -1.00007431 -0.1927306 -0.65482097 -0.6214809
## 6   0.57200743 -1.3304990 -0.96793095 -0.7076832
## 7  -0.88153966  1.1016151 -0.15646724 -0.7871502
## 8  -0.54637169  1.0693052  1.87900356  0.1654758
## 9   0.19904137  0.7091375 -0.31666585 -0.1352767
## 10  0.01994523 -0.2265446  1.92180653 -1.3774047
df[1]
##              a
## 1   1.27207119
## 2  -0.67788805
## 3  -1.57230799
## 4  -0.92337911
## 5  -1.00007431
## 6   0.57200743
## 7  -0.88153966
## 8  -0.54637169
## 9   0.19904137
## 10  0.01994523
# Print the median of every column of df.
# seq_along(df) gives an empty sequence when df has no columns, whereas
# 1:ncol(df) would give c(1, 0) and the loop body would run on invalid indices.
for (i in seq_along(df)) {
  print(median(df[[i]]))
}
## [1] -0.6121299
## [1] 0.02013642
## [1] -0.09421258
## [1] -0.3783788
### Save the result to output vector
# Pre-allocate one double slot per column of df
output <- numeric(ncol(df))

# Fill each slot with the median of the matching column
for (col_idx in seq_along(df)) {
  output[[col_idx]] <- median(df[[col_idx]])
}

# Show the collected medians
output
## [1] -0.61212987  0.02013642 -0.09421258 -0.37837878

Chapter 2: When and how should you write a function?

Count the number of positions at which two vectors both have a missing value (NA).

x <- c( 1, 2, NA, 3, NA)
y <- c(NA, 3, NA, 3,  4)

# Turn this snippet into a function: both_na()
#' Count positions where both vectors are missing
#'
#' @param x A vector.
#' @param y A vector, compared element by element with `x`.
#' @return The number of positions at which `x` and `y` are both `NA`.
both_na <- function(x, y) {
  missing_in_x <- is.na(x)
  missing_in_y <- is.na(y)
  sum(missing_in_x & missing_in_y)
}

Remove last element of the data

#' Drop the last element
#'
#' @param x A vector or list.
#' @return `x` without its final element, or `NULL` when `x` has one element
#'   or none.
f2 <- function(x) {
  n <- length(x)
  if (n > 1) {
    x[-n]
  } else {
    NULL
  }
}

return statements

#' Normal-approximation confidence interval for a mean
#'
#' @param x A numeric vector.
#' @param level Confidence level of the interval (default 0.95).
#' @return A length-2 numeric vector: lower and upper bound. For empty `x`
#'   a warning is raised and `c(-Inf, Inf)` is returned.
mean_ci <- function(x, level = 0.95) {
  n <- length(x)

  # Guard clause: nothing to estimate from, so return the widest interval
  if (n == 0) {
    warning("`x` was empty", call. = FALSE)
    return(c(-Inf, Inf))
  }

  std_error <- sd(x) / sqrt(n)
  alpha <- 1 - level
  tail_probs <- c(alpha / 2, 1 - alpha / 2)
  mean(x) + std_error * qnorm(tail_probs)
}

What does this function do?

x = c(1, 2, NA, 4, 5)

# Replaces every NA in x with y, prints the number of NAs left in x after the
# replacement followed by y, and returns the modified x.
f <- function(x, y) {
  na_positions <- is.na(x)
  x[na_positions] <- y
  remaining_na <- sum(is.na(x))
  cat(remaining_na, y, "\n")
  x
}

f(x = x, y = 3)
## 0 3
## [1] 1 2 3 4 5
f(x = x, y = 10)
## 0 10
## [1]  1  2 10  4  5

Make the function's purpose clear from its name, and make its body easier to understand

z = c(NA, NA, -0.10618832, 1.27298018, -1.50027365, -0.17863732, 0.08291387, -0.94013111, 0.17204559, -0.19802701)
df = data.frame(z)
df
##              z
## 1           NA
## 2           NA
## 3  -0.10618832
## 4   1.27298018
## 5  -1.50027365
## 6  -0.17863732
## 7   0.08291387
## 8  -0.94013111
## 9   0.17204559
## 10 -0.19802701
#' Replace missing values in a vector
#'
#' @param x A vector that may contain `NA`s.
#' @param replacement Value assigned to every `NA` position of `x`.
#' @return `x` with its `NA` elements replaced by `replacement`. As a side
#'   effect, a message reports how many elements were replaced.
replace_missings <- function(x, replacement) {
  is_miss <- is.na(x)
  x[is_miss] <- replacement

  # message() appends its own newline, so the text carries no trailing "\n"
  # (an explicit one would print an extra blank line).
  message(sum(is_miss), " missings replaced by the value ", replacement, ".")
  x
}

# Check your new function by running on df$z
df$z = replace_missings(df$z, 0)
## 2 missings replaced by the value 0.
df
##              z
## 1   0.00000000
## 2   0.00000000
## 3  -0.10618832
## 4   1.27298018
## 5  -1.50027365
## 6  -0.17863732
## 7   0.08291387
## 8  -0.94013111
## 9   0.17204559
## 10 -0.19802701

Chapter 3 : Functional programming

Column median and mean functions

#' Median of each column
#'
#' @param df A data frame (or list) of numeric columns.
#' @return An unnamed numeric vector with one median per column of `df`.
col_median <- function(df) {
  medians <- vector("double", length(df))
  for (col_idx in seq_along(df)) {
    medians[[col_idx]] <- median(df[[col_idx]])
  }
  medians
}
#' Mean of each column
#'
#' @param df A data frame (or list) of numeric columns.
#' @return An unnamed numeric vector with one mean per column of `df`.
col_mean <- function(df) {
  means <- vector("double", length(df))
  for (col_idx in seq_along(df)) {
    means[[col_idx]] <- mean(df[[col_idx]])
  }
  means
}

Using a function as an argument

df = data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))
df
##              a           b           c            d
## 1  -0.19147619  0.74405728 -0.04165540 -1.415688661
## 2  -0.07873108  0.07572426  1.94396436  0.220628709
## 3  -0.09874482 -0.68824642  0.69102664 -1.006757652
## 4   0.66262529  0.41961151 -0.36087407 -0.066990957
## 5  -0.49803126  0.02491136 -1.20721582  0.550277622
## 6   1.84255104 -0.71041416  0.28037727 -0.679044497
## 7   0.82206754  1.00412151  0.09889471  0.473156931
## 8  -1.03645754 -0.97757050  0.09874980 -0.063416672
## 9  -1.20522212  0.36128055 -1.84000666  0.345021644
## 10  0.81312718  2.63900718  1.06275792  0.005860636
sapply(df, mean)
##           a           b           c           d 
##  0.10317080  0.28924826  0.07260187 -0.16369529
library(purrr)
map_dbl(df, mean)
##           a           b           c           d 
##  0.10317080  0.28924826  0.07260187 -0.16369529

The map functions differ in their return type

  • map() returns a list
  • map_dbl() returns a double vector
  • map_lgl() returns a logical vector
  • map_int() returns an integer vector
  • map_chr() returns a character vector

Advantages of the map functions in purrr

  • Handy shortcuts for specifying .f
  • More consistent than sapply(), lapply(), which makes them better for programming
# data frames, iterate over columns
df = data.frame(a = 1:10, b = 11:20)
map(df, mean)
## $a
## [1] 5.5
## 
## $b
## [1] 15.5
# list, iterate over elements
l = list(a = 1:10, b = 11:20)
map(l, mean)
## $a
## [1] 5.5
## 
## $b
## [1] 15.5
# vectors, iterate over elements
vec = c(a = 1, b = 2)
map(vec, mean)
## $a
## [1] 1
## 
## $b
## [1] 2

The ... argument to the map functions

# Find the mean of each column
map_dbl(planes, mean)

# Find the mean of each column, excluding missing values
map_dbl(planes, mean, na.rm = TRUE)

# Find the 5th percentile of each column, excluding missing values
map_dbl(planes, quantile, na.rm = TRUE, probs = 0.05 )