DataCamp : Writing Functions in R

Chapter 1 : A quick refresher

Exploring lists

# tricky_list has a regression model stored in it.
# Let's see if we can drill down and pull out the slope estimate corresponding to the wt variable.

# Inspect the top-level element names to locate the regression model
names(tricky_list)

# Look at the names and the structure of the "model" element
names(getElement(tricky_list, "model"))
str(getElement(tricky_list, "model"))

# Reach the coefficients element in one step with a recursive index
tricky_list[[c("model", "coefficients")]]

# Extend the recursive index by one level to reach the wt element
tricky_list[[c("model", "coefficients", "wt")]]

# Print each element of the list in turn. seq_along(a) yields an empty
# sequence when the list is empty, whereas 1:length(a) would iterate over
# c(1, 0) and fail on a[[1]].
a <- list(2, 3, 5, 7, 11, 13)
for (i in seq_along(a)) {
  print(a[[i]])
}

## [1] 2
## [1] 3
## [1] 5
## [1] 7
## [1] 11
## [1] 13

A safer way to create the sequence

# Example data: four columns (a, b, c, d), each holding 10 draws from rnorm()
df <- as.data.frame(
  list(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))
)
print(df)

##              a          b           c          d
## 1   1.27207119  0.2330034  0.16066405  0.3086231
## 2  -0.67788805 -1.3324355 -0.77956676 -0.9367044
## 3  -1.57230799 -0.4823942 -0.03195792  1.5708032
## 4  -0.92337911  0.8126277  0.51724567  0.7787041
## 5  -1.00007431 -0.1927306 -0.65482097 -0.6214809
## 6   0.57200743 -1.3304990 -0.96793095 -0.7076832
## 7  -0.88153966  1.1016151 -0.15646724 -0.7871502
## 8  -0.54637169  1.0693052  1.87900356  0.1654758
## 9   0.19904137  0.7091375 -0.31666585 -0.1352767
## 10  0.01994523 -0.2265446  1.92180653 -1.3774047

df[1]

##              a
## 1   1.27207119
## 2  -0.67788805
## 3  -1.57230799
## 4  -0.92337911
## 5  -1.00007431
## 6   0.57200743
## 7  -0.88153966
## 8  -0.54637169
## 9   0.19904137
## 10  0.01994523

# Print the median of every column of df. seq_along(df) is used instead of
# 1:ncol(df) so that a data frame with zero columns produces no iterations
# rather than looping over c(1, 0).
for (i in seq_along(df)) {
  print(median(df[[i]]))
}

## [1] -0.6121299
## [1] 0.02013642
## [1] -0.09421258
## [1] -0.3783788

### Collect the column medians in a pre-allocated vector
# Pre-allocate a double vector with one slot per column of df
output <- numeric(ncol(df))

# Store the median of column i in slot i
for (i in seq_along(df)) {
  output[i] <- median(df[[i]])
}

# Show the collected medians
print(output)

## [1] -0.61212987  0.02013642 -0.09421258 -0.37837878

Chapter 2 : When and how should you write a function?

Count at how many positions two vectors both have a missing value.

# Example inputs: only position 3 is NA in both vectors
x <- c(1, 2, NA, 3, NA)
y <- c(NA, 3, NA, 3, 4)

# Count the positions at which both x and y are NA.
#
# Args:
#   x, y: Vectors to compare element by element.
#
# Returns:
#   The number of positions where x and y are both missing.
both_na <- function(x, y) {
  x_missing <- is.na(x)
  y_missing <- is.na(y)
  sum(x_missing & y_missing)
}

Remove last element of the data

# Drop the last element of x.
#
# Args:
#   x: A vector.
#
# Returns:
#   x without its final element, or NULL when x has one element or none.
f2 <- function(x) {
  n <- length(x)
  if (n > 1) {
    x[-n]
  } else {
    NULL
  }
}

return statements

# Normal-approximation confidence interval for the mean of x.
#
# Args:
#   x: A numeric vector.
#   level: Confidence level of the interval (default 0.95).
#
# Returns:
#   A length-2 numeric vector holding the lower and upper bounds. For an
#   empty x a warning is raised and c(-Inf, Inf) is returned.
mean_ci <- function(x, level = 0.95) {
  n <- length(x)

  # Nothing to estimate from: warn and return the widest possible interval
  if (n == 0) {
    warning("`x` was empty", call. = FALSE)
    return(c(-Inf, Inf))
  }

  std_err <- sd(x) / sqrt(n)
  tail_prob <- (1 - level) / 2
  mean(x) + std_err * qnorm(c(tail_prob, 1 - tail_prob))
}

What does this function do?

x = c(1, 2, NA, 4, 5)

# Replace the missing values in x with y, print the number of NAs that
# remain followed by y, and return the filled-in vector.
f <- function(x, y) {
  x <- replace(x, is.na(x), y)
  n_remaining <- sum(is.na(x))
  cat(n_remaining, y, "\n")
  x
}

f(x = x, y = 3)

## 0 3

## [1] 1 2 3 4 5

f(x = x, y = 10)

## 0 10

## [1]  1  2 10  4  5

Make the function's purpose clear from its name and make its body easier to understand

# Ten example values; the first two are missing
z <- c(
  NA, NA, -0.10618832, 1.27298018, -1.50027365,
  -0.17863732, 0.08291387, -0.94013111, 0.17204559, -0.19802701
)
df <- data.frame(z = z)
print(df)

##              z
## 1           NA
## 2           NA
## 3  -0.10618832
## 4   1.27298018
## 5  -1.50027365
## 6  -0.17863732
## 7   0.08291387
## 8  -0.94013111
## 9   0.17204559
## 10 -0.19802701

# Replace every missing value in x with replacement.
#
# Args:
#   x: A vector that may contain NAs.
#   replacement: The value written into each NA position.
#
# Returns:
#   x with its NAs replaced. As a side effect, the number of replaced
#   values is reported through message(), so callers can silence it with
#   suppressMessages().
replace_missings <- function(x, replacement) {
  is_miss <- is.na(x)
  x[is_miss] <- replacement

  # message() appends a newline itself (appendLF = TRUE), so the text must
  # not end in "\n" or a spurious blank line is emitted.
  message(sum(is_miss), " missings replaced by the value ", replacement, ".")
  x
}

# Check your new function by running on df$z
df$z = replace_missings(df$z, 0)

## 2 missings replaced by the value 0.

df

##              z
## 1   0.00000000
## 2   0.00000000
## 3  -0.10618832
## 4   1.27298018
## 5  -1.50027365
## 6  -0.17863732
## 7   0.08291387
## 8  -0.94013111
## 9   0.17204559
## 10 -0.19802701

Chapter 3 : Functional programming

Column median and mean functions

# Apply a summary function to every column of df.
#
# Args:
#   df: A data frame (or list) whose columns are summarised one by one.
#   fun: A function that reduces one column to a single number.
#
# Returns:
#   An unnamed numeric vector with one entry per column of df.
col_summary <- function(df, fun) {
  # Pre-allocate the result so the loop never grows the vector
  output <- numeric(length(df))
  for (i in seq_along(df)) {
    output[[i]] <- fun(df[[i]])
  }
  output
}

# Median of every column of df, returned as an unnamed numeric vector.
col_median <- function(df) {
  col_summary(df, median)
}

# Mean of every column of df, returned as an unnamed numeric vector.
col_mean <- function(df) {
  col_summary(df, mean)
}

Using a function as an argument

df = data.frame(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))
df

##              a           b           c            d
## 1  -0.19147619  0.74405728 -0.04165540 -1.415688661
## 2  -0.07873108  0.07572426  1.94396436  0.220628709
## 3  -0.09874482 -0.68824642  0.69102664 -1.006757652
## 4   0.66262529  0.41961151 -0.36087407 -0.066990957
## 5  -0.49803126  0.02491136 -1.20721582  0.550277622
## 6   1.84255104 -0.71041416  0.28037727 -0.679044497
## 7   0.82206754  1.00412151  0.09889471  0.473156931
## 8  -1.03645754 -0.97757050  0.09874980 -0.063416672
## 9  -1.20522212  0.36128055 -1.84000666  0.345021644
## 10  0.81312718  2.63900718  1.06275792  0.005860636

sapply(df, mean)

##           a           b           c           d 
##  0.10317080  0.28924826  0.07260187 -0.16369529

library(purrr)
map_dbl(df, mean)

##           a           b           c           d 
##  0.10317080  0.28924826  0.07260187 -0.16369529

The map functions differ in their return type

map() returns a list
map_dbl() returns a double vector
map_lgl() returns a logical vector
map_int() returns an integer vector
map_chr() returns a character vector

Advantages of the map functions in purrr

Handy shortcuts for specifying .f
More consistent than sapply(), lapply(), which makes them better for programming

# A data frame is iterated over column by column
df <- data.frame(a = 1:10, b = 11:20)
map(.x = df, .f = mean)

## $a
## [1] 5.5
## 
## $b
## [1] 15.5

# A list is iterated over element by element
l <- list(a = 1:10, b = 11:20)
map(.x = l, .f = mean)

## $a
## [1] 5.5
## 
## $b
## [1] 15.5

# An atomic vector is iterated over element by element
vec <- c(a = 1, b = 2)
map(.x = vec, .f = mean)

## $a
## [1] 1
## 
## $b
## [1] 2

The argument to the map functions

# Mean of every column of planes
map_dbl(.x = planes, .f = mean)

# Mean of every column, dropping missing values; na.rm is passed on to mean()
map_dbl(.x = planes, .f = mean, na.rm = TRUE)

# 5th percentile of every column, dropping missing values
map_dbl(.x = planes, .f = quantile, probs = 0.05, na.rm = TRUE)

DataCamp : Writing Functions in R

Rose Park

October 25, 2017

Chapter 1 : A quick refresher

Exploring lists

A safer way to create the sequence

Chapter 2 : When and how should you write a function?

Count at how many positions two vectors both have a missing value.

Remove last element of the data

return statements

What does this function do?

Make the function's purpose clear from its name and make its body easier to understand

Chapter 3 : Functional programming

Column median and mean functions

Using a function as an argument

The argument to the map functions