1 - Refresher

Writing a function

my_fun <- function(arg1, arg2) {
  # body
}
# Define ratio() function
#-- Change the function's arguments arg1 and arg2 
#-- to x and y for good practice

ratio <- function(x, y) {
  x/y
}

# Call ratio() with arguments 3 and 4
ratio(3,4) # matching by position
## [1] 0.75
ratio(x = 3, y = 4) # matching by name
## [1] 0.75

Arguments

# Rewrite the call to follow best practices
mean(0.1,x=c(1:9, NA),TRUE)
## [1] 5

mean(x = c(1:9, NA), trim = 0.1, na.rm = TRUE)
## [1] 5

# The first argument to mean() should be c(1:9, NA)
# there's no need to name this argument 
# since it comes first and has no default value.

mean(c(1:9, NA), trim = 0.1, na.rm = TRUE)
## [1] 5

Uses

# c is being used in three ways
# 1. function
# 2. name
# 3. refers to a value

c <- 3
c(c = c)  
## c 
## 3

Scoping

# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)
## Warning in rm(y): object 'y' not found

y <- 10
f <- function(x) {
  x + y
}

f(10)
## [1] 20

# Because y is not passed in as an argument to the function, 
# R looks outside of the function environment
# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)

y <- 10
f <- function(x) {
  y <- 5
  x + y
}

f(10)
## [1] 15

# value of x is passed in as an argument to the function
# value of y is defined inside of the function
# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)

f <- function(x) {
  y <- 5
  x + y
}

f(5)
## [1] 10

# Now, what will typing y return?

y
## Error in eval(expr, envir, enclos): object 'y' not found

# y is set equal to 5 within the body of the function
# object does not exist in the global environment.

Data Structures

  • Vectors:
    • Atomic vectors – homogeneous
      • logical
      • integer
      • double
      • character
      • complex
      • raw
    • Lists (aka recurrive vectors) – heterogeneous
      • lists can contain other lists

For every vector

# Property 1
typeof(letters)
## [1] "character"
typeof(1:10)
## [1] "integer"

# Property 2
length(letters)
## [1] 26
x <- list("a", "b", 1:10)
length(x) 
## [1] 3

Missing values

# NULL used to indicate the absence of a vector 
typeof(NULL)
## [1] "NULL"
length(NULL)
## [1] 0

# NA used to indicate the absence of a value in a vector
# i.e. missing value
typeof(NA)
## [1] "logical"
length(NA) 
## [1] 1

NAs inside vectors

x <- c(1, 2, 3, NA, 5)
x
## [1]  1  2  3 NA  5

is.na(x)
## [1] FALSE FALSE FALSE  TRUE FALSE

Missing values are contagious

NA + 10
## [1] NA

NA / 2
## [1] NA

NA > 5
## [1] NA

10 == NA
## [1] NA

NA == NA
## [1] NA

Subsetting Lists

a_without_names <- list(
    1:3,
    "a string",
    pi,
    list(-1, -5)
 )

a_without_names
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1] "a string"
## 
## [[3]]
## [1] 3.141593
## 
## [[4]]
## [[4]][[1]]
## [1] -1
## 
## [[4]][[2]]
## [1] -5

a <- list(
    a = 1:3,
    b = "a string",
    c = pi,
    d = list(-1, -5)
 )

a
## $a
## [1] 1 2 3
## 
## $b
## [1] "a string"
## 
## $c
## [1] 3.141593
## 
## $d
## $d[[1]]
## [1] -1
## 
## $d[[2]]
## [1] -5

a[1:2]
## $a
## [1] 1 2 3
## 
## $b
## [1] "a string"

a[4]
## $d
## $d[[1]]
## [1] -1
## 
## $d[[2]]
## [1] -5

str(a[4])
## List of 1
##  $ d:List of 2
##   ..$ : num -1
##   ..$ : num -5

a[[4]]
## [[1]]
## [1] -1
## 
## [[2]]
## [1] -5

str(a[[4]])
## List of 2
##  $ : num -1
##  $ : num -5

a[[4]][1]
## [[1]]
## [1] -1

str(a[[4]][1])
## List of 1
##  $ : num -1

a[[4]][[1]]
## [1] -1

str(a[[4]][[1]])
##  num -1

a[[4]][1] == a[[4]][[1]]
## [1] TRUE
# double bracket ([[]]) subsetting by index and by name.
# my_list[[1]] extracts the first element of the list my_list
# my_list[["name"]] extracts the element in my_list that is called name

# list is nested 
  # you can travel down the heirarchy by recursive subsetting
  # mylist[[1]][["name"]]  => element called name 
                            # inside the first element of my_list

# NB: data frame is just a special kind of list
# my_df[[1]] will extract the first column of a data frame

tricky_list <- list(
    nums= c(-0.08637812, -1.26158565, -0.19490648, -0.26042323,  1.04196393,  0.77327523, -0.01484575, -0.35573723, -0.25463426, -0.43844839),
    y = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE),
    x = list("hello!", "hi!", "goodbye!", "bye!"),
    model = lm(formula = mpg ~ wt, data = mtcars)
 )


# typeof()

# 2nd element in tricky_list
# The double brackets should be inside the parantheses of the typeof() function
typeof(tricky_list[[2]])
## [1] "logical"


# Element called x in tricky_list
typeof(tricky_list[["x"]])
## [1] "list"

# 2nd element inside the element called x in tricky_list
typeof(tricky_list[['x']][[2]])
## [1] "character"

Exploring Lists

# Goal: drill down and pull out the slope estimate corresponding to the wt variable.

# Guess where the regression model is stored
names(tricky_list)
## [1] "nums"  "y"     "x"     "model"

# Use names() and str() on the model element
names(tricky_list$model)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
str(tricky_list$model)
## List of 12
##  $ coefficients : Named num [1:2] 37.29 -5.34
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "wt"
##  $ residuals    : Named num [1:32] -2.28 -0.92 -2.09 1.3 -0.2 ...
##   ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ effects      : Named num [1:32] -113.65 -29.116 -1.661 1.631 0.111 ...
##   ..- attr(*, "names")= chr [1:32] "(Intercept)" "wt" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:32] 23.3 21.9 24.9 20.1 18.9 ...
##   ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:32, 1:2] -5.657 0.177 0.177 0.177 0.177 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "wt"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.18 1.05
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 30
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = mpg ~ wt, data = mtcars)
##  $ terms        :Classes 'terms', 'formula'  language mpg ~ wt
##   .. ..- attr(*, "variables")= language list(mpg, wt)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "mpg" "wt"
##   .. .. .. ..$ : chr "wt"
##   .. ..- attr(*, "term.labels")= chr "wt"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(mpg, wt)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
##  $ model        :'data.frame':   32 obs. of  2 variables:
##   ..$ mpg: num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##   ..$ wt : num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language mpg ~ wt
##   .. .. ..- attr(*, "variables")= language list(mpg, wt)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "mpg" "wt"
##   .. .. .. .. ..$ : chr "wt"
##   .. .. ..- attr(*, "term.labels")= chr "wt"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(mpg, wt)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
##  - attr(*, "class")= chr "lm"

names(tricky_list[["model"]])
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
str(tricky_list[["model"]])
## List of 12
##  $ coefficients : Named num [1:2] 37.29 -5.34
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "wt"
##  $ residuals    : Named num [1:32] -2.28 -0.92 -2.09 1.3 -0.2 ...
##   ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ effects      : Named num [1:32] -113.65 -29.116 -1.661 1.631 0.111 ...
##   ..- attr(*, "names")= chr [1:32] "(Intercept)" "wt" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:32] 23.3 21.9 24.9 20.1 18.9 ...
##   ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:32, 1:2] -5.657 0.177 0.177 0.177 0.177 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "wt"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.18 1.05
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 30
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = mpg ~ wt, data = mtcars)
##  $ terms        :Classes 'terms', 'formula'  language mpg ~ wt
##   .. ..- attr(*, "variables")= language list(mpg, wt)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "mpg" "wt"
##   .. .. .. ..$ : chr "wt"
##   .. ..- attr(*, "term.labels")= chr "wt"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(mpg, wt)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
##  $ model        :'data.frame':   32 obs. of  2 variables:
##   ..$ mpg: num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##   ..$ wt : num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language mpg ~ wt
##   .. .. ..- attr(*, "variables")= language list(mpg, wt)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "mpg" "wt"
##   .. .. .. .. ..$ : chr "wt"
##   .. .. ..- attr(*, "term.labels")= chr "wt"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(mpg, wt)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
##  - attr(*, "class")= chr "lm"



# Subset the coefficients element
coefficients(tricky_list$model)
## (Intercept)          wt 
##   37.285126   -5.344472
tricky_list$model[1]
## $coefficients
## (Intercept)          wt 
##   37.285126   -5.344472

tricky_list[["model"]][["coefficients"]]
## (Intercept)          wt 
##   37.285126   -5.344472

# Subset the wt element
tricky_list$model[1][2]
## $<NA>
## NULL
tricky_list$model[[1]][2]
##        wt 
## -5.344472
tricky_list$model[[1]][[2]]
## [1] -5.344472

tricky_list[["model"]][["coefficients"]][["wt"]]
## [1] -5.344472


# order of recursive subsets to get the element you want
# From left to right, 
# name of the elements you specify within the double brackets 
# should get more and more small-scale

# my_list[[x]][[y]][[z]]
# my_list contains x, 
#         which contains y, 
#         which contains z.

For loops

 df <- data.frame(
 a = rnorm(10),
 b = rnorm(10),
 c = rnorm(10),
 d = rnorm(10)
 )

# sequence = (i in 1:ncol(df))
# body = print(median(df[[i]]))

# Each time our for loop iterates, i takes the next value in 1:ncol(df)

for (i in 1:ncol(df)) {
 print(median(df[[i]]))
}
## [1] -0.3165703
## [1] -0.5276784
## [1] 0.1394496
## [1] 0.6225529


# not the best way to create seq
# e.g. empty df

df <- data.frame()
1:ncol(df)
## [1] 1 0

for (i in 1:ncol(df)) {
 print(median(df[[i]]))
}
## Error in .subset2(x, i, exact = exact): subscript out of bounds

# sequence is now the somewhat non-sensical: 1, 0. 

## Instead: 
## seq_along() 
## generates a sequence along the index of the object passed to it
## but handles the empty case much better



 df <- data.frame(
 a = rnorm(10),
 b = rnorm(10),
 c = rnorm(10),
 d = rnorm(10)
 )

# Replace the 1:ncol(df) sequence
for (i in seq_along(df)) {
  print(median(df[[i]]))
}
## [1] -0.6524705
## [1] 0.07543185
## [1] -0.107055
## [1] 0.1111691

 
 
# Change the value of df
df <- data.frame()

# Repeat for loop to verify there is no error
for (i in seq_along(df)) {
  print(median(df[[i]]))
}

For loops - keep output

# Before you start the loop
# allocate sufficient space for the output

# due to efficiency
#   if you grow the for loop at each iteration 
#   (e.g. using c()), 
#   your for loop will be very slow.

# vector()
# to create an empty vector of given length
# two arguments
# 1. type of the vector 
#    ("logical", "integer", "double", "character", etc.) 
# 2. length of the vector

# Create new double vector: output
output = vector(double, length = ncol(df))
## Error in vector(double, length = ncol(df)): cannot coerce type 'closure' to vector of type 'character'
output = vector(mode = "double", length = ncol(df))

for (i in seq_along(df)) {
  # assign / store result to output[[i]]
  output[[i]] <- median(df[[i]])
  # double brackets
  # for generalizability
  # subsetting will work whether output is a vector or a list
}

# Print output to console
output
## numeric(0)

2 - When / How to write a function?

Initial Snippet of Code

Snippet

# snippet of code 
# that successfully rescales a column to be between 0 and 1:

# (df$a - min(df$a, na.rm = TRUE)) /  
#   (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))

Code with x

Rewrite snippet to use x

# create a vector x containing the numbers 1 through 10.
x <- 1:10

# Rewrite the code snippet 
# to use the temporary variable x 
# instead of referring to the data frame column df$a
(x - min(x, na.rm = TRUE)) /
  (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
##  [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
##  [8] 0.7777778 0.8888889 1.0000000

Code with rng

Rewrite for clarity with intermediate variable rng

# create a vector x containing the numbers 1 through 10.
x <- 1:10

# duplicated statement =  min(x, na.rm = TRUE)
# calculate it once / store the result/ refer to it
# also need the maximum value of x
# => calculate the range once
# refer to the first and second elements when they are needed.

# Define the intermediate variable rng 
# to contain the range of x 
# using the function range()
# Specify the na.rm() argument 
# to automatically ignore any NAs in the vector

rng <- range(x, na.rm = TRUE)

# Rewrite this snippet to refer to the elements of rng
(x - rng[1]) /
  (rng[2] - rng[1])
##  [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
##  [8] 0.7777778 0.8888889 1.0000000

Turn into function

Create the rescale01 function

rescale01 <- function(x) {
  
  rng <- range(x, na.rm = TRUE)
  
  (x - rng[1]) /
  (rng[2] - rng[1])

}
  • Start with a simple problem
  • Snipper of Code
  • Copy Paste Rule - if more than 2 times then function
  • Inputs
  • Temp variables - rewrite
  • remove duplication - rewrite for clarity
  • temp variable become arguments
  • test func

e.g. “both_na()”

counts at how many positions two vectors, x and y, both have a missing value

Start with a simple problem

# Define example vectors x and y
x <- c( 1, 2, NA, 3, NA)
y <- c(NA, 3, NA, 3,  4)
# where we know what the answer both_na(x, y) should be.

# both_na(x, y) should return 1
# since there is only one element that is missing in both x and y
# the third element.

# Count how many elements are missing in both x and y
is.na(x) # logical vector with TRUE at every position that has a missing
## [1] FALSE FALSE  TRUE FALSE  TRUE
sum(is.na(x & y)) # incorrect
## [1] 3
sum(is.na(x) & is.na(y)) # correct
## [1] 1

Rewrite snippet as function

# snippet
sum(is.na(x) & is.na(y))
## [1] 1
# Our snippet is also so simple we can't write it any clearer.


# Turn this snippet into a function
both_na <- function(x,y) {
  sum(is.na(x) & is.na(y))
}

Check the function works in other situs


# Define x, y1 and y2
x <-  c(NA, NA, NA)
y1 <- c( 1, NA, NA)
y2 <- c( 1, NA, NA, NA)



# Call both_na on x, y1
both_na(x, y1)
## [1] 2

# Call both_na on x, y2
both_na(x, y2)
## Warning in is.na(x) & is.na(y): longer object length is not a multiple of
## shorter object length
## [1] 3

Good function?

  • correct
  • understandable
    • humans
    • computer
      • good names – e.g. descriptive
        • objects
        • functions – e.g. verbs
        • arguments – e.g. nouns [data arg vs detail arg]

Good function names

# bad name
f2 <- function(x) {
  if (length(x) <= 1) return(NULL)
  x[-length(x)]
}

x<-3; x[-length(x)]
## numeric(0)

x<-c(3,4); x[-length(x)]
## [1] 3

x<-c(3,4,5); x[-length(x)]
## [1] 3 4

# good name
remove_last <- function(x) {
  if (length(x) <= 1) return(NULL)
  x[-length(x)]
}

Arg names

# bad arg names?
mean_ci <- function(c, nums) {
  se <- sd(nums) / sqrt(length(nums))
  alpha <- 1 - c
  mean(nums) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

# argument nums = sample of data 
# argument c controls the level of the confidence interval
# e.g. 
# c = 0.95 => 95% confidence interval

# c = non-descriptive and it's the name of an existing function in R

# better name: confidence
# since it reveals the purpose of the argument:
# "to control the level of confidence for the interval"
# OR: level
# since it's the same name used for the confint function in base R
# & users may already be familiar with that name for this parameter.

# nums is not inherently bad
# but since it's the placeholder for the vector of data
# a name like x would be more recognizable to users.

# Rewrite mean_ci to take arguments named level and x
mean_ci <- function(level, x) {
  se <- sd(x) / sqrt(length(x))
  alpha <- 1 - level
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

Arg order

mean_ci <- function(x, level = .95) {
  se <- sd(x) / sqrt(length(x))
  alpha <- 1 - level
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

# Data arguments supply the data to compute on.
# Detail arguments control the details of how the computation is done.
# Data arguments should come first
# Detail arguments should go on the end, and usually should have default values.

Return statements

# pass mean_ci() an empty vector 
# it returns a confidence interval with missing values at both ends
mean_ci(numeric(0))
## [1] NA NA

#? produce a warning "x was empty" and return c(-Inf, Inf)

mean_ci <- function(x, level = 0.95) {
  if (length(x) == 0) {
    warning("`x` was empty", call. = FALSE)
    interval <- c(-Inf, Inf)
  } else { 
    se <- sd(x) / sqrt(length(x))
    alpha <- 1 - level
    interval <- mean(x) + 
      se * qnorm(c(alpha / 2, 1 - alpha / 2))
  }
  interval
}

# how hard it is now to follow the logic of the function

# an early return() makes sense
# if x is empty
# the function should immediately return c(-Inf, Inf).

# Alter the mean_ci function
mean_ci <- function(x, level = 0.95) {
  # The if statement should be 
  # the first thing in the body of the function definition
  if (length(x) == 0) {
    warning("`x` was empty", call. = FALSE)
    return(c(-Inf, Inf))
  }
  se <- sd(x) / sqrt(length(x))
  alpha <- 1 - level
  mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}

Practice

Poorly written function

f <- function(x, y) {
  x[is.na(x)] <- y
  cat(sum(is.na(x)), y, "\n")
  x
}

# What does this function do? 
# Let's try to figure it out by passing in some arguments.

# Define a numeric vector x with the values 1, 2, NA, 4 and 5
x <- c(1, 2, NA, 4, 5)

# Call f() with the arguments x = x and y = 3
f(x = x, y = 3)
## 0 3
## [1] 1 2 3 4 5

# Call f() with the arguments x = x and y = 10
f(x = x, y = 10)
## 0 10
## [1]  1  2 10  4  5

so… f() takes a vector x and replaces any missing values in it with the value y.

e.g. df\(z <- f(df\)z, 0)

I know this replaces any missing values in the column df$z with the value 0

Anyone else who comes across that line is going to have to go back and find the definition of f and see if they can reason it out.

Rename our function and arguments

# Rename the function f() to replace_missings()
replace_missings <- function(x, replacement) {
  # Change the name of the y argument to replacement
  x[is.na(x)] <- replacement
  cat(sum(is.na(x)), replacement, "\n")
  x
}

# Rewrite the call on df$z to match our new names
df$z <- replace_missings(df$z, replacement = 0)
## 0 0

Unnecessary duplication in body

replace_missings <- function(x, replacement) {
  # Define is_miss = logical that identifies the missing values in x.
  is_miss <- is.na(x)
  
  # Rewrite rest of function to refer to is_miss
  x[is_miss] <- replacement
  cat(sum(is_miss), replacement, "\n")
  x
}

Replace_missings outputs to console

# try it:
replace_missings(df$z, replacement = 0)
## 0 0
## numeric(0)

# It would be much nicer to say "2 missing values replaced by the value 0".

bad practice: to use cat() for anything other than a print() method (a function designed just to display output).

good practice: diagnostic information => message() function unnamed arguments are pasted together with no separator (and no need for a newline at the end) and by default are printed to the screen.

Final tweaks

replace_missings <- function(x, replacement) {
  is_miss <- is.na(x)
  x[is_miss] <- replacement
  
  # Rewrite to use message()
  message(sum(is_miss)," missings replaced by the value ", replacement)
  x
}

# Check your new function by running on df$z
replace_missings(df$z, replacement = 0)
## 0 missings replaced by the value 0
## numeric(0)

3 - Functional Programming

  • library(purrr)
    • means <- map_dbl(mtcars, mean)
    • medians <- map_dbl(mtcars, median)
  • Give equal weight to verbs and nouns
  • Abstract away the details of implementation
  • Functions can be arguments too

Using a for loop to remove duplication


df <- data.frame(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)

#compute the median of each column
median(df[[1]])
## [1] 0.2522204
median(df[[2]])
## [1] 0.1837433
median(df[[3]])
## [1] 0.03352717
median(df[[4]])
## [1] 0.4091961

# That is a lot of repetition
# => reduce the duplication by using a for loop.

# Initialize output vector
output <- vector("double", ncol(df))  

# Fill in the body of the for loop

for (i in seq_along(df)) {            
  
  output[i] <- median(df[[i]])
  
}

# View the result
output
## [1] 0.25222038 0.18374329 0.03352717 0.40919615

Turning the for loop into a function


# another data frame df2 
df2 <- data.frame(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
# => copy and paste the for loop
#    edit every reference to df to be df2 instead
output <- vector("double", ncol(df2))  
for (i in seq_along(df2)) {            
  output[[i]] <- median(df2[[i]])      
}
# repeat for df3 etc.. WRITE A FUNCTION


# From code to function: col_median()
col_median <- function(df){

      # Just embed the code in the body
      output <- vector("double", ncol(df))  
      for (i in seq_along(df)) {            
        output[[i]] <- median(df[[i]])      
      }
      output
  
}

Column means: col_mean() function


# Create a col_mean() function by editing col_median() to find the column means instead.

col_mean <- function(df){

      # Just embed the code in the body
      output <- vector("double", ncol(df))  
      for (i in seq_along(df)) {            
        output[[i]] <- mean(df[[i]])      
      }
      output
  
}

Column standard deviations: col_sd() function


# Create a col_sd() function by editing col_median() to find the column standard deviations instead.

col_sd <- function(df){

      # Just embed the code in the body
      output <- vector("double", ncol(df))  
      for (i in seq_along(df)) {            
        output[[i]] <- sd(df[[i]])      
      }
      output
  
}

copied and pasted the function col_median two times

time to write a function again

f: take column summaries for any summary function we provide

Remove duplication with arguments

Simpler example:


f1 <- function(x) abs(x - mean(x)) ^ 1
f2 <- function(x) abs(x - mean(x)) ^ 2
f3 <- function(x) abs(x - mean(x)) ^ 3

# Q: How could you remove the duplication in this set of function definitions?

# A: single function with two arguments: x and power

# Add a second argument called power
f <- function(x, power) {
    # Edit the body to return absolute deviations raised to power
    abs(x - mean(x))^ power
}

Function as an argument


# remove the duplication in our set of summary functions 
# by requiring the function doing the summary as an input

col_summary <- function(df, fun) {
  output <- vector("numeric", length(df))
  for (i in seq_along(df)) {
    output[[i]] <- fun(df[[i]])
  }
  output
}

# Find the column medians using col_median() and col_summary()
col_median(df)
## [1] 0.25222038 0.18374329 0.03352717 0.40919615
col_summary(df,median)
## [1] 0.25222038 0.18374329 0.03352717 0.40919615

# Find the column means using col_mean() and col_summary()
col_mean(df)
## [1] -0.02125549  0.20077120 -0.05484722  0.01907553
col_summary(df, mean)
## [1] -0.02125549  0.20077120 -0.05484722  0.01907553
sapply(df,mean)
##           a           b           c           d 
## -0.02125549  0.20077120 -0.05484722  0.01907553
library(purrr)
map_dbl(df,mean)
##           a           b           c           d 
## -0.02125549  0.20077120 -0.05484722  0.01907553


# Find the column IQRs using col_summary()
col_summary(df, IQR)
## [1] 0.7316208 0.6611129 1.1723632 1.2884701

The map functions

take a vector, .x, as the first argument return .f applied to each element of .x

type of object that is returned is determined by function suffix (the part after _):

  • map() returns a list or data frame
  • map_lgl() returns a logical vector
  • map_int() returns a integer vector
  • map_dbl() returns a double vector
  • map_chr() returns a character vector

# repeat our column summaries 
# using a map function 
# instead of our col_summary() function

# every summary we calculated 
# returned a single numeric value, 
# so we'll use map_dbl().

# Load the purrr package
library(purrr)

# Use map_dbl() to find column means
map_dbl(df, mean) 
##           a           b           c           d 
## -0.02125549  0.20077120 -0.05484722  0.01907553

# Use map_dbl() to column medians
map_dbl(df, median) 
##          a          b          c          d 
## 0.25222038 0.18374329 0.03352717 0.40919615

# Use map_dbl() to find column standard deviations
map_dbl(df, sd) 
##         a         b         c         d 
## 0.8818169 0.7290350 0.8757765 1.1636818

the … (“dot dot dot”) argument

the … (“dot dot dot”) argument used to pass along additional arguments to .f each time it’s called

e.g. * pass the trim argument to the mean() function: map_dbl(df, mean, trim = 0.5) * multiple arguments can be passed: map_dbl(df, mean, trim = 0.5, na.rm = TRUE)

You don’t have to specify the arguments by name, but it is good practice!

Why .x and .f ?

Unlikely to be argument names you might pass through the …, thereby preventing confusion about whether an argument belongs to map() or to the function being mapped.

library(nycflights13) # contains planes data frame

head(planes)
## # A tibble: 6 x 9
##   tailnum  year type       manufacturer  model  engines seats speed engine
##   <chr>   <int> <chr>      <chr>         <chr>    <int> <int> <int> <chr> 
## 1 N10156   2004 Fixed win~ EMBRAER       EMB-1~       2    55    NA Turbo~
## 2 N102UW   1998 Fixed win~ AIRBUS INDUS~ A320-~       2   182    NA Turbo~
## 3 N103US   1999 Fixed win~ AIRBUS INDUS~ A320-~       2   182    NA Turbo~
## 4 N104UW   1999 Fixed win~ AIRBUS INDUS~ A320-~       2   182    NA Turbo~
## 5 N10575   2002 Fixed win~ EMBRAER       EMB-1~       2    55    NA Turbo~
## 6 N105UW   1999 Fixed win~ AIRBUS INDUS~ A320-~       2   182    NA Turbo~

head(planes[,c(2,6:8)])
## # A tibble: 6 x 4
##    year engines seats speed
##   <int>   <int> <int> <int>
## 1  2004       2    55    NA
## 2  1998       2   182    NA
## 3  1999       2   182    NA
## 4  1999       2   182    NA
## 5  2002       2    55    NA
## 6  1999       2   182    NA

planes <- planes[,c(2,6:8)]

# Find the mean of each column
map_dbl(planes, mean)
##       year    engines      seats      speed 
##         NA   1.995184 154.316376         NA

# Find the mean of each column, excluding missing values
map_dbl(planes, mean, na.rm = TRUE)
##        year     engines       seats       speed 
## 2000.484010    1.995184  154.316376  236.782609

# Find the 5th percentile of each column, excluding missing values
map_dbl(planes, quantile, probs = 0.05, na.rm = TRUE)
##    year engines   seats   speed 
##  1988.0     2.0    55.0    90.5

Picking the right map function

map() will return a list

if you know what type of output you expect you are better to use the corresponding function

e.g.

map_lgl(df, mean) # either returns a logical vector or an error
## Error: Can't coerce element 1 from a double to a logical

map functions = “type consistent”

try 1st element: mean(df[[1]]) returns: a single numeric value suggesting: map_dbl()

# create df3:
A <- c(-0.4766209, -1.67653203, -0.91437464, -0.73426835, -0.91774245, 0.46111978, -0.02242416, -0.22946634, 2.85772705, 0.87772213)  
D <- c(1.0625948,-0.5889523,0.4987057,1.147293,-1.3138689,0.818433,0.980729,0.1505957,1.1003666,-0.1864504)
df3 <- data.frame("A" = A, "B" = rep(c("A","B"),5), "C" = c(1:10), "D" = D, stringsAsFactors = FALSE)

# Find the columns that are numeric
map_lgl(df3, is.numeric)
##     A     B     C     D 
##  TRUE FALSE  TRUE  TRUE

# Find the type of each column
map_chr(df3, typeof)
##           A           B           C           D 
##    "double" "character"   "integer"    "double"

# Find a summary of each column
map(df3, summary)
## $A
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.67653 -0.86935 -0.35304 -0.07749  0.34023  2.85773 
## 
## $B
##    Length     Class      Mode 
##        10 character character 
## 
## $C
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.25    5.50    5.50    7.75   10.00 
## 
## $D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.3139 -0.1022  0.6586  0.3669  1.0421  1.1473

Specify .f


#an existing function
map(df, summary)
## $a
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.04919 -0.32667  0.25222 -0.02126  0.40495  1.17456 
## 
## $b
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0177 -0.2097  0.1837  0.2008  0.4514  1.5452 
## 
## $c
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.31345 -0.67938  0.03353 -0.05485  0.49298  1.39675 
## 
## $d
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.67295 -0.42279  0.40920  0.01908  0.86568  1.10229

# an existing function you defined
map(df, rescale01)
## $a
##  [1] 1.0000000 0.0000000 0.6879776 0.5191789 0.7398066 0.4283861 0.5797533
##  [8] 0.8193482 0.7516915 0.7644625
## 
## $b
##  [1] 0.5064432 0.5276586 0.4311417 0.8033729 0.0000000 0.2999584 0.3612123
##  [8] 0.5884202 0.2361577 1.0000000
## 
## $c
##  [1] 0.5427619 0.5488032 0.4484678 0.0291086 0.7057732 1.0000000 0.7553328
##  [8] 0.4512428 0.0000000 0.1624526
## 
## $d
##  [1] 0.7909682 0.6443476 0.9594164 0.0000000 1.0000000 0.4633025 0.8418539
##  [8] 0.9798662 0.8710606 0.5799268

# an anonymous function defined on the fly
map(df, function(x) sum(is.na(x)))
## $a
## [1] 0
## 
## $b
## [1] 0
## 
## $c
## [1] 0
## 
## $d
## [1] 0

# an anonymous function defined using a formula shortcut
map(df, ~ sum(is.na(.)))
## $a
## [1] 0
## 
## $b
## [1] 0
## 
## $c
## [1] 0
## 
## $d
## [1] 0

.f = [[

list_of_results <- list(
                   list(a = 1, b = "A"),
                   list(a = 2, b = "C"),
                   list(a = 3, b = "D")
                   )

# an anonymous function
map_dbl(list_of_results, function(x) x[["a"]])
## [1] 1 2 3

# Shortcut: string subsetting
map_dbl(list_of_results, "a")
## [1] 1 2 3

# Shortcut: integer subsetting (by index)
map_dbl(list_of_results, 1)
## [1] 1 2 3

Define cyl

  • Fit regression to each of the data frames in cyl
  • Quantify relationship between mpg and wt
    • Slopes for regressions on mpg on weight for each cylinder class
      • 4 6 8
      • -5.647025 -2.780106 -2.192438
# Split the data frame mtcars based on the unique values in the cyl column
cyl <- split(mtcars, mtcars$cyl)

str(cyl) # list of data frames
## List of 3
##  $ 4:'data.frame':   11 obs. of  11 variables:
##   ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
##   ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
##   ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
##   ..$ hp  : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
##   ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
##   ..$ wt  : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
##   ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
##   ..$ vs  : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
##   ..$ am  : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
##   ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
##   ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...
##  $ 6:'data.frame':   7 obs. of  11 variables:
##   ..$ mpg : num [1:7] 21 21 21.4 18.1 19.2 17.8 19.7
##   ..$ cyl : num [1:7] 6 6 6 6 6 6 6
##   ..$ disp: num [1:7] 160 160 258 225 168 ...
##   ..$ hp  : num [1:7] 110 110 110 105 123 123 175
##   ..$ drat: num [1:7] 3.9 3.9 3.08 2.76 3.92 3.92 3.62
##   ..$ wt  : num [1:7] 2.62 2.88 3.21 3.46 3.44 ...
##   ..$ qsec: num [1:7] 16.5 17 19.4 20.2 18.3 ...
##   ..$ vs  : num [1:7] 0 0 1 1 1 1 0
##   ..$ am  : num [1:7] 1 1 0 0 0 0 1
##   ..$ gear: num [1:7] 4 4 3 3 4 4 5
##   ..$ carb: num [1:7] 4 4 1 1 4 4 6
##  $ 8:'data.frame':   14 obs. of  11 variables:
##   ..$ mpg : num [1:14] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 ...
##   ..$ cyl : num [1:14] 8 8 8 8 8 8 8 8 8 8 ...
##   ..$ disp: num [1:14] 360 360 276 276 276 ...
##   ..$ hp  : num [1:14] 175 245 180 180 180 205 215 230 150 150 ...
##   ..$ drat: num [1:14] 3.15 3.21 3.07 3.07 3.07 2.93 3 3.23 2.76 3.15 ...
##   ..$ wt  : num [1:14] 3.44 3.57 4.07 3.73 3.78 ...
##   ..$ qsec: num [1:14] 17 15.8 17.4 17.6 18 ...
##   ..$ vs  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ am  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ gear: num [1:14] 3 3 3 3 3 3 3 3 3 3 ...
##   ..$ carb: num [1:14] 2 4 3 3 3 4 4 4 2 2 ...

cyl[[1]]
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

Fit the regression to the first group of cars

# Split the data frame mtcars based on the unique values in the cyl column
# Examine the structure of cyl
# confirm the structure of this list of data frames
str(cyl)
## List of 3
##  $ 4:'data.frame':   11 obs. of  11 variables:
##   ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
##   ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
##   ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
##   ..$ hp  : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
##   ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
##   ..$ wt  : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
##   ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
##   ..$ vs  : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
##   ..$ am  : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
##   ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
##   ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...
##  $ 6:'data.frame':   7 obs. of  11 variables:
##   ..$ mpg : num [1:7] 21 21 21.4 18.1 19.2 17.8 19.7
##   ..$ cyl : num [1:7] 6 6 6 6 6 6 6
##   ..$ disp: num [1:7] 160 160 258 225 168 ...
##   ..$ hp  : num [1:7] 110 110 110 105 123 123 175
##   ..$ drat: num [1:7] 3.9 3.9 3.08 2.76 3.92 3.92 3.62
##   ..$ wt  : num [1:7] 2.62 2.88 3.21 3.46 3.44 ...
##   ..$ qsec: num [1:7] 16.5 17 19.4 20.2 18.3 ...
##   ..$ vs  : num [1:7] 0 0 1 1 1 1 0
##   ..$ am  : num [1:7] 1 1 0 0 0 0 1
##   ..$ gear: num [1:7] 4 4 3 3 4 4 5
##   ..$ carb: num [1:7] 4 4 1 1 4 4 6
##  $ 8:'data.frame':   14 obs. of  11 variables:
##   ..$ mpg : num [1:14] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 ...
##   ..$ cyl : num [1:14] 8 8 8 8 8 8 8 8 8 8 ...
##   ..$ disp: num [1:14] 360 360 276 276 276 ...
##   ..$ hp  : num [1:14] 175 245 180 180 180 205 215 230 150 150 ...
##   ..$ drat: num [1:14] 3.15 3.21 3.07 3.07 3.07 2.93 3 3.23 2.76 3.15 ...
##   ..$ wt  : num [1:14] 3.44 3.57 4.07 3.73 3.78 ...
##   ..$ qsec: num [1:14] 17 15.8 17.4 17.6 18 ...
##   ..$ vs  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ am  : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ gear: num [1:14] 3 3 3 3 3 3 3 3 3 3 ...
##   ..$ carb: num [1:14] 2 4 3 3 3 4 4 4 2 2 ...

# Extract the first element into four_cyls
four_cyls <- cyl[1] # incorrect

str(four_cyls)
## List of 1
##  $ 4:'data.frame':   11 obs. of  11 variables:
##   ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
##   ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
##   ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
##   ..$ hp  : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
##   ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
##   ..$ wt  : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
##   ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
##   ..$ vs  : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
##   ..$ am  : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
##   ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
##   ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...

four_cyls <- cyl[[1]] # correct

str(four_cyls)
## 'data.frame':    11 obs. of  11 variables:
##  $ mpg : num  22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
##  $ cyl : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ disp: num  108 146.7 140.8 78.7 75.7 ...
##  $ hp  : num  93 62 95 66 52 65 97 66 91 113 ...
##  $ drat: num  3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
##  $ wt  : num  2.32 3.19 3.15 2.2 1.61 ...
##  $ qsec: num  18.6 20 22.9 19.5 18.5 ...
##  $ vs  : num  1 1 1 1 1 1 1 1 0 1 ...
##  $ am  : num  1 0 0 1 1 1 0 1 1 1 ...
##  $ gear: num  4 4 4 4 4 4 3 4 5 5 ...
##  $ carb: num  1 2 2 1 2 1 1 1 2 2 ...

# Fit a linear regression of miles per gallon on weight
# Fit a linear regression of mpg on wt using four_cyls

  # To fit a linear model with a
  # response variable y and explanatory variable x, 
  # you can call the lm() function with the formula y ~ x.

  # incorrect:
  # lm(four_cyls$mpg ~ four_cyls$wt,four_cyls)
  lm(mpg ~ wt, data = four_cyls)
## 
## Call:
## lm(formula = mpg ~ wt, data = four_cyls)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647

Anonymous function

Now we have a snippet of code that performs the operation we want on one data frame.


# create function
fit_reg <- function(df) {
  lm(mpg ~ wt, data = df)
}

# pass this to map
map(cyl, fit_reg)
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

# Rewrite to call an anonymous function
    # incorrect  
    # map(cyl, lm(mpg ~ wt, data = df))

    # The anonymous function is defined as 
    # function(df) lm(y ~ x, data = df)
    # which should be the .f argument 
    # to the function map().

    map(cyl, function(df) lm(mpg ~ wt, data = df))
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

Formula shortcut

“purrr” provides a shortcut that allows you to re write an anonymous function: * as a one-sided formula * starts with a ~ * followed by an R expression

In purrr’s map functions: “R expression” can refer to an element of the .x argument using the . character.

e.g. mean displacement for each data frame

use ananonymous function:

map_dbl(cyl, function(df) mean(df$disp))
##        4        6        8 
## 105.1364 183.3143 353.1000

use formula shortcut:

# replace the function definition (function(df)) 
#     with the ~
# to refer to the element of cyl the function operates on (in this case df)
#     we use a .
map_dbl(cyl, ~ mean(.$disp))
##        4        6        8 
## 105.1364 183.3143 353.1000
  • less typing
  • no need for argument name

Re write Regression with formula shortcut:

# Rewrite:
map(cyl, function(df) lm(mpg ~ wt, data = df))
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = df)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

# to use the formula shortcut:
map(cyl, ~ lm(mpg ~ wt, data = .))
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

Useful shortcuts: subset each element of the .x argument

if: .f argument to a map function is set equal to a string, let’s say “name” then: purrr extracts the “name” element from every element of .x

i.e. with nested lists

# list of where every element contains an a and b element:

list_of_results <- list(
  list(a = 1, b = "A"), 
  list(a = 2, b = "C"), 
  list(a = 3, b = "D")
)

# pull out the a element from every entry

# string shortcut
map(list_of_results, "a")
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 2
## 
## [[3]]
## [1] 3

# Save the result from the previous exercise to the variable models
# fit the models
models <- map(cyl, ~ lm(mpg ~ wt, data = .))

# get the coefficients for each model
coefs <- map(models, coef)


# Use string shortcut to extract the wt coefficient
map(coefs, "wt")
## $`4`
## [1] -5.647025
## 
## $`6`
## [1] -2.780106
## 
## $`8`
## [1] -2.192438

The .f argument = numeric vector


# Extract the second element from each vector in coefs 
# using 
#   numeric shortcut 
#   map_dbl()  [pulling out a single numeric value
#               from each element]
map_dbl(coefs, 2)
##         4         6         8 
## -5.647025 -2.780106 -2.192438

Pipe Operator (another shortcut)

%>%

x %>% f(y) == f(x, y)

LHS “x” == 1st argument to RHS f()

# split the data frame mtcars
cyl <- split(mtcars, mtcars$cyl) 

# fit models
# pass cyl as the first argument to map
map(cyl, ~ lm(mpg ~ wt, data = .))
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

# REWRITE with PIPE OPERATOR:

# split the data frame mtcars on cyl
split(mtcars, mtcars$cyl) %>%
  # use map() on the result
  map(~ lm(mpg ~ wt, data = .))
## $`4`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      39.571       -5.647  
## 
## 
## $`6`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##       28.41        -2.78  
## 
## 
## $`8`
## 
## Call:
## lm(formula = mpg ~ wt, data = .)
## 
## Coefficients:
## (Intercept)           wt  
##      23.868       -2.192

# chain together many operations
mtcars %>% 
  split(mtcars$cyl) %>%
  map(~ lm(mpg ~ wt, data = .)) %>%
  map(coef) %>% 
  map_dbl("wt")
##         4         6         8 
## -5.647025 -2.780106 -2.192438

# Now, pull out the R2 from each model
  models <- mtcars %>% 
  split(mtcars$cyl) %>%
  map(~ lm(mpg ~ wt, data = .))

# Rewrite to be a single command using pipes 
# summaries <- map(models, summary)
# map_dbl(summaries, "r.squared")
    # LHS = models
    # RHS = map
    models %>%
    map(summary) %>%
    map_dbl("r.squared")
##         4         6         8 
## 0.5086326 0.4645102 0.4229655

4 - Advanced inputs & outputs

safely() is an adverb

argument = function returns = function

never throws an error (and never stops the rest of your computation!)

returns a list with two elements:

  • result is the original result
    • If there was an error, this will be NULL
  • error is an error object
    • If the operation was successful this will be NULL
# pass readLines() to safely()
safe_readLines <- safely(readLines)

# Call safe_readLines() on "http://example.org"
safe_readLines("http://example.org")
## $result
##  [1] "<!doctype html>"                                                                                      
##  [2] "<html>"                                                                                               
##  [3] "<head>"                                                                                               
##  [4] "    <title>Example Domain</title>"                                                                    
##  [5] ""                                                                                                     
##  [6] "    <meta charset=\"utf-8\" />"                                                                       
##  [7] "    <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"                        
##  [8] "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"                       
##  [9] "    <style type=\"text/css\">"                                                                        
## [10] "    body {"                                                                                           
## [11] "        background-color: #f0f0f2;"                                                                   
## [12] "        margin: 0;"                                                                                   
## [13] "        padding: 0;"                                                                                  
## [14] "        font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"                
## [15] "        "                                                                                             
## [16] "    }"                                                                                                
## [17] "    div {"                                                                                            
## [18] "        width: 600px;"                                                                                
## [19] "        margin: 5em auto;"                                                                            
## [20] "        padding: 50px;"                                                                               
## [21] "        background-color: #fff;"                                                                      
## [22] "        border-radius: 1em;"                                                                          
## [23] "    }"                                                                                                
## [24] "    a:link, a:visited {"                                                                              
## [25] "        color: #38488f;"                                                                              
## [26] "        text-decoration: none;"                                                                       
## [27] "    }"                                                                                                
## [28] "    @media (max-width: 700px) {"                                                                      
## [29] "        body {"                                                                                       
## [30] "            background-color: #fff;"                                                                  
## [31] "        }"                                                                                            
## [32] "        div {"                                                                                        
## [33] "            width: auto;"                                                                             
## [34] "            margin: 0 auto;"                                                                          
## [35] "            border-radius: 0;"                                                                        
## [36] "            padding: 1em;"                                                                            
## [37] "        }"                                                                                            
## [38] "    }"                                                                                                
## [39] "    </style>    "                                                                                     
## [40] "</head>"                                                                                              
## [41] ""                                                                                                     
## [42] "<body>"                                                                                               
## [43] "<div>"                                                                                                
## [44] "    <h1>Example Domain</h1>"                                                                          
## [45] "    <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] "    domain in examples without prior coordination or asking for permission.</p>"                      
## [47] "    <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"                   
## [48] "</div>"                                                                                               
## [49] "</body>"                                                                                              
## [50] "</html>"                                                                                              
## 
## $error
## NULL

# Call safe_readLines() on "http://asdfasdasdkfjlda"
safe_readLines("http://asdfasdasdkfjlda")
## Warning in file(con, "r"): InternetOpenUrl failed: 'The server name or
## address could not be resolved'
## $result
## NULL
## 
## $error
## <simpleError in file(con, "r"): cannot open the connection>

map() safely()


urls <- list(
  example = "http://example.org",
  rproj = "http://www.r-project.org",
  asdf = "http://asdfasdasdkfjlda"
)

# download the HTML files at each URL
### map(urls, readLines)
### Error in file(con, "r") : cannot open the connection
### no output for any of the URLs
### :(
### solve this problem 
### :)
### by using our safe_readLines()

   html <- map(urls, safe_readLines)
## Warning in file(con, "r"): InternetOpenUrl failed: 'The server name or
## address could not be resolved'
   # Warning message: 
   # URL 'http://asdfasdasdkfjlda/': status was 'Couldn't resolve host name'
   
   # html 
   # contains 
   #   the HTML for each of the two URLs 
   #     on which readLines() was successful 
   #   and the error for the other
   # BUT: buried in the inner-most level of the list.

   str(html)
## List of 3
##  $ example:List of 2
##   ..$ result: chr [1:50] "<!doctype html>" "<html>" "<head>" "    <title>Example Domain</title>" ...
##   ..$ error : NULL
##  $ rproj  :List of 2
##   ..$ result: chr [1:111] "<!DOCTYPE html>" "<html lang=\"en\">" "  <head>" "    <meta charset=\"utf-8\">" ...
##   ..$ error : NULL
##  $ asdf   :List of 2
##   ..$ result: NULL
##   ..$ error :List of 2
##   .. ..$ message: chr "cannot open the connection"
##   .. ..$ call   : language file(con, "r")
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
   
   # Extract the "result"
   # from one of the two elements that was successful 
   # using double square bracket subsetting.
   
   html[[1]][['result']]
##  [1] "<!doctype html>"                                                                                      
##  [2] "<html>"                                                                                               
##  [3] "<head>"                                                                                               
##  [4] "    <title>Example Domain</title>"                                                                    
##  [5] ""                                                                                                     
##  [6] "    <meta charset=\"utf-8\" />"                                                                       
##  [7] "    <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"                        
##  [8] "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"                       
##  [9] "    <style type=\"text/css\">"                                                                        
## [10] "    body {"                                                                                           
## [11] "        background-color: #f0f0f2;"                                                                   
## [12] "        margin: 0;"                                                                                   
## [13] "        padding: 0;"                                                                                  
## [14] "        font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"                
## [15] "        "                                                                                             
## [16] "    }"                                                                                                
## [17] "    div {"                                                                                            
## [18] "        width: 600px;"                                                                                
## [19] "        margin: 5em auto;"                                                                            
## [20] "        padding: 50px;"                                                                               
## [21] "        background-color: #fff;"                                                                      
## [22] "        border-radius: 1em;"                                                                          
## [23] "    }"                                                                                                
## [24] "    a:link, a:visited {"                                                                              
## [25] "        color: #38488f;"                                                                              
## [26] "        text-decoration: none;"                                                                       
## [27] "    }"                                                                                                
## [28] "    @media (max-width: 700px) {"                                                                      
## [29] "        body {"                                                                                       
## [30] "            background-color: #fff;"                                                                  
## [31] "        }"                                                                                            
## [32] "        div {"                                                                                        
## [33] "            width: auto;"                                                                             
## [34] "            margin: 0 auto;"                                                                          
## [35] "            border-radius: 0;"                                                                        
## [36] "            padding: 1em;"                                                                            
## [37] "        }"                                                                                            
## [38] "    }"                                                                                                
## [39] "    </style>    "                                                                                     
## [40] "</head>"                                                                                              
## [41] ""                                                                                                     
## [42] "<body>"                                                                                               
## [43] "<div>"                                                                                                
## [44] "    <h1>Example Domain</h1>"                                                                          
## [45] "    <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] "    domain in examples without prior coordination or asking for permission.</p>"                      
## [47] "    <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"                   
## [48] "</div>"                                                                                               
## [49] "</body>"                                                                                              
## [50] "</html>"
   
   
   # Extract the "error"
   # from the element that was unsuccessful
   # again using double square bracket subsetting.
   html[[3]][['error']]
## <simpleError in file(con, "r"): cannot open the connection>

transpose()


nested_list <- list(
   x1 = list(a = 1, b = 2),
   x2 = list(a = 3, b = 4)
)

# extract the a element in x1
nested_list[["x1"]][["a"]]
## [1] 1

# transpose the list first, the order of subsetting reverses
transpose(nested_list)[["a"]][["x1"]]
## [1] 1

# handy for safe output
# can easily grab:
# all the results 
# or all the errors


str(transpose(html))
## List of 2
##  $ result:List of 3
##   ..$ example: chr [1:50] "<!doctype html>" "<html>" "<head>" "    <title>Example Domain</title>" ...
##   ..$ rproj  : chr [1:111] "<!DOCTYPE html>" "<html lang=\"en\">" "  <head>" "    <meta charset=\"utf-8\">" ...
##   ..$ asdf   : NULL
##  $ error :List of 3
##   ..$ example: NULL
##   ..$ rproj  : NULL
##   ..$ asdf   :List of 2
##   .. ..$ message: chr "cannot open the connection"
##   .. ..$ call   : language file(con, "r")
##   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

res <- transpose(html)[["result"]]

errs <- transpose(html)[["error"]]

# collect all the results for the elements that were successful 
# examine the inputs for all those that weren't

# Create a logical vector
# TRUE when errs is NULL
is_ok <- map_lgl(errs, is_null)

# Extract the successful results
# subsetting res with is_ok
res[is_ok]
## $example
##  [1] "<!doctype html>"                                                                                      
##  [2] "<html>"                                                                                               
##  [3] "<head>"                                                                                               
##  [4] "    <title>Example Domain</title>"                                                                    
##  [5] ""                                                                                                     
##  [6] "    <meta charset=\"utf-8\" />"                                                                       
##  [7] "    <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"                        
##  [8] "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"                       
##  [9] "    <style type=\"text/css\">"                                                                        
## [10] "    body {"                                                                                           
## [11] "        background-color: #f0f0f2;"                                                                   
## [12] "        margin: 0;"                                                                                   
## [13] "        padding: 0;"                                                                                  
## [14] "        font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"                
## [15] "        "                                                                                             
## [16] "    }"                                                                                                
## [17] "    div {"                                                                                            
## [18] "        width: 600px;"                                                                                
## [19] "        margin: 5em auto;"                                                                            
## [20] "        padding: 50px;"                                                                               
## [21] "        background-color: #fff;"                                                                      
## [22] "        border-radius: 1em;"                                                                          
## [23] "    }"                                                                                                
## [24] "    a:link, a:visited {"                                                                              
## [25] "        color: #38488f;"                                                                              
## [26] "        text-decoration: none;"                                                                       
## [27] "    }"                                                                                                
## [28] "    @media (max-width: 700px) {"                                                                      
## [29] "        body {"                                                                                       
## [30] "            background-color: #fff;"                                                                  
## [31] "        }"                                                                                            
## [32] "        div {"                                                                                        
## [33] "            width: auto;"                                                                             
## [34] "            margin: 0 auto;"                                                                          
## [35] "            border-radius: 0;"                                                                        
## [36] "            padding: 1em;"                                                                            
## [37] "        }"                                                                                            
## [38] "    }"                                                                                                
## [39] "    </style>    "                                                                                     
## [40] "</head>"                                                                                              
## [41] ""                                                                                                     
## [42] "<body>"                                                                                               
## [43] "<div>"                                                                                                
## [44] "    <h1>Example Domain</h1>"                                                                          
## [45] "    <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] "    domain in examples without prior coordination or asking for permission.</p>"                      
## [47] "    <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"                   
## [48] "</div>"                                                                                               
## [49] "</body>"                                                                                              
## [50] "</html>"                                                                                              
## 
## $rproj
##   [1] "<!DOCTYPE html>"                                                                                                                                                                                                                                                                                                                                       
##   [2] "<html lang=\"en\">"                                                                                                                                                                                                                                                                                                                                    
##   [3] "  <head>"                                                                                                                                                                                                                                                                                                                                              
##   [4] "    <meta charset=\"utf-8\">"                                                                                                                                                                                                                                                                                                                          
##   [5] "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">"                                                                                                                                                                                                                                                                                         
##   [6] "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">"                                                                                                                                                                                                                                                                          
##   [7] "    <title>R: The R Project for Statistical Computing</title>"                                                                                                                                                                                                                                                                                         
##   [8] ""                                                                                                                                                                                                                                                                                                                                                      
##   [9] "    <link rel=\"icon\" type=\"image/png\" href=\"/favicon-32x32.png\" sizes=\"32x32\" />"                                                                                                                                                                                                                                                              
##  [10] "    <link rel=\"icon\" type=\"image/png\" href=\"/favicon-16x16.png\" sizes=\"16x16\" />"                                                                                                                                                                                                                                                              
##  [11] ""                                                                                                                                                                                                                                                                                                                                                      
##  [12] "    <!-- Bootstrap -->"                                                                                                                                                                                                                                                                                                                                
##  [13] "    <link href=\"/css/bootstrap.min.css\" rel=\"stylesheet\">"                                                                                                                                                                                                                                                                                         
##  [14] "    <link href=\"/css/R.css\" rel=\"stylesheet\">"                                                                                                                                                                                                                                                                                                     
##  [15] ""                                                                                                                                                                                                                                                                                                                                                      
##  [16] "    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->"                                                                                                                                                                                                                                                            
##  [17] "    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->"                                                                                                                                                                                                                                                                        
##  [18] "    <!--[if lt IE 9]>"                                                                                                                                                                                                                                                                                                                                 
##  [19] "      <script src=\"https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js\"></script>"                                                                                                                                                                                                                                                               
##  [20] "      <script src=\"https://oss.maxcdn.com/respond/1.4.2/respond.min.js\"></script>"                                                                                                                                                                                                                                                                   
##  [21] "    <![endif]-->"                                                                                                                                                                                                                                                                                                                                      
##  [22] "  </head>"                                                                                                                                                                                                                                                                                                                                             
##  [23] "  <body>"                                                                                                                                                                                                                                                                                                                                              
##  [24] "    <div class=\"container page\">"                                                                                                                                                                                                                                                                                                                    
##  [25] "      <div class=\"row\">"                                                                                                                                                                                                                                                                                                                             
##  [26] "        <div class=\"col-xs-12 col-sm-offset-1 col-sm-2 sidebar\" role=\"navigation\">"                                                                                                                                                                                                                                                                
##  [27] "<div class=\"row\">"                                                                                                                                                                                                                                                                                                                                   
##  [28] "<div class=\"col-xs-6 col-sm-12\">"                                                                                                                                                                                                                                                                                                                    
##  [29] "<p><a href=\"/\"><img src=\"/Rlogo.png\" width=\"100\" height=\"78\" alt = \"R\" /></a></p>"                                                                                                                                                                                                                                                           
##  [30] "<p><small><a href=\"/\">[Home]</a></small></p>"                                                                                                                                                                                                                                                                                                        
##  [31] "<h2 id=\"download\">Download</h2>"                                                                                                                                                                                                                                                                                                                     
##  [32] "<p><a href=\"http://cran.r-project.org/mirrors.html\">CRAN</a></p>"                                                                                                                                                                                                                                                                                    
##  [33] "<h2 id=\"r-project\">R Project</h2>"                                                                                                                                                                                                                                                                                                                   
##  [34] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [35] "<li><a href=\"/about.html\">About R</a></li>"                                                                                                                                                                                                                                                                                                          
##  [36] "<li><a href=\"/logo/\">Logo</a></li>"                                                                                                                                                                                                                                                                                                                  
##  [37] "<li><a href=\"/contributors.html\">Contributors</a></li>"                                                                                                                                                                                                                                                                                              
##  [38] "<li><a href=\"/news.html\">Whatâ\200\231s New?</a></li>"                                                                                                                                                                                                                                                                                                     
##  [39] "<li><a href=\"/bugs.html\">Reporting Bugs</a></li>"                                                                                                                                                                                                                                                                                                    
##  [40] "<li><a href=\"http://developer.R-project.org\">Development Site</a></li>"                                                                                                                                                                                                                                                                              
##  [41] "<li><a href=\"/conferences.html\">Conferences</a></li>"                                                                                                                                                                                                                                                                                                
##  [42] "<li><a href=\"/search.html\">Search</a></li>"                                                                                                                                                                                                                                                                                                          
##  [43] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [44] "</div>"                                                                                                                                                                                                                                                                                                                                                
##  [45] "<div class=\"col-xs-6 col-sm-12\">"                                                                                                                                                                                                                                                                                                                    
##  [46] "<h2 id=\"r-foundation\">R Foundation</h2>"                                                                                                                                                                                                                                                                                                             
##  [47] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [48] "<li><a href=\"/foundation/\">Foundation</a></li>"                                                                                                                                                                                                                                                                                                      
##  [49] "<li><a href=\"/foundation/board.html\">Board</a></li>"                                                                                                                                                                                                                                                                                                 
##  [50] "<li><a href=\"/foundation/members.html\">Members</a></li>"                                                                                                                                                                                                                                                                                             
##  [51] "<li><a href=\"/foundation/donors.html\">Donors</a></li>"                                                                                                                                                                                                                                                                                               
##  [52] "<li><a href=\"/foundation/donations.html\">Donate</a></li>"                                                                                                                                                                                                                                                                                            
##  [53] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [54] "<h2 id=\"help-with-r\">Help With R</h2>"                                                                                                                                                                                                                                                                                                               
##  [55] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [56] "<li><a href=\"/help.html\">Getting Help</a></li>"                                                                                                                                                                                                                                                                                                      
##  [57] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [58] "<h2 id=\"documentation\">Documentation</h2>"                                                                                                                                                                                                                                                                                                           
##  [59] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [60] "<li><a href=\"http://cran.r-project.org/manuals.html\">Manuals</a></li>"                                                                                                                                                                                                                                                                               
##  [61] "<li><a href=\"http://cran.r-project.org/faqs.html\">FAQs</a></li>"                                                                                                                                                                                                                                                                                     
##  [62] "<li><a href=\"http://journal.r-project.org\">The R Journal</a></li>"                                                                                                                                                                                                                                                                                   
##  [63] "<li><a href=\"/doc/bib/R-books.html\">Books</a></li>"                                                                                                                                                                                                                                                                                                  
##  [64] "<li><a href=\"/certification.html\">Certification</a></li>"                                                                                                                                                                                                                                                                                            
##  [65] "<li><a href=\"/other-docs.html\">Other</a></li>"                                                                                                                                                                                                                                                                                                       
##  [66] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [67] "<h2 id=\"links\">Links</h2>"                                                                                                                                                                                                                                                                                                                           
##  [68] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [69] "<li><a href=\"http://www.bioconductor.org\">Bioconductor</a></li>"                                                                                                                                                                                                                                                                                     
##  [70] "<li><a href=\"/other-projects.html\">Related Projects</a></li>"                                                                                                                                                                                                                                                                                        
##  [71] "<li><a href=\"/gsoc.html\">GSoC</a></li>"                                                                                                                                                                                                                                                                                                              
##  [72] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [73] "</div>"                                                                                                                                                                                                                                                                                                                                                
##  [74] "</div>"                                                                                                                                                                                                                                                                                                                                                
##  [75] "        </div>"                                                                                                                                                                                                                                                                                                                                        
##  [76] "        <div class=\"col-xs-12 col-sm-7\">"                                                                                                                                                                                                                                                                                                            
##  [77] "        <h1>The R Project for Statistical Computing</h1>"                                                                                                                                                                                                                                                                                              
##  [78] "<h2 id=\"getting-started\">Getting Started</h2>"                                                                                                                                                                                                                                                                                                       
##  [79] "<p>R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS. To <strong><a href=\"https://cran.r-project.org/mirrors.html\">download R</a></strong>, please choose your preferred <a href=\"https://cran.r-project.org/mirrors.html\">CRAN mirror</a>.</p>"
##  [80] "<p>If you have questions about R like how to download and install the software, or what the license terms are, please read our <a href=\"https://cran.R-project.org/faqs.html\">answers to frequently asked questions</a> before you send an email.</p>"                                                                                               
##  [81] "<h2 id=\"news\">News</h2>"                                                                                                                                                                                                                                                                                                                             
##  [82] "<ul>"                                                                                                                                                                                                                                                                                                                                                  
##  [83] "<li><p><a href=\"https://cran.r-project.org/src/base/R-3\"><strong>R version 3.5.0 (Joy in Playing)</strong></a> has been released on 2018-04-23.</p></li>"                                                                                                                                                                                            
##  [84] "<li><p><a href=\"https://cran.r-project.org/src/base/R-3\"><strong>R version 3.4.4 (Someone to Lean On)</strong></a> has been released on 2018-03-15.</p></li>"                                                                                                                                                                                        
##  [85] "<li><p><strong>useR! 2018</strong> (July 10 - 13 in Brisbane) is open for registration at <a href=\"https://user2018.r-project.org\"><strong>https://user2018.r-project.org</strong></a></p></li>"                                                                                                                                                     
##  [86] "<li><p><a href=\"https://journal.r-project.org/archive/2017-2\"><strong>The R Journal Volume 9/2</strong></a> is available.</p></li>"                                                                                                                                                                                                                  
##  [87] "<li><p><strong>useR! 2017</strong> took place July 4 - 7 in Brussels <a href=\"https://user2017.brussels\"><strong>https://user2017.brussels</strong></a></p></li>"                                                                                                                                                                                    
##  [88] "<li><p>The <a href=\"https://www.r-project.org/logo\"><strong>R Logo</strong></a> is available for download in high-resolution PNG or SVG formats.</p></li>"                                                                                                                                                                                           
##  [89] "</ul>"                                                                                                                                                                                                                                                                                                                                                 
##  [90] "<!--- (Boilerplate for release run-in)"                                                                                                                                                                                                                                                                                                                
##  [91] "-   [**R version 3.1.3 (Smooth Sidewalk) prerelease versions**](http://cran.r-project.org/src/base-prerelease) will appear starting February 28. Final release is scheduled for 2015-03-09."                                                                                                                                                           
##  [92] "-->"                                                                                                                                                                                                                                                                                                                                                   
##  [93] "        </div>"                                                                                                                                                                                                                                                                                                                                        
##  [94] "      </div>"                                                                                                                                                                                                                                                                                                                                          
##  [95] "      <div class=\"raw footer\">"                                                                                                                                                                                                                                                                                                                      
##  [96] "        &copy; The R Foundation. For queries about this web site, please contact"                                                                                                                                                                                                                                                                      
##  [97] "\t<script type='text/javascript'>"                                                                                                                                                                                                                                                                                                                      
##  [98] "<!--"                                                                                                                                                                                                                                                                                                                                                  
##  [99] "var s=\"=b!isfg>#nbjmup;xfcnbtufsAs.qspkfdu/psh#?uif!xfcnbtufs=0b?\";"                                                                                                                                                                                                                                                                                 
## [100] "m=\"\"; for (i=0; i<s.length; i++) {if(s.charCodeAt(i) == 28){m+= '&';} else if (s.charCodeAt(i) == 23) {m+= '!';} else {m+=String.fromCharCode(s.charCodeAt(i)-1);}}document.write(m);//-->"                                                                                                                                                          
## [101] "\t</script>;"                                                                                                                                                                                                                                                                                                                                           
## [102] "        for queries about R itself, please consult the "                                                                                                                                                                                                                                                                                               
## [103] "        <a href=\"help.html\">Getting Help</a> section."                                                                                                                                                                                                                                                                                               
## [104] "      </div>"                                                                                                                                                                                                                                                                                                                                          
## [105] "    </div>"                                                                                                                                                                                                                                                                                                                                            
## [106] "    <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->"                                                                                                                                                                                                                                                                                    
## [107] "    <script src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js\"></script>"                                                                                                                                                                                                                                                       
## [108] "    <!-- Include all compiled plugins (below), or include individual files as needed -->"                                                                                                                                                                                                                                                              
## [109] "    <script src=\"/js/bootstrap.min.js\"></script>"                                                                                                                                                                                                                                                                                                    
## [110] "  </body>"                                                                                                                                                                                                                                                                                                                                             
## [111] "</html>"

# Extract the input from the unsuccessful results
# subsetting urls with !is_ok
urls[!is_ok]
## $asdf
## [1] "http://asdfasdasdkfjlda"

map2()

rnorm() random numbers from a Normal distribution

# simulate 5 random numbers from a Normal distribution
# with mean zero
rnorm(n = 5)
## [1] -0.7184978 -1.3529178  1.0179859  1.8410276  2.6965366

map() drawing samples from a Normal


# repeat three times, but each time with a different sample size
rnorm(5)
## [1]  0.04567558 -0.73672255 -0.08525783 -0.05609098  0.12507143
rnorm(10)
##  [1] -0.07671131  0.50262646  0.18372367 -1.01890939 -0.05305828
##  [6] -0.61703311  0.30724594  1.51973679  0.08070157 -0.24716776
rnorm(20)
##  [1]  0.1349956 -2.9998807 -1.2199234 -0.8803089 -1.1105952  0.4644794
##  [7]  0.7982421  0.1093767 -0.4790875 -1.1688789  0.4490483  0.2351042
## [13] -0.7701425  0.3470370  1.8589856  1.4977189  1.8743996  0.2511965
## [19]  0.8226129 -0.6310207

# alternatively:
map(list(5, 10, 20), rnorm)
## [[1]]
## [1] -1.1936940  2.3753668  1.5358066  0.4415677 -0.1527810
## 
## [[2]]
##  [1]  0.2668639  0.9275570  2.9852474  1.2493899  0.1304897  0.6778199
##  [7]  2.3413737 -2.5569637 -0.4942210 -0.6099386
## 
## [[3]]
##  [1] -0.370147792  1.360196772 -1.638766531 -1.671530675 -1.725087738
##  [6]  1.131453269 -0.813178258 -0.166746486  0.002313656  2.692439552
## [11]  0.692029181  0.652989141 -0.357393210  1.013485874  0.534926318
## [16]  0.672021196 -0.094722838 -0.035539840  1.161120123  0.938780186

# i.e. 
n <- list(5, 10, 20)
map(n, rnorm)
## [[1]]
## [1] -2.5350166  1.2194725 -0.3878960 -0.7289417  0.3434182
## 
## [[2]]
##  [1]  2.2038646 -1.7883462 -1.6556874 -0.2474666  0.4234609 -0.6239097
##  [7] -1.6380009 -0.5966542  0.2046613  0.1490992
## 
## [[3]]
##  [1] -0.3740660  0.3592354  1.0839342 -1.5524715  1.8563490 -1.2255496
##  [7] -1.7703197  0.5754285  0.8210805 -1.7213611 -0.4213728 -0.2523393
## [13] -1.5009499  0.7682673  0.5812566 -1.4294271 -0.4173622  0.0908106
## [19]  0.7927532 -0.4239674

map2() iterate over two arguments


# rnorm(n, mean = 0, sd = 1)

rnorm(5, mean = 1)
## [1]  1.705974 -0.217439  1.602343  1.029560  0.691539
rnorm(10, mean = 5)
##  [1] 6.146697 5.795837 4.755039 5.690232 5.047908 4.220321 4.137719
##  [8] 4.098953 4.708845 5.433843
rnorm(20, mean = 10)
##  [1]  9.132280  9.416960 11.432265 10.125283 11.300966  9.877106  9.354803
##  [8] 10.165987 11.324978  9.868663 10.785895  9.860812  9.142252  8.603288
## [15] 10.696172 10.788850  9.932629 11.236169  8.661722 11.750454

# alternatively:
# map2(.x, .y, .f, ...)
map2(list(5, 10, 20), list(1, 5, 10), rnorm)
## [[1]]
## [1]  1.06990843  1.11426759  1.51754138 -0.01431911  1.35857722
## 
## [[2]]
##  [1] 4.491806 6.132983 6.020320 4.938282 3.659294 4.029250 4.922837
##  [8] 5.056045 5.087867 5.335729
## 
## [[3]]
##  [1] 10.595610 12.247691 11.176335  7.806663  9.662283 10.496946  9.961202
##  [8]  8.100465 12.666928 11.677195  9.677266  8.800581  8.689409  9.576068
## [15] 11.117679  9.524646  7.946224 10.190701 10.188143  9.432270

# i.e.
mu <- list(1, 5, 10)
map2(n,mu, rnorm)
## [[1]]
## [1] 1.0996344 1.0934606 0.5361912 1.4577187 0.4361098
## 
## [[2]]
##  [1] 5.877185 3.794603 4.505450 5.521853 6.091761 5.255705 4.103266
##  [8] 2.679487 5.807915 5.426072
## 
## [[3]]
##  [1] 11.736617 11.151447 11.270809 11.073334  9.505712 11.415830  9.989987
##  [8] 10.079329  9.452842 11.519678  9.954787 11.642103 12.280424  8.187645
## [15]  8.697277  9.867203  9.378562 10.157019 10.327322  9.313307

pmap() iterate over many arguments


rnorm(5, mean = 1, sd = 0.1)
## [1] 0.9933568 1.0734781 0.9982540 1.1825412 0.9935860
rnorm(10, mean = 5, sd = 0.5)
##  [1] 4.710702 6.329808 4.578451 4.609126 5.015731 5.239718 4.742876
##  [8] 5.492697 5.358975 5.671041
rnorm(20, mean = 10, sd = 0.1)
##  [1]  9.988266 10.018874  9.995430  9.996626  9.930426 10.059853  9.799281
##  [8] 10.046185  9.907805 10.153146  9.890998  9.915099 10.019474 10.043388
## [15] 10.178831  9.866670 10.168416  9.969079  9.981466 10.004589

# alternatively:
# pmap(.l, .f, ...)
pmap(list(n = list(5, 10, 20),
     mean = list(1, 5, 10),
     sd = list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0251159 0.9143462 1.0623979 1.1446805 0.6986038
## 
## [[2]]
##  [1] 4.634960 4.233074 4.206222 5.077710 5.219386 4.073963 4.777418
##  [8] 5.357986 5.107987 5.227026
## 
## [[3]]
##  [1]  9.826272  9.850642 10.150864 10.069276 10.208196  9.955140 10.014815
##  [8] 10.200067 10.065475  9.884640 10.021479  9.900215  9.989499  9.959591
## [15] 10.028232 10.075510  9.845512  9.925291 10.032233 10.002264

# i.e. 
sd = list(0.1, 0.5, 0.1)
pmap(list(mean = mu, n = n, sd = sd), rnorm)
## [[1]]
## [1] 0.9719579 0.9079359 1.0821508 1.2720144 1.0640284
## 
## [[2]]
##  [1] 5.751928 4.907903 5.053093 5.180628 5.135509 5.801066 4.707341
##  [8] 5.496773 4.677394 3.765233
## 
## [[3]]
##  [1]  9.793010  9.972128  9.741273  9.921614  9.840243 10.128178  9.950890
##  [8]  9.813654  9.876345 10.053874 10.131098 10.032148 10.105890  9.923602
## [15]  9.963030  9.912236  9.969536 10.041416  9.918537  9.988366

invoke_map() iterate over functions & args


rnorm(5)
## [1]  0.1318228 -0.1430492  1.6102036 -0.5401508 -1.1395560
runif(5)
## [1] 0.3378201 0.9984885 0.3758897 0.1533140 0.9891426
rexp(5)
## [1] 0.2122709 0.2682666 0.3925914 0.9188603 1.5862613

# alternatively:
# invoke_map(.f, .x = list(NULL), ...)
invoke_map(list(rnorm, runif, rexp), n = 5)
## [[1]]
## [1] -1.0460192 -0.3928283 -0.9014461 -0.6139352  0.1757031
## 
## [[2]]
## [1] 0.557859152 0.622239655 0.004652569 0.333880262 0.827704524
## 
## [[3]]
## [1] 0.8416657 0.5616720 0.7815147 0.1123118 0.2611983

simulate three samples

  • distributions
    • Normal(10, 1)
    • Uniform(0, 5)
    • Exponential(5)
# Define list of functions
f <- list("rnorm", "runif", "rexp")

# Parameter list for rnorm()
rnorm_params <- list(mean = 10)

# Add a min element with value 0 and max element with value 5
runif_params <- list()
runif_params <- list(min = 0, max = 5)

# Add a rate element with value 5
rexp_params <- list()
rexp_params <- list(rate = 5)

# Define params for each function
params <- list(
  rnorm_params,
  runif_params,
  rexp_params
)

# Call invoke_map() on f supplying params as the second argument
invoke_map(f, params, n = 5)
## [[1]]
## [1]  8.678056 11.960764 11.905008 10.915172  7.749312
## 
## [[2]]
## [1] 2.7869078 0.7286203 2.7382496 0.7758526 1.4469598
## 
## [[3]]
## [1] 0.13281932 0.03883795 0.09731466 0.45826979 0.08146358

walk()

designed for functions that don’t return anything e.g. functions with side effects like printing, plotting or saving

x <- list(1, "a", 3)

x %>% walk(print)
## [1] 1
## [1] "a"
## [1] 3

library(ggplot2)

plots <- cyl %>%
  map(~ ggplot(., aes(mpg, wt)) + geom_point())

paths <- paste0(names(plots), ".pdf")

walk2(paths, plots, ggsave)
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image

return value of walk()

# return value of walk is the input
out <- x %>% walk(print) 
## [1] 1
## [1] "a"
## [1] 3
str(out)
## List of 3
##  $ : num 1
##  $ : chr "a"
##  $ : num 3

walk() with pipeline

lengths <- x %>% walk(print) %>% map_dbl(length)
## [1] 1
## [1] "a"
## [1] 3

lengths
## [1] 1 1 1

plotting a histogram for each simulated samples

# Define list of functions
f <- list(Normal = "rnorm", Uniform = "runif", Exp = "rexp")

# Define params
params <- list(
  Normal = list(mean = 10),
  Uniform = list(min = 0, max = 5),
  Exp = list(rate = 5)
)

# Assign the simulated samples to sims
sims <- invoke_map(f, params, n = 50)

# Use walk() to make a histogram of each element in sims
sims %>% walk(hist)

Que: Take a quick look through the three histograms, do they have any problems?

Ans: They really needed better breaks for the bins on the x-axis.

NB1: default value for the breaks argument to hist() is “Sturges”

Sol: Need to vary two arguments to hist(): x and breaks

walk2()


# Replace "Sturges" with reasonable breaks for each sample
breaks_list <- list(
  Normal = "Sturges",
  Uniform = "Sturges",
  Exp = "Sturges"
)

# default value for the breaks argument to hist() is "Sturges"

breaks_list <- list(
  Normal = seq(6, 16, 0.5),
  Uniform = seq(0, 5, 0.25),
  Exp = seq(0, 1.5, 0.1)
)

# Use walk2() to make histograms with the right breaks
walk2(sims, breaks_list, hist)


# alternatively
sims %>% walk2(breaks_list, hist)

find breaks


# hard-coded the breaks = no good, if we change the parameters of our simulation
# generate reasonable breaks based on the actual values in our simulated samples

# Turn this snippet into find_breaks()

  rng <- range(sims[[1]], na.rm = TRUE)
  seq(rng[1], rng[2], length.out = 30)
##  [1]  7.898824  8.063608  8.228392  8.393176  8.557959  8.722743  8.887527
##  [8]  9.052311  9.217095  9.381879  9.546662  9.711446  9.876230 10.041014
## [15] 10.205798 10.370581 10.535365 10.700149 10.864933 11.029717 11.194501
## [22] 11.359284 11.524068 11.688852 11.853636 12.018420 12.183204 12.347987
## [29] 12.512771 12.677555

  # writing our own function:
  # takes a single argument x 
  # and return the sequence of breaks
  find_breaks <- function(x) {
    rng <- range(x, na.rm = TRUE)
    seq(rng[1], rng[2], length.out = 30)
  }
  
  # Check that your function works 
  # by calling find_breaks() on sims[[1]].
  find_breaks(sims[[1]])
##  [1]  7.898824  8.063608  8.228392  8.393176  8.557959  8.722743  8.887527
##  [8]  9.052311  9.217095  9.381879  9.546662  9.711446  9.876230 10.041014
## [15] 10.205798 10.370581 10.535365 10.700149 10.864933 11.029717 11.194501
## [22] 11.359284 11.524068 11.688852 11.853636 12.018420 12.183204 12.347987
## [29] 12.512771 12.677555
  # additional checks
  find_breaks(sims[[2]])
##  [1] 0.01299713 0.18460169 0.35620625 0.52781081 0.69941537 0.87101993
##  [7] 1.04262449 1.21422905 1.38583361 1.55743817 1.72904273 1.90064729
## [13] 2.07225185 2.24385641 2.41546097 2.58706553 2.75867010 2.93027466
## [19] 3.10187922 3.27348378 3.44508834 3.61669290 3.78829746 3.95990202
## [25] 4.13150658 4.30311114 4.47471570 4.64632026 4.81792482 4.98952938
  find_breaks(sims[[3]])
##  [1] 0.001431461 0.038830039 0.076228617 0.113627195 0.151025773
##  [6] 0.188424351 0.225822929 0.263221507 0.300620085 0.338018663
## [11] 0.375417241 0.412815819 0.450214397 0.487612975 0.525011553
## [16] 0.562410131 0.599808709 0.637207287 0.674605865 0.712004444
## [21] 0.749403022 0.786801600 0.824200178 0.861598756 0.898997334
## [26] 0.936395912 0.973794490 1.011193068 1.048591646 1.085990224

nice breaks

# Use map() to iterate find_breaks() over sims
nice_breaks <- map(sims, find_breaks)

# Use nice_breaks as the second argument to walk2()
# iterate over both 
#   the simulations 
#   and calculated breaks to plot histograms
walk2(sims, nice_breaks, hist)

no labels on the x-axis


# walk2(sims, nice_breaks, hist)
walk2(sims, nice_breaks, hist, xlab = "")

also nice titles

# Increase sample size to 1000
sims <- invoke_map(f, params, n = 50)
sims <- invoke_map(f, params, n = 1000)

# nice_breaks [given]
nice_breaks <- map(sims, find_breaks)

# Create a vector nice_titles
nice_titles <- c("Normal(10, 1)", "Uniform(0, 5)", "Exp(5)")


# Use pwalk() instead of walk2()
# walk2(sims, nice_breaks, hist, xlab = "")

# NB: keep xlab = "" 
#     outside of the list of arguments 
#     being iterated over 
#     since it's the same value 
#     for all three histograms
pwalk(list(x = sims, breaks = nice_breaks, main = nice_titles), hist, xlab = "")

walk() return the object you passed to it easy use in pipeline (pipeline = “a statement with lots of pipes”)


# e.g. 
str(sims)
## List of 3
##  $ Normal : num [1:1000] 10.48 10.01 9.76 10.7 10.6 ...
##  $ Uniform: num [1:1000] 3.027 0.441 4.359 0.908 1.56 ...
##  $ Exp    : num [1:1000] 0.0105 0.2805 0.1745 0.0419 0.1122 ...
tmp <- walk(sims, hist)

str(tmp)
## List of 3
##  $ Normal : num [1:1000] 10.48 10.01 9.76 10.7 10.6 ...
##  $ Uniform: num [1:1000] 3.027 0.441 4.359 0.908 1.56 ...
##  $ Exp    : num [1:1000] 0.0105 0.2805 0.1745 0.0419 0.1122 ...

# can pipe the sims object along to other functions

# e.g. want some basic summary statistics on each sample

# Pipe this along to map(), using summary() as .f
sims %>%
  walk(hist) %>%
    map(summary)

## $Normal
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   6.886   9.304  10.007  10.013  10.661  13.261 
## 
## $Uniform
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.006835 1.245027 2.437147 2.480203 3.703678 4.994127 
## 
## $Exp
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0002914 0.0604708 0.1430724 0.2045928 0.2845496 1.5235408

5 - Robust Functions

stopifnot() an error is better than a surprise

# from chapter 2: 
# finds the number of entries where vectors x and y both have missing values
both_na <- function(x, y) {
  sum(is.na(x) & is.na(y))
}

# Define troublesome x and y
x <- c(NA, NA, NA)
y <- c( 1, NA, NA, NA)

# function works and returns 3
both_na(x, y) 
## Warning in is.na(x) & is.na(y): longer object length is not a multiple of
## shorter object length
## [1] 3

# NOT COOL to pass in different length arguments


both_na <- function(x, y) {
  
  stopifnot( length(x) == length(y) )
  
  sum(is.na(x) & is.na(y))
}

# # function works and returns 3
both_na(x, y)
## Error in both_na(x, y): length(x) == length(y) is not TRUE

stop() an informative error is even better

both_na <- function(x, y) {
  # Replace condition with logical
  # if (condition) {
  if ( length(x) != length(y) ) {
    # Replace "Error" with better message
    # stop("Error", call. = FALSE)
    stop("x and y must have the same length", call. = FALSE)
  }  
  
  sum(is.na(x) & is.na(y))
}

# Call both_na() 
# verify it returns a more informative error
both_na(x, y)
## Error: x and y must have the same length

Side effects

i.e. when you run a function that alters the state of your R session

e.g.

show_missings <- function(x) {
  n <- sum(is.na(x))
  cat("Missing values: ", n, "\n", sep = "")
  x
}
# prints output to the console

plot_missings <- function(x) {
  plot(seq_along(x), is.na(x))
  x
} 
# creates a plot


exclude_missings <- function() {
  options(na.action = "na.exclude")
} 
# changes a global option

Pure if:

  • no other variables in the global environment should be changed or created
  • no output should be printed
  • no plots displayed
  • no files saved
  • no options changed

e.g.

replace_missings <- function(x, replacement) {
  x[is.na(x)] <- replacement
  x
}

functions with side effects are crucial for data analysis

You need to be aware of them, and deliberate in their usage.

Problem 1: Type-unstable functions

  • Type-inconsistent: the type of the return object depends on the input

  • Surprises occur when you’ve used a type-inconsistent function inside your own function

  • Source of surprises, type-inconsistent f:

    • single bracket subsetting [
    • sapply
  • in your functions: PoA: use type consistent functions instead (e.g. Purr funcitons) or use tests to ensure type

single bracket subsetting [

1.1745604, 0.2802373, 0.1575448, 0.3131434 returns first row:

df <- data.frame(z = 1:3, y = 2:4)
str(df[1, ]) # df
## 'data.frame':    1 obs. of  2 variables:
##  $ z: int 1
##  $ y: int 2

df <- data.frame(z = 1:3)
str(df[1, ]) # vector
##  int 1

last_row <- function(df) {
 df[nrow(df), ]
 }

df <- data.frame(x = 1:3) # one col df

# Not a row, just a vector
str(last_row(df))
##  int 3
  • solution 1:

use drop = FALSE: df[x, , drop = FALSE]


last_row <- function(df) {
  df[nrow(df), , drop = FALSE] 
 }

df <- data.frame(x = 1:3) # one col df

str(last_row(df)) # now a df
## 'data.frame':    1 obs. of  1 variable:
##  $ x: int 3
  • solution 2:

Subset the data frame like a list: df[x]

sapply

The type of output returned from sapply() depends on the type of input

df <- data.frame(
  a = 1L,
  b = 1.5,
  y = Sys.time(),
  z = ordered(1)
)

A <- sapply(df[1:4], class) 
B <- sapply(df[3:4], class)

What type of objects will be A and B be?


# A will be a list

A
## $a
## [1] "integer"
## 
## $b
## [1] "numeric"
## 
## $y
## [1] "POSIXct" "POSIXt" 
## 
## $z
## [1] "ordered" "factor"

str(A)
## List of 4
##  $ a: chr "integer"
##  $ b: chr "numeric"
##  $ y: chr [1:2] "POSIXct" "POSIXt"
##  $ z: chr [1:2] "ordered" "factor"

# B will be a character matrix.

B
##      y         z        
## [1,] "POSIXct" "ordered"
## [2,] "POSIXt"  "factor"

str(B)
##  chr [1:2, 1:2] "POSIXct" "POSIXt" "ordered" "factor"
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "y" "z"

unpredictable behaviour => shouldn’t rely on sapply() inside your own functions

Solution: purrr

call class() on the columns of df

=> expect: character output

    => map_chr()

df <- data.frame(
  a = 1L,
  b = 1.5,
  y = Sys.time(),
  z = ordered(1)
)

A <- map_chr(df[1:4], class) 
## Error: Result 3 is not a length 1 atomic vector
B <- map_chr(df[3:4], class)
## Error: Result 1 is not a length 1 atomic vector

ERROR alerts us that our assumption is wrong (that class() would return purely character output)

Solution:


# sapply calls
A <- sapply(df[1:4], class) 
B <- sapply(df[3:4], class)
C <- sapply(df[1:2], class) 

# Demonstrate type inconsistency
str(A)
## List of 4
##  $ a: chr "integer"
##  $ b: chr "numeric"
##  $ y: chr [1:2] "POSIXct" "POSIXt"
##  $ z: chr [1:2] "ordered" "factor"
str(B)
##  chr [1:2, 1:2] "POSIXct" "POSIXt" "ordered" "factor"
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "y" "z"
str(C)
##  Named chr [1:2] "integer" "numeric"
##  - attr(*, "names")= chr [1:2] "a" "b"

# Use map() to define X, Y and Z
X <- map(df[1:4], class) 
Y <- map(df[3:4], class)
Z <- map(df[1:2], class)

# Use str() to check type consistency
str(X)
## List of 4
##  $ a: chr "integer"
##  $ b: chr "numeric"
##  $ y: chr [1:2] "POSIXct" "POSIXt"
##  $ z: chr [1:2] "ordered" "factor"
str(Y)
## List of 2
##  $ y: chr [1:2] "POSIXct" "POSIXt"
##  $ z: chr [1:2] "ordered" "factor"
str(Z)
## List of 2
##  $ a: chr "integer"
##  $ b: chr "numeric"

Solution: Type Consistent:

# always return a list
# becuase used map() a type consistent function
col_classes <- function(df) {
  map(df, class)
}

# Want this function to return a character string

col_classes <- function(df) {
  
  # Assign list output to class_list
  class_list <- map(df, class)
  
  # Use map_chr() 
  # along with the numeric subsetting shortcut
  # to extract first element in class_list
  map_chr(class_list, 1)
}

# Check that our new function is:
#   type consistent
#   always returns a character vector.
df %>% col_classes() %>% str()
##  Named chr [1:4] "integer" "numeric" "POSIXct" "ordered"
##  - attr(*, "names")= chr [1:4] "a" "b" "y" "z"
df[3:4] %>% col_classes() %>% str()
##  Named chr [1:2] "POSIXct" "ordered"
##  - attr(*, "names")= chr [1:2] "y" "z"
df[1:2] %>% col_classes() %>% str()
##  Named chr [1:2] "integer" "numeric"
##  - attr(*, "names")= chr [1:2] "a" "b"

Solution: Fail Early:

col_classes <- function(df) {
  map_chr(df, class)
}

df %>% col_classes() %>% str()
## Error: Result 3 is not a length 1 atomic vector


class_list <- map(df, class)
class_list
## $a
## [1] "integer"
## 
## $b
## [1] "numeric"
## 
## $y
## [1] "POSIXct" "POSIXt" 
## 
## $z
## [1] "ordered" "factor"

# vector of lengths for the elements in class_list
map_dbl(class_list, length)
## a b y z 
## 1 1 2 2

# one and only argument to any() is a conditional statement
any(map_dbl(class_list, length) > 1)
## [1] TRUE


col_classes <- function(df) {
  class_list <- map(df, class)
  
  # Add a check that no element of class_list has length > 1
  if (any(map_dbl(class_list, length) > 1)) {
    stop("Some columns have more than one class", call. = FALSE)
  }
  
  # Use flatten_chr() to return a character vector
  flatten_chr(class_list)
}

# Check that our new function is type consistent
df %>% col_classes() %>% str()
## Error: Some columns have more than one class
df[3:4] %>% col_classes() %>% str()
## Error: Some columns have more than one class
df[1:2] %>% col_classes() %>% str()
##  chr [1:2] "integer" "numeric"

Problem 2: Non-standard evaluation

e.g. filter() - from the dplyr package

# return all rows in df 
# where the x column exceeds a certain threshold
big_x <- function(df, threshold) {
  dplyr::filter(df, x > threshold)
}
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(diamonds)
diamonds_sub <- head(diamonds,20)
diamonds_sub <- diamonds[sample(1:length(diamonds$price), 20), ]
# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 2 x 10
##   carat cut   color clarity depth table price     x     y     z
##   <dbl> <ord> <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  1.76 Ideal I     SI2      61.2    55  9714  7.79  7.76  4.76
## 2  2.07 Ideal J     VVS2     62.7    54 16617  8.12  8.17  5.11

filter() - surprising results

  • The x column doesn’t exist in df.
  • There is a threshold column in df.

unexpected outputs & NO indication (i.e. error message)

# Remove the x column from diamonds
diamonds_sub$x <- NULL

# Create variable x with value 1
x <- 1

# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 0 x 9
## # ... with 9 variables: carat <dbl>, cut <ord>, color <ord>,
## #   clarity <ord>, depth <dbl>, table <dbl>, price <int>, y <dbl>, z <dbl>

# Create a threshold column with value 100
diamonds_sub$threshold <- 100

# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 0 x 10
## # ... with 10 variables: carat <dbl>, cut <ord>, color <ord>,
## #   clarity <ord>, depth <dbl>, table <dbl>, price <int>, y <dbl>,
## #   z <dbl>, threshold <dbl>

Solution

big_x <- function(df, threshold) {
  
  # Write a check for x not being in df
  # If x is not in names(df), 
  # if ( x !in names(df) ) { - INCORRECT
  if (!"x" %in% names(df)) { 
  stop("df must contain variable called x", call. = FALSE)
  }  

  # Write a check for threshold being in df
  # If threshold is in names(df), 
  # if ( threshold in names(df) ) {
  if ("threshold" %in% names(df)) {
  stop("df must not contain variable called threshold", call. = FALSE)
  }  

  dplyr::filter(df, x > threshold)
  
  big_x(diamonds_sub, 7)
}

Problem 3: Hidden arguments

options() # global options of console
## $add.smooth
## [1] TRUE
## 
## $browserNLdisabled
## [1] FALSE
## 
## $CBoundsCheck
## [1] FALSE
## 
## $check.bounds
## [1] FALSE
## 
## $citation.bibtex.max
## [1] 1
## 
## $continue
## [1] "+ "
## 
## $contrasts
##         unordered           ordered 
## "contr.treatment"      "contr.poly" 
## 
## $defaultPackages
## [1] "datasets"  "utils"     "grDevices" "graphics"  "stats"     "methods"  
## 
## $demo.ask
## [1] "default"
## 
## $deparse.cutoff
## [1] 60
## 
## $device
## function (width = 7, height = 7, ...) 
## {
##     grDevices::pdf(NULL, width, height, ...)
## }
## <bytecode: 0x0000000015863b30>
## <environment: namespace:knitr>
## 
## $device.ask.default
## [1] FALSE
## 
## $digits
## [1] 7
## 
## $dplyr.show_progress
## [1] TRUE
## 
## $echo
## [1] FALSE
## 
## $editor
## [1] "notepad"
## 
## $encoding
## [1] "native.enc"
## 
## $example.ask
## [1] "default"
## 
## $expressions
## [1] 5000
## 
## $help.search.types
## [1] "vignette" "demo"     "help"    
## 
## $help.try.all.packages
## [1] FALSE
## 
## $help_type
## [1] "html"
## 
## $HTTPUserAgent
## [1] "R (3.5.0 x86_64-w64-mingw32 x86_64 mingw32)"
## 
## $install.packages.compile.from.source
## [1] "interactive"
## 
## $internet.info
## [1] 2
## 
## $keep.source
## [1] FALSE
## 
## $keep.source.pkgs
## [1] FALSE
## 
## $knitr.in.progress
## [1] TRUE
## 
## $locatorBell
## [1] TRUE
## 
## $mailer
## [1] "mailto"
## 
## $matprod
## [1] "default"
## 
## $max.print
## [1] 99999
## 
## $menu.graphics
## [1] TRUE
## 
## $na.action
## [1] "na.omit"
## 
## $nwarnings
## [1] 50
## 
## $OutDec
## [1] "."
## 
## $pager
## [1] "internal"
## 
## $papersize
## [1] "a4"
## 
## $PCRE_limit_recursion
## [1] NA
## 
## $PCRE_study
## [1] 10
## 
## $PCRE_use_JIT
## [1] TRUE
## 
## $pdfviewer
## [1] "C:/PROGRA~1/R/R-35~1.0/bin/x64/open.exe"
## 
## $pkgType
## [1] "both"
## 
## $prompt
## [1] "> "
## 
## $repos
##     CRAN 
## "@CRAN@" 
## 
## $scipen
## [1] 0
## 
## $show.coef.Pvalues
## [1] TRUE
## 
## $show.error.messages
## [1] TRUE
## 
## $show.signif.stars
## [1] TRUE
## 
## $showErrorCalls
## [1] TRUE
## 
## $str
## $str$strict.width
## [1] "no"
## 
## $str$digits.d
## [1] 3
## 
## $str$vec.len
## [1] 4
## 
## 
## $str.dendrogram.last
## [1] "`"
## 
## $stringsAsFactors
## [1] TRUE
## 
## $tikzMetricsDictionary
## [1] "8_-_writing-functions-in-r-tikzDictionary"
## 
## $timeout
## [1] 60
## 
## $ts.eps
## [1] 1e-05
## 
## $ts.S.compat
## [1] FALSE
## 
## $unzip
## [1] "internal"
## 
## $useFancyQuotes
## [1] FALSE
## 
## $verbose
## [1] FALSE
## 
## $warn
## [1] 0
## 
## $warning.length
## [1] 1000
## 
## $width
## [1] 75
## 
## $windowsTimeouts
## [1] 100 500
getOption("digits")
## [1] 7
options(digits = 5)
getOption("digits")
## [1] 5

download: swimming_pools.csv

download.file("http://s3.amazonaws.com/assets.datacamp.com/production/course_1478/datasets/swimming_pools.csv",
              "swimming_pools.csv")

hidden dependence: “stringsAsFactors” arg to read.csv()

# unknown result of this code:
pools <- read.csv("swimming_pools.csv")
# because the argument stringsAsFactors is not specified
# inherits value from getOption("stringsAsFactors")
#                     a global option that a user may change.

# Examine the structure of pools
# notice how the columns Name and Address are factors.
str(pools)
## 'data.frame':    20 obs. of  4 variables:
##  $ Name     : Factor w/ 20 levels "Acacia Ridge Leisure Centre",..: 1 2 3 4 5 6 19 7 8 9 ...
##  $ Address  : Factor w/ 20 levels "1 Fairlead Crescent, Manly",..: 5 20 18 10 9 11 6 15 12 17 ...
##  $ Latitude : num  -27.6 -27.6 -27.6 -27.5 -27.4 ...
##  $ Longitude: num  153 153 153 153 153 ...

# Change the global stringsAsFactor option to FALSE
options(stringsAsFactors = FALSE)

# Read in the swimming_pools.csv to pools2
pools2 <- read.csv("swimming_pools.csv")

# Examine the structure of pools2
# notice how the columns Name and Address are now characters.
str(pools2)
## 'data.frame':    20 obs. of  4 variables:
##  $ Name     : chr  "Acacia Ridge Leisure Centre" "Bellbowrie Pool" "Carole Park" "Centenary Pool (inner City)" ...
##  $ Address  : chr  "1391 Beaudesert Road, Acacia Ridge" "Sugarwood Street, Bellbowrie" "Cnr Boundary Road and Waterford Road Wacol" "400 Gregory Terrace, Spring Hill" ...
##  $ Latitude : num  -27.6 -27.6 -27.6 -27.5 -27.4 ...
##  $ Longitude: num  153 153 153 153 153 ...

options()

ideal: avoid having the return value of your own functions depend on any global options

okay: to have side effects of a function depend on global options

e.g. print() function uses getOption(“digits”) as the default for the digits argument:

print(x, digits = getOption("digits"), quote = FALSE,

e.g. print.lm() function has the options digits with default max(3, getOption(“digits”) - 3):

# We've fit a regression model of fuel efficiency on weight using the mtcars dataset
# Fit a regression model
fit <- lm(mpg ~ wt, data = mtcars)

# Look at the summary of the (fitted regression) model
# Pay particular attention to number of decimal places reported.
summary(fit)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.543 -2.365 -0.125  1.410  6.873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.285      1.878   19.86  < 2e-16 ***
## wt            -5.344      0.559   -9.56  1.3e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.05 on 30 degrees of freedom
## Multiple R-squared:  0.753,  Adjusted R-squared:  0.745 
## F-statistic: 91.4 on 1 and 30 DF,  p-value: 1.29e-10


# Set the global digits option to 2
options(digits = 2)

# Take another look at the summary
# Notice the number of decimal places has changed
# but there is no change to the underlying fit object.
summary(fit)
## 
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.543 -2.365 -0.125  1.410  6.873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.285      1.878   19.86  < 2e-16 ***
## wt            -5.344      0.559   -9.56  1.3e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3 on 30 degrees of freedom
## Multiple R-squared:  0.753,  Adjusted R-squared:  0.745 
## F-statistic: 91.4 on 1 and 30 DF,  p-value: 1.29e-10