my_fun <- function(arg1, arg2) {
# body
}
# Define ratio() function
#-- Change the function's arguments arg1 and arg2
#-- to x and y for good practice
ratio <- function(x, y) {
x/y
}
# Call ratio() with arguments 3 and 4
ratio(3,4) # matching by position
## [1] 0.75
ratio(x = 3, y = 4) # matching by name
## [1] 0.75
# Rewrite the call to follow best practices
mean(0.1,x=c(1:9, NA),TRUE)
## [1] 5
mean(x = c(1:9, NA), trim = 0.1, na.rm = TRUE)
## [1] 5
# The first argument to mean() should be c(1:9, NA)
# there's no need to name this argument
# since it comes first and has no default value.
mean(c(1:9, NA), trim = 0.1, na.rm = TRUE)
## [1] 5
# c is being used in three ways
# 1. function
# 2. name
# 3. refers to a value
c <- 3
c(c = c)
## c
## 3
# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)
## Warning in rm(y): object 'y' not found
y <- 10
f <- function(x) {
x + y
}
f(10)
## [1] 20
# Because y is not passed in as an argument to the function,
# R looks outside of the function environment
# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)
y <- 10
f <- function(x) {
y <- 5
x + y
}
f(10)
## [1] 15
# value of x is passed in as an argument to the function
# value of y is defined inside of the function
# to get a fresh session in the console
rm(x)
## Warning in rm(x): object 'x' not found
rm(y)
f <- function(x) {
y <- 5
x + y
}
f(5)
## [1] 10
# Now, what will typing y return?
y
## Error in eval(expr, envir, enclos): object 'y' not found
# y is set equal to 5 within the body of the function
# object does not exist in the global environment.
# Property 1
typeof(letters)
## [1] "character"
typeof(1:10)
## [1] "integer"
# Property 2
length(letters)
## [1] 26
x <- list("a", "b", 1:10)
length(x)
## [1] 3
# NULL used to indicate the absence of a vector
typeof(NULL)
## [1] "NULL"
length(NULL)
## [1] 0
# NA used to indicate the absence of a value in a vector
# i.e. missing value
typeof(NA)
## [1] "logical"
length(NA)
## [1] 1
x <- c(1, 2, 3, NA, 5)
x
## [1] 1 2 3 NA 5
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE
NA + 10
## [1] NA
NA / 2
## [1] NA
NA > 5
## [1] NA
10 == NA
## [1] NA
NA == NA
## [1] NA
a_without_names <- list(
1:3,
"a string",
pi,
list(-1, -5)
)
a_without_names
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] "a string"
##
## [[3]]
## [1] 3.141593
##
## [[4]]
## [[4]][[1]]
## [1] -1
##
## [[4]][[2]]
## [1] -5
a <- list(
a = 1:3,
b = "a string",
c = pi,
d = list(-1, -5)
)
a
## $a
## [1] 1 2 3
##
## $b
## [1] "a string"
##
## $c
## [1] 3.141593
##
## $d
## $d[[1]]
## [1] -1
##
## $d[[2]]
## [1] -5
a[1:2]
## $a
## [1] 1 2 3
##
## $b
## [1] "a string"
a[4]
## $d
## $d[[1]]
## [1] -1
##
## $d[[2]]
## [1] -5
str(a[4])
## List of 1
## $ d:List of 2
## ..$ : num -1
## ..$ : num -5
a[[4]]
## [[1]]
## [1] -1
##
## [[2]]
## [1] -5
str(a[[4]])
## List of 2
## $ : num -1
## $ : num -5
a[[4]][1]
## [[1]]
## [1] -1
str(a[[4]][1])
## List of 1
## $ : num -1
a[[4]][[1]]
## [1] -1
str(a[[4]][[1]])
## num -1
a[[4]][1] == a[[4]][[1]]
## [1] TRUE
# double bracket ([[]]) subsetting by index and by name.
# my_list[[1]] extracts the first element of the list my_list
# my_list[["name"]] extracts the element in my_list that is called name
# list is nested
# you can travel down the heirarchy by recursive subsetting
# mylist[[1]][["name"]] => element called name
# inside the first element of my_list
# NB: data frame is just a special kind of list
# my_df[[1]] will extract the first column of a data frame
tricky_list <- list(
nums= c(-0.08637812, -1.26158565, -0.19490648, -0.26042323, 1.04196393, 0.77327523, -0.01484575, -0.35573723, -0.25463426, -0.43844839),
y = c(FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE),
x = list("hello!", "hi!", "goodbye!", "bye!"),
model = lm(formula = mpg ~ wt, data = mtcars)
)
# typeof()
# 2nd element in tricky_list
# The double brackets should be inside the parantheses of the typeof() function
typeof(tricky_list[[2]])
## [1] "logical"
# Element called x in tricky_list
typeof(tricky_list[["x"]])
## [1] "list"
# 2nd element inside the element called x in tricky_list
typeof(tricky_list[['x']][[2]])
## [1] "character"
# Goal: drill down and pull out the slope estimate corresponding to the wt variable.
# Guess where the regression model is stored
names(tricky_list)
## [1] "nums" "y" "x" "model"
# Use names() and str() on the model element
names(tricky_list$model)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
str(tricky_list$model)
## List of 12
## $ coefficients : Named num [1:2] 37.29 -5.34
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "wt"
## $ residuals : Named num [1:32] -2.28 -0.92 -2.09 1.3 -0.2 ...
## ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ effects : Named num [1:32] -113.65 -29.116 -1.661 1.631 0.111 ...
## ..- attr(*, "names")= chr [1:32] "(Intercept)" "wt" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:32] 23.3 21.9 24.9 20.1 18.9 ...
## ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:32, 1:2] -5.657 0.177 0.177 0.177 0.177 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "wt"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.18 1.05
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 30
## $ xlevels : Named list()
## $ call : language lm(formula = mpg ~ wt, data = mtcars)
## $ terms :Classes 'terms', 'formula' language mpg ~ wt
## .. ..- attr(*, "variables")= language list(mpg, wt)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "mpg" "wt"
## .. .. .. ..$ : chr "wt"
## .. ..- attr(*, "term.labels")= chr "wt"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(mpg, wt)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
## $ model :'data.frame': 32 obs. of 2 variables:
## ..$ mpg: num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## ..$ wt : num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language mpg ~ wt
## .. .. ..- attr(*, "variables")= language list(mpg, wt)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "mpg" "wt"
## .. .. .. .. ..$ : chr "wt"
## .. .. ..- attr(*, "term.labels")= chr "wt"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(mpg, wt)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
## - attr(*, "class")= chr "lm"
names(tricky_list[["model"]])
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
str(tricky_list[["model"]])
## List of 12
## $ coefficients : Named num [1:2] 37.29 -5.34
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "wt"
## $ residuals : Named num [1:32] -2.28 -0.92 -2.09 1.3 -0.2 ...
## ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ effects : Named num [1:32] -113.65 -29.116 -1.661 1.631 0.111 ...
## ..- attr(*, "names")= chr [1:32] "(Intercept)" "wt" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:32] 23.3 21.9 24.9 20.1 18.9 ...
## ..- attr(*, "names")= chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:32, 1:2] -5.657 0.177 0.177 0.177 0.177 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "wt"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.18 1.05
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 30
## $ xlevels : Named list()
## $ call : language lm(formula = mpg ~ wt, data = mtcars)
## $ terms :Classes 'terms', 'formula' language mpg ~ wt
## .. ..- attr(*, "variables")= language list(mpg, wt)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "mpg" "wt"
## .. .. .. ..$ : chr "wt"
## .. ..- attr(*, "term.labels")= chr "wt"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(mpg, wt)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
## $ model :'data.frame': 32 obs. of 2 variables:
## ..$ mpg: num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## ..$ wt : num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language mpg ~ wt
## .. .. ..- attr(*, "variables")= language list(mpg, wt)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "mpg" "wt"
## .. .. .. .. ..$ : chr "wt"
## .. .. ..- attr(*, "term.labels")= chr "wt"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(mpg, wt)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "mpg" "wt"
## - attr(*, "class")= chr "lm"
# Subset the coefficients element
coefficients(tricky_list$model)
## (Intercept) wt
## 37.285126 -5.344472
tricky_list$model[1]
## $coefficients
## (Intercept) wt
## 37.285126 -5.344472
tricky_list[["model"]][["coefficients"]]
## (Intercept) wt
## 37.285126 -5.344472
# Subset the wt element
tricky_list$model[1][2]
## $<NA>
## NULL
tricky_list$model[[1]][2]
## wt
## -5.344472
tricky_list$model[[1]][[2]]
## [1] -5.344472
tricky_list[["model"]][["coefficients"]][["wt"]]
## [1] -5.344472
# order of recursive subsets to get the element you want
# From left to right,
# name of the elements you specify within the double brackets
# should get more and more small-scale
# my_list[[x]][[y]][[z]]
# my_list contains x,
# which contains y,
# which contains z.
df <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# sequence = (i in 1:ncol(df))
# body = print(median(df[[i]]))
# Each time our for loop iterates, i takes the next value in 1:ncol(df)
for (i in 1:ncol(df)) {
print(median(df[[i]]))
}
## [1] -0.3165703
## [1] -0.5276784
## [1] 0.1394496
## [1] 0.6225529
# not the best way to create seq
# e.g. empty df
df <- data.frame()
1:ncol(df)
## [1] 1 0
for (i in 1:ncol(df)) {
print(median(df[[i]]))
}
## Error in .subset2(x, i, exact = exact): subscript out of bounds
# sequence is now the somewhat non-sensical: 1, 0.
## Instead:
## seq_along()
## generates a sequence along the index of the object passed to it
## but handles the empty case much better
df <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# Replace the 1:ncol(df) sequence
for (i in seq_along(df)) {
print(median(df[[i]]))
}
## [1] -0.6524705
## [1] 0.07543185
## [1] -0.107055
## [1] 0.1111691
# Change the value of df
df <- data.frame()
# Repeat for loop to verify there is no error
for (i in seq_along(df)) {
print(median(df[[i]]))
}
# Before you start the loop
# allocate sufficient space for the output
# due to efficiency
# if you grow the for loop at each iteration
# (e.g. using c()),
# your for loop will be very slow.
# vector()
# to create an empty vector of given length
# two arguments
# 1. type of the vector
# ("logical", "integer", "double", "character", etc.)
# 2. length of the vector
# Create new double vector: output
output = vector(double, length = ncol(df))
## Error in vector(double, length = ncol(df)): cannot coerce type 'closure' to vector of type 'character'
output = vector(mode = "double", length = ncol(df))
for (i in seq_along(df)) {
# assign / store result to output[[i]]
output[[i]] <- median(df[[i]])
# double brackets
# for generalizability
# subsetting will work whether output is a vector or a list
}
# Print output to console
output
## numeric(0)
# snippet of code
# that successfully rescales a column to be between 0 and 1:
# (df$a - min(df$a, na.rm = TRUE)) /
# (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
# create a vector x containing the numbers 1 through 10.
x <- 1:10
# Rewrite the code snippet
# to use the temporary variable x
# instead of referring to the data frame column df$a
(x - min(x, na.rm = TRUE)) /
(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
## [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
## [8] 0.7777778 0.8888889 1.0000000
# create a vector x containing the numbers 1 through 10.
x <- 1:10
# duplicated statement = min(x, na.rm = TRUE)
# calculate it once / store the result/ refer to it
# also need the maximum value of x
# => calculate the range once
# refer to the first and second elements when they are needed.
# Define the intermediate variable rng
# to contain the range of x
# using the function range()
# Specify the na.rm() argument
# to automatically ignore any NAs in the vector
rng <- range(x, na.rm = TRUE)
# Rewrite this snippet to refer to the elements of rng
(x - rng[1]) /
(rng[2] - rng[1])
## [1] 0.0000000 0.1111111 0.2222222 0.3333333 0.4444444 0.5555556 0.6666667
## [8] 0.7777778 0.8888889 1.0000000
rescale01 <- function(x) {
rng <- range(x, na.rm = TRUE)
(x - rng[1]) /
(rng[2] - rng[1])
}
counts at how many positions two vectors, x and y, both have a missing value
# Define example vectors x and y
x <- c( 1, 2, NA, 3, NA)
y <- c(NA, 3, NA, 3, 4)
# where we know what the answer both_na(x, y) should be.
# both_na(x, y) should return 1
# since there is only one element that is missing in both x and y
# the third element.
# Count how many elements are missing in both x and y
is.na(x) # logical vector with TRUE at every position that has a missing
## [1] FALSE FALSE TRUE FALSE TRUE
sum(is.na(x & y)) # incorrect
## [1] 3
sum(is.na(x) & is.na(y)) # correct
## [1] 1
# snippet
sum(is.na(x) & is.na(y))
## [1] 1
# Our snippet is also so simple we can't write it any clearer.
# Turn this snippet into a function
both_na <- function(x,y) {
sum(is.na(x) & is.na(y))
}
# Define x, y1 and y2
x <- c(NA, NA, NA)
y1 <- c( 1, NA, NA)
y2 <- c( 1, NA, NA, NA)
# Call both_na on x, y1
both_na(x, y1)
## [1] 2
# Call both_na on x, y2
both_na(x, y2)
## Warning in is.na(x) & is.na(y): longer object length is not a multiple of
## shorter object length
## [1] 3
# bad name
f2 <- function(x) {
if (length(x) <= 1) return(NULL)
x[-length(x)]
}
x<-3; x[-length(x)]
## numeric(0)
x<-c(3,4); x[-length(x)]
## [1] 3
x<-c(3,4,5); x[-length(x)]
## [1] 3 4
# good name
remove_last <- function(x) {
if (length(x) <= 1) return(NULL)
x[-length(x)]
}
# bad arg names?
mean_ci <- function(c, nums) {
se <- sd(nums) / sqrt(length(nums))
alpha <- 1 - c
mean(nums) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
# argument nums = sample of data
# argument c controls the level of the confidence interval
# e.g.
# c = 0.95 => 95% confidence interval
# c = non-descriptive and it's the name of an existing function in R
# better name: confidence
# since it reveals the purpose of the argument:
# "to control the level of confidence for the interval"
# OR: level
# since it's the same name used for the confint function in base R
# & users may already be familiar with that name for this parameter.
# nums is not inherently bad
# but since it's the placeholder for the vector of data
# a name like x would be more recognizable to users.
# Rewrite mean_ci to take arguments named level and x
mean_ci <- function(level, x) {
se <- sd(x) / sqrt(length(x))
alpha <- 1 - level
mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
mean_ci <- function(x, level = .95) {
se <- sd(x) / sqrt(length(x))
alpha <- 1 - level
mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
# Data arguments supply the data to compute on.
# Detail arguments control the details of how the computation is done.
# Data arguments should come first
# Detail arguments should go on the end, and usually should have default values.
# pass mean_ci() an empty vector
# it returns a confidence interval with missing values at both ends
mean_ci(numeric(0))
## [1] NA NA
#? produce a warning "x was empty" and return c(-Inf, Inf)
mean_ci <- function(x, level = 0.95) {
if (length(x) == 0) {
warning("`x` was empty", call. = FALSE)
interval <- c(-Inf, Inf)
} else {
se <- sd(x) / sqrt(length(x))
alpha <- 1 - level
interval <- mean(x) +
se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
interval
}
# how hard it is now to follow the logic of the function
# an early return() makes sense
# if x is empty
# the function should immediately return c(-Inf, Inf).
# Alter the mean_ci function
mean_ci <- function(x, level = 0.95) {
# The if statement should be
# the first thing in the body of the function definition
if (length(x) == 0) {
warning("`x` was empty", call. = FALSE)
return(c(-Inf, Inf))
}
se <- sd(x) / sqrt(length(x))
alpha <- 1 - level
mean(x) + se * qnorm(c(alpha / 2, 1 - alpha / 2))
}
f <- function(x, y) {
x[is.na(x)] <- y
cat(sum(is.na(x)), y, "\n")
x
}
# What does this function do?
# Let's try to figure it out by passing in some arguments.
# Define a numeric vector x with the values 1, 2, NA, 4 and 5
x <- c(1, 2, NA, 4, 5)
# Call f() with the arguments x = x and y = 3
f(x = x, y = 3)
## 0 3
## [1] 1 2 3 4 5
# Call f() with the arguments x = x and y = 10
f(x = x, y = 10)
## 0 10
## [1] 1 2 10 4 5
so… f() takes a vector x and replaces any missing values in it with the value y.
e.g. df\(z <- f(df\)z, 0)
I know this replaces any missing values in the column df$z with the value 0
Anyone else who comes across that line is going to have to go back and find the definition of f and see if they can reason it out.
# Rename the function f() to replace_missings()
replace_missings <- function(x, replacement) {
# Change the name of the y argument to replacement
x[is.na(x)] <- replacement
cat(sum(is.na(x)), replacement, "\n")
x
}
# Rewrite the call on df$z to match our new names
df$z <- replace_missings(df$z, replacement = 0)
## 0 0
replace_missings <- function(x, replacement) {
# Define is_miss = logical that identifies the missing values in x.
is_miss <- is.na(x)
# Rewrite rest of function to refer to is_miss
x[is_miss] <- replacement
cat(sum(is_miss), replacement, "\n")
x
}
# try it:
replace_missings(df$z, replacement = 0)
## 0 0
## numeric(0)
# It would be much nicer to say "2 missing values replaced by the value 0".
bad practice: to use cat() for anything other than a print() method (a function designed just to display output).
good practice: diagnostic information => message() function unnamed arguments are pasted together with no separator (and no need for a newline at the end) and by default are printed to the screen.
replace_missings <- function(x, replacement) {
is_miss <- is.na(x)
x[is_miss] <- replacement
# Rewrite to use message()
message(sum(is_miss)," missings replaced by the value ", replacement)
x
}
# Check your new function by running on df$z
replace_missings(df$z, replacement = 0)
## 0 missings replaced by the value 0
## numeric(0)
df <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
#compute the median of each column
median(df[[1]])
## [1] 0.2522204
median(df[[2]])
## [1] 0.1837433
median(df[[3]])
## [1] 0.03352717
median(df[[4]])
## [1] 0.4091961
# That is a lot of repetition
# => reduce the duplication by using a for loop.
# Initialize output vector
output <- vector("double", ncol(df))
# Fill in the body of the for loop
for (i in seq_along(df)) {
output[i] <- median(df[[i]])
}
# View the result
output
## [1] 0.25222038 0.18374329 0.03352717 0.40919615
# another data frame df2
df2 <- data.frame(
a = rnorm(10),
b = rnorm(10),
c = rnorm(10),
d = rnorm(10)
)
# => copy and paste the for loop
# edit every reference to df to be df2 instead
output <- vector("double", ncol(df2))
for (i in seq_along(df2)) {
output[[i]] <- median(df2[[i]])
}
# repeat for df3 etc.. WRITE A FUNCTION
# From code to function: col_median()
col_median <- function(df){
# Just embed the code in the body
output <- vector("double", ncol(df))
for (i in seq_along(df)) {
output[[i]] <- median(df[[i]])
}
output
}
# Create a col_mean() function by editing col_median() to find the column means instead.
col_mean <- function(df){
# Just embed the code in the body
output <- vector("double", ncol(df))
for (i in seq_along(df)) {
output[[i]] <- mean(df[[i]])
}
output
}
# Create a col_sd() function by editing col_median() to find the column standard deviations instead.
col_sd <- function(df){
# Just embed the code in the body
output <- vector("double", ncol(df))
for (i in seq_along(df)) {
output[[i]] <- sd(df[[i]])
}
output
}
copied and pasted the function col_median two times
time to write a function again
f: take column summaries for any summary function we provide
f1 <- function(x) abs(x - mean(x)) ^ 1
f2 <- function(x) abs(x - mean(x)) ^ 2
f3 <- function(x) abs(x - mean(x)) ^ 3
# Q: How could you remove the duplication in this set of function definitions?
# A: single function with two arguments: x and power
# Add a second argument called power
f <- function(x, power) {
# Edit the body to return absolute deviations raised to power
abs(x - mean(x))^ power
}
# remove the duplication in our set of summary functions
# by requiring the function doing the summary as an input
col_summary <- function(df, fun) {
output <- vector("numeric", length(df))
for (i in seq_along(df)) {
output[[i]] <- fun(df[[i]])
}
output
}
# Find the column medians using col_median() and col_summary()
col_median(df)
## [1] 0.25222038 0.18374329 0.03352717 0.40919615
col_summary(df,median)
## [1] 0.25222038 0.18374329 0.03352717 0.40919615
# Find the column means using col_mean() and col_summary()
col_mean(df)
## [1] -0.02125549 0.20077120 -0.05484722 0.01907553
col_summary(df, mean)
## [1] -0.02125549 0.20077120 -0.05484722 0.01907553
sapply(df,mean)
## a b c d
## -0.02125549 0.20077120 -0.05484722 0.01907553
library(purrr)
map_dbl(df,mean)
## a b c d
## -0.02125549 0.20077120 -0.05484722 0.01907553
# Find the column IQRs using col_summary()
col_summary(df, IQR)
## [1] 0.7316208 0.6611129 1.1723632 1.2884701
take a vector, .x, as the first argument return .f applied to each element of .x
type of object that is returned is determined by function suffix (the part after _):
# repeat our column summaries
# using a map function
# instead of our col_summary() function
# every summary we calculated
# returned a single numeric value,
# so we'll use map_dbl().
# Load the purrr package
library(purrr)
# Use map_dbl() to find column means
map_dbl(df, mean)
## a b c d
## -0.02125549 0.20077120 -0.05484722 0.01907553
# Use map_dbl() to column medians
map_dbl(df, median)
## a b c d
## 0.25222038 0.18374329 0.03352717 0.40919615
# Use map_dbl() to find column standard deviations
map_dbl(df, sd)
## a b c d
## 0.8818169 0.7290350 0.8757765 1.1636818
the … (“dot dot dot”) argument used to pass along additional arguments to .f each time it’s called
e.g. * pass the trim argument to the mean() function: map_dbl(df, mean, trim = 0.5) * multiple arguments can be passed: map_dbl(df, mean, trim = 0.5, na.rm = TRUE)
You don’t have to specify the arguments by name, but it is good practice!
Unlikely to be argument names you might pass through the …, thereby preventing confusion about whether an argument belongs to map() or to the function being mapped.
library(nycflights13) # contains planes data frame
head(planes)
## # A tibble: 6 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed win~ EMBRAER EMB-1~ 2 55 NA Turbo~
## 2 N102UW 1998 Fixed win~ AIRBUS INDUS~ A320-~ 2 182 NA Turbo~
## 3 N103US 1999 Fixed win~ AIRBUS INDUS~ A320-~ 2 182 NA Turbo~
## 4 N104UW 1999 Fixed win~ AIRBUS INDUS~ A320-~ 2 182 NA Turbo~
## 5 N10575 2002 Fixed win~ EMBRAER EMB-1~ 2 55 NA Turbo~
## 6 N105UW 1999 Fixed win~ AIRBUS INDUS~ A320-~ 2 182 NA Turbo~
head(planes[,c(2,6:8)])
## # A tibble: 6 x 4
## year engines seats speed
## <int> <int> <int> <int>
## 1 2004 2 55 NA
## 2 1998 2 182 NA
## 3 1999 2 182 NA
## 4 1999 2 182 NA
## 5 2002 2 55 NA
## 6 1999 2 182 NA
planes <- planes[,c(2,6:8)]
# Find the mean of each column
map_dbl(planes, mean)
## year engines seats speed
## NA 1.995184 154.316376 NA
# Find the mean of each column, excluding missing values
map_dbl(planes, mean, na.rm = TRUE)
## year engines seats speed
## 2000.484010 1.995184 154.316376 236.782609
# Find the 5th percentile of each column, excluding missing values
map_dbl(planes, quantile, probs = 0.05, na.rm = TRUE)
## year engines seats speed
## 1988.0 2.0 55.0 90.5
map() will return a list
if you know what type of output you expect you are better to use the corresponding function
e.g.
map_lgl(df, mean) # either returns a logical vector or an error
## Error: Can't coerce element 1 from a double to a logical
map functions = “type consistent”
try 1st element: mean(df[[1]]) returns: a single numeric value suggesting: map_dbl()
# create df3:
A <- c(-0.4766209, -1.67653203, -0.91437464, -0.73426835, -0.91774245, 0.46111978, -0.02242416, -0.22946634, 2.85772705, 0.87772213)
D <- c(1.0625948,-0.5889523,0.4987057,1.147293,-1.3138689,0.818433,0.980729,0.1505957,1.1003666,-0.1864504)
df3 <- data.frame("A" = A, "B" = rep(c("A","B"),5), "C" = c(1:10), "D" = D, stringsAsFactors = FALSE)
# Find the columns that are numeric
map_lgl(df3, is.numeric)
## A B C D
## TRUE FALSE TRUE TRUE
# Find the type of each column
map_chr(df3, typeof)
## A B C D
## "double" "character" "integer" "double"
# Find a summary of each column
map(df3, summary)
## $A
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.67653 -0.86935 -0.35304 -0.07749 0.34023 2.85773
##
## $B
## Length Class Mode
## 10 character character
##
## $C
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.25 5.50 5.50 7.75 10.00
##
## $D
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3139 -0.1022 0.6586 0.3669 1.0421 1.1473
#an existing function
map(df, summary)
## $a
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.04919 -0.32667 0.25222 -0.02126 0.40495 1.17456
##
## $b
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0177 -0.2097 0.1837 0.2008 0.4514 1.5452
##
## $c
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.31345 -0.67938 0.03353 -0.05485 0.49298 1.39675
##
## $d
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.67295 -0.42279 0.40920 0.01908 0.86568 1.10229
# an existing function you defined
map(df, rescale01)
## $a
## [1] 1.0000000 0.0000000 0.6879776 0.5191789 0.7398066 0.4283861 0.5797533
## [8] 0.8193482 0.7516915 0.7644625
##
## $b
## [1] 0.5064432 0.5276586 0.4311417 0.8033729 0.0000000 0.2999584 0.3612123
## [8] 0.5884202 0.2361577 1.0000000
##
## $c
## [1] 0.5427619 0.5488032 0.4484678 0.0291086 0.7057732 1.0000000 0.7553328
## [8] 0.4512428 0.0000000 0.1624526
##
## $d
## [1] 0.7909682 0.6443476 0.9594164 0.0000000 1.0000000 0.4633025 0.8418539
## [8] 0.9798662 0.8710606 0.5799268
# an anonymous function defined on the fly
map(df, function(x) sum(is.na(x)))
## $a
## [1] 0
##
## $b
## [1] 0
##
## $c
## [1] 0
##
## $d
## [1] 0
# an anonymous function defined using a formula shortcut
map(df, ~ sum(is.na(.)))
## $a
## [1] 0
##
## $b
## [1] 0
##
## $c
## [1] 0
##
## $d
## [1] 0
list_of_results <- list(
list(a = 1, b = "A"),
list(a = 2, b = "C"),
list(a = 3, b = "D")
)
# an anonymous function
map_dbl(list_of_results, function(x) x[["a"]])
## [1] 1 2 3
# Shortcut: string subsetting
map_dbl(list_of_results, "a")
## [1] 1 2 3
# Shortcut: integer subsetting (by index)
map_dbl(list_of_results, 1)
## [1] 1 2 3
# Split the data frame mtcars based on the unique values in the cyl column
cyl <- split(mtcars, mtcars$cyl)
str(cyl) # list of data frames
## List of 3
## $ 4:'data.frame': 11 obs. of 11 variables:
## ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
## ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
## ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
## ..$ hp : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
## ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
## ..$ wt : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
## ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
## ..$ vs : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
## ..$ am : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
## ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
## ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...
## $ 6:'data.frame': 7 obs. of 11 variables:
## ..$ mpg : num [1:7] 21 21 21.4 18.1 19.2 17.8 19.7
## ..$ cyl : num [1:7] 6 6 6 6 6 6 6
## ..$ disp: num [1:7] 160 160 258 225 168 ...
## ..$ hp : num [1:7] 110 110 110 105 123 123 175
## ..$ drat: num [1:7] 3.9 3.9 3.08 2.76 3.92 3.92 3.62
## ..$ wt : num [1:7] 2.62 2.88 3.21 3.46 3.44 ...
## ..$ qsec: num [1:7] 16.5 17 19.4 20.2 18.3 ...
## ..$ vs : num [1:7] 0 0 1 1 1 1 0
## ..$ am : num [1:7] 1 1 0 0 0 0 1
## ..$ gear: num [1:7] 4 4 3 3 4 4 5
## ..$ carb: num [1:7] 4 4 1 1 4 4 6
## $ 8:'data.frame': 14 obs. of 11 variables:
## ..$ mpg : num [1:14] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 ...
## ..$ cyl : num [1:14] 8 8 8 8 8 8 8 8 8 8 ...
## ..$ disp: num [1:14] 360 360 276 276 276 ...
## ..$ hp : num [1:14] 175 245 180 180 180 205 215 230 150 150 ...
## ..$ drat: num [1:14] 3.15 3.21 3.07 3.07 3.07 2.93 3 3.23 2.76 3.15 ...
## ..$ wt : num [1:14] 3.44 3.57 4.07 3.73 3.78 ...
## ..$ qsec: num [1:14] 17 15.8 17.4 17.6 18 ...
## ..$ vs : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ am : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ gear: num [1:14] 3 3 3 3 3 3 3 3 3 3 ...
## ..$ carb: num [1:14] 2 4 3 3 3 4 4 4 2 2 ...
cyl[[1]]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Split the data frame mtcars based on the unique values in the cyl column
# Examine the structure of cyl
# confirm the structure of this list of data frames
str(cyl)
## List of 3
## $ 4:'data.frame': 11 obs. of 11 variables:
## ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
## ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
## ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
## ..$ hp : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
## ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
## ..$ wt : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
## ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
## ..$ vs : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
## ..$ am : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
## ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
## ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...
## $ 6:'data.frame': 7 obs. of 11 variables:
## ..$ mpg : num [1:7] 21 21 21.4 18.1 19.2 17.8 19.7
## ..$ cyl : num [1:7] 6 6 6 6 6 6 6
## ..$ disp: num [1:7] 160 160 258 225 168 ...
## ..$ hp : num [1:7] 110 110 110 105 123 123 175
## ..$ drat: num [1:7] 3.9 3.9 3.08 2.76 3.92 3.92 3.62
## ..$ wt : num [1:7] 2.62 2.88 3.21 3.46 3.44 ...
## ..$ qsec: num [1:7] 16.5 17 19.4 20.2 18.3 ...
## ..$ vs : num [1:7] 0 0 1 1 1 1 0
## ..$ am : num [1:7] 1 1 0 0 0 0 1
## ..$ gear: num [1:7] 4 4 3 3 4 4 5
## ..$ carb: num [1:7] 4 4 1 1 4 4 6
## $ 8:'data.frame': 14 obs. of 11 variables:
## ..$ mpg : num [1:14] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 ...
## ..$ cyl : num [1:14] 8 8 8 8 8 8 8 8 8 8 ...
## ..$ disp: num [1:14] 360 360 276 276 276 ...
## ..$ hp : num [1:14] 175 245 180 180 180 205 215 230 150 150 ...
## ..$ drat: num [1:14] 3.15 3.21 3.07 3.07 3.07 2.93 3 3.23 2.76 3.15 ...
## ..$ wt : num [1:14] 3.44 3.57 4.07 3.73 3.78 ...
## ..$ qsec: num [1:14] 17 15.8 17.4 17.6 18 ...
## ..$ vs : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ am : num [1:14] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ gear: num [1:14] 3 3 3 3 3 3 3 3 3 3 ...
## ..$ carb: num [1:14] 2 4 3 3 3 4 4 4 2 2 ...
# Extract the first element into four_cyls
four_cyls <- cyl[1] # incorrect
str(four_cyls)
## List of 1
## $ 4:'data.frame': 11 obs. of 11 variables:
## ..$ mpg : num [1:11] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
## ..$ cyl : num [1:11] 4 4 4 4 4 4 4 4 4 4 ...
## ..$ disp: num [1:11] 108 146.7 140.8 78.7 75.7 ...
## ..$ hp : num [1:11] 93 62 95 66 52 65 97 66 91 113 ...
## ..$ drat: num [1:11] 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
## ..$ wt : num [1:11] 2.32 3.19 3.15 2.2 1.61 ...
## ..$ qsec: num [1:11] 18.6 20 22.9 19.5 18.5 ...
## ..$ vs : num [1:11] 1 1 1 1 1 1 1 1 0 1 ...
## ..$ am : num [1:11] 1 0 0 1 1 1 0 1 1 1 ...
## ..$ gear: num [1:11] 4 4 4 4 4 4 3 4 5 5 ...
## ..$ carb: num [1:11] 1 2 2 1 2 1 1 1 2 2 ...
four_cyls <- cyl[[1]] # correct
str(four_cyls)
## 'data.frame': 11 obs. of 11 variables:
## $ mpg : num 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26 30.4 ...
## $ cyl : num 4 4 4 4 4 4 4 4 4 4 ...
## $ disp: num 108 146.7 140.8 78.7 75.7 ...
## $ hp : num 93 62 95 66 52 65 97 66 91 113 ...
## $ drat: num 3.85 3.69 3.92 4.08 4.93 4.22 3.7 4.08 4.43 3.77 ...
## $ wt : num 2.32 3.19 3.15 2.2 1.61 ...
## $ qsec: num 18.6 20 22.9 19.5 18.5 ...
## $ vs : num 1 1 1 1 1 1 1 1 0 1 ...
## $ am : num 1 0 0 1 1 1 0 1 1 1 ...
## $ gear: num 4 4 4 4 4 4 3 4 5 5 ...
## $ carb: num 1 2 2 1 2 1 1 1 2 2 ...
# Fit a linear regression of miles per gallon on weight
# Fit a linear regression of mpg on wt using four_cyls
# To fit a linear model with a
# response variable y and explanatory variable x,
# you can call the lm() function with the formula y ~ x.
# incorrect:
# lm(four_cyls$mpg ~ four_cyls$wt,four_cyls)
lm(mpg ~ wt, data = four_cyls)
##
## Call:
## lm(formula = mpg ~ wt, data = four_cyls)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
Now we have a snippet of code that performs the operation we want on one data frame.
# create function
fit_reg <- function(df) {
lm(mpg ~ wt, data = df)
}
# pass this to map
map(cyl, fit_reg)
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
# Rewrite to call an anonymous function
# incorrect
# map(cyl, lm(mpg ~ wt, data = df))
# The anonymous function is defined as
# function(df) lm(y ~ x, data = df)
# which should be the .f argument
# to the function map().
map(cyl, function(df) lm(mpg ~ wt, data = df))
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
“purrr” provides a shortcut that allows you to re write an anonymous function: * as a one-sided formula * starts with a ~ * followed by an R expression
In purrr’s map functions: “R expression” can refer to an element of the .x argument using the . character.
use ananonymous function:
map_dbl(cyl, function(df) mean(df$disp))
## 4 6 8
## 105.1364 183.3143 353.1000
use formula shortcut:
# replace the function definition (function(df))
# with the ~
# to refer to the element of cyl the function operates on (in this case df)
# we use a .
map_dbl(cyl, ~ mean(.$disp))
## 4 6 8
## 105.1364 183.3143 353.1000
# Rewrite:
map(cyl, function(df) lm(mpg ~ wt, data = df))
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = df)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
# to use the formula shortcut:
map(cyl, ~ lm(mpg ~ wt, data = .))
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
if: .f argument to a map function is set equal to a string, let’s say “name” then: purrr extracts the “name” element from every element of .x
# list of where every element contains an a and b element:
list_of_results <- list(
list(a = 1, b = "A"),
list(a = 2, b = "C"),
list(a = 3, b = "D")
)
# pull out the a element from every entry
# string shortcut
map(list_of_results, "a")
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 3
# Save the result from the previous exercise to the variable models
# fit the models
models <- map(cyl, ~ lm(mpg ~ wt, data = .))
# get the coefficients for each model
coefs <- map(models, coef)
# Use string shortcut to extract the wt coefficient
map(coefs, "wt")
## $`4`
## [1] -5.647025
##
## $`6`
## [1] -2.780106
##
## $`8`
## [1] -2.192438
# Extract the second element from each vector in coefs
# using
# numeric shortcut
# map_dbl() [pulling out a single numeric value
# from each element]
map_dbl(coefs, 2)
## 4 6 8
## -5.647025 -2.780106 -2.192438
x %>% f(y) == f(x, y)
LHS “x” == 1st argument to RHS f()
# split the data frame mtcars
cyl <- split(mtcars, mtcars$cyl)
# fit models
# pass cyl as the first argument to map
map(cyl, ~ lm(mpg ~ wt, data = .))
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
# REWRITE with PIPE OPERATOR:
# split the data frame mtcars on cyl
split(mtcars, mtcars$cyl) %>%
# use map() on the result
map(~ lm(mpg ~ wt, data = .))
## $`4`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 39.571 -5.647
##
##
## $`6`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 28.41 -2.78
##
##
## $`8`
##
## Call:
## lm(formula = mpg ~ wt, data = .)
##
## Coefficients:
## (Intercept) wt
## 23.868 -2.192
# chain together many operations
mtcars %>%
split(mtcars$cyl) %>%
map(~ lm(mpg ~ wt, data = .)) %>%
map(coef) %>%
map_dbl("wt")
## 4 6 8
## -5.647025 -2.780106 -2.192438
# Now, pull out the R2 from each model
models <- mtcars %>%
split(mtcars$cyl) %>%
map(~ lm(mpg ~ wt, data = .))
# Rewrite to be a single command using pipes
# summaries <- map(models, summary)
# map_dbl(summaries, "r.squared")
# LHS = models
# RHS = map
models %>%
map(summary) %>%
map_dbl("r.squared")
## 4 6 8
## 0.5086326 0.4645102 0.4229655
argument = function returns = function
never throws an error (and never stops the rest of your computation!)
returns a list with two elements:
# pass readLines() to safely()
safe_readLines <- safely(readLines)
# Call safe_readLines() on "http://example.org"
safe_readLines("http://example.org")
## $result
## [1] "<!doctype html>"
## [2] "<html>"
## [3] "<head>"
## [4] " <title>Example Domain</title>"
## [5] ""
## [6] " <meta charset=\"utf-8\" />"
## [7] " <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"
## [8] " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"
## [9] " <style type=\"text/css\">"
## [10] " body {"
## [11] " background-color: #f0f0f2;"
## [12] " margin: 0;"
## [13] " padding: 0;"
## [14] " font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"
## [15] " "
## [16] " }"
## [17] " div {"
## [18] " width: 600px;"
## [19] " margin: 5em auto;"
## [20] " padding: 50px;"
## [21] " background-color: #fff;"
## [22] " border-radius: 1em;"
## [23] " }"
## [24] " a:link, a:visited {"
## [25] " color: #38488f;"
## [26] " text-decoration: none;"
## [27] " }"
## [28] " @media (max-width: 700px) {"
## [29] " body {"
## [30] " background-color: #fff;"
## [31] " }"
## [32] " div {"
## [33] " width: auto;"
## [34] " margin: 0 auto;"
## [35] " border-radius: 0;"
## [36] " padding: 1em;"
## [37] " }"
## [38] " }"
## [39] " </style> "
## [40] "</head>"
## [41] ""
## [42] "<body>"
## [43] "<div>"
## [44] " <h1>Example Domain</h1>"
## [45] " <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] " domain in examples without prior coordination or asking for permission.</p>"
## [47] " <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"
## [48] "</div>"
## [49] "</body>"
## [50] "</html>"
##
## $error
## NULL
# Call safe_readLines() on "http://asdfasdasdkfjlda"
safe_readLines("http://asdfasdasdkfjlda")
## Warning in file(con, "r"): InternetOpenUrl failed: 'The server name or
## address could not be resolved'
## $result
## NULL
##
## $error
## <simpleError in file(con, "r"): cannot open the connection>
urls <- list(
example = "http://example.org",
rproj = "http://www.r-project.org",
asdf = "http://asdfasdasdkfjlda"
)
# download the HTML files at each URL
### map(urls, readLines)
### Error in file(con, "r") : cannot open the connection
### no output for any of the URLs
### :(
### solve this problem
### :)
### by using our safe_readLines()
html <- map(urls, safe_readLines)
## Warning in file(con, "r"): InternetOpenUrl failed: 'The server name or
## address could not be resolved'
# Warning message:
# URL 'http://asdfasdasdkfjlda/': status was 'Couldn't resolve host name'
# html
# contains
# the HTML for each of the two URLs
# on which readLines() was successful
# and the error for the other
# BUT: buried in the inner-most level of the list.
str(html)
## List of 3
## $ example:List of 2
## ..$ result: chr [1:50] "<!doctype html>" "<html>" "<head>" " <title>Example Domain</title>" ...
## ..$ error : NULL
## $ rproj :List of 2
## ..$ result: chr [1:111] "<!DOCTYPE html>" "<html lang=\"en\">" " <head>" " <meta charset=\"utf-8\">" ...
## ..$ error : NULL
## $ asdf :List of 2
## ..$ result: NULL
## ..$ error :List of 2
## .. ..$ message: chr "cannot open the connection"
## .. ..$ call : language file(con, "r")
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
# Extract the "result"
# from one of the two elements that was successful
# using double square bracket subsetting.
html[[1]][['result']]
## [1] "<!doctype html>"
## [2] "<html>"
## [3] "<head>"
## [4] " <title>Example Domain</title>"
## [5] ""
## [6] " <meta charset=\"utf-8\" />"
## [7] " <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"
## [8] " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"
## [9] " <style type=\"text/css\">"
## [10] " body {"
## [11] " background-color: #f0f0f2;"
## [12] " margin: 0;"
## [13] " padding: 0;"
## [14] " font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"
## [15] " "
## [16] " }"
## [17] " div {"
## [18] " width: 600px;"
## [19] " margin: 5em auto;"
## [20] " padding: 50px;"
## [21] " background-color: #fff;"
## [22] " border-radius: 1em;"
## [23] " }"
## [24] " a:link, a:visited {"
## [25] " color: #38488f;"
## [26] " text-decoration: none;"
## [27] " }"
## [28] " @media (max-width: 700px) {"
## [29] " body {"
## [30] " background-color: #fff;"
## [31] " }"
## [32] " div {"
## [33] " width: auto;"
## [34] " margin: 0 auto;"
## [35] " border-radius: 0;"
## [36] " padding: 1em;"
## [37] " }"
## [38] " }"
## [39] " </style> "
## [40] "</head>"
## [41] ""
## [42] "<body>"
## [43] "<div>"
## [44] " <h1>Example Domain</h1>"
## [45] " <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] " domain in examples without prior coordination or asking for permission.</p>"
## [47] " <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"
## [48] "</div>"
## [49] "</body>"
## [50] "</html>"
# Extract the "error"
# from the element that was unsuccessful
# again using double square bracket subsetting.
html[[3]][['error']]
## <simpleError in file(con, "r"): cannot open the connection>
nested_list <- list(
x1 = list(a = 1, b = 2),
x2 = list(a = 3, b = 4)
)
# extract the a element in x1
nested_list[["x1"]][["a"]]
## [1] 1
# transpose the list first, the order of subsetting reverses
transpose(nested_list)[["a"]][["x1"]]
## [1] 1
# handy for safe output
# can easily grab:
# all the results
# or all the errors
str(transpose(html))
## List of 2
## $ result:List of 3
## ..$ example: chr [1:50] "<!doctype html>" "<html>" "<head>" " <title>Example Domain</title>" ...
## ..$ rproj : chr [1:111] "<!DOCTYPE html>" "<html lang=\"en\">" " <head>" " <meta charset=\"utf-8\">" ...
## ..$ asdf : NULL
## $ error :List of 3
## ..$ example: NULL
## ..$ rproj : NULL
## ..$ asdf :List of 2
## .. ..$ message: chr "cannot open the connection"
## .. ..$ call : language file(con, "r")
## .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
res <- transpose(html)[["result"]]
errs <- transpose(html)[["error"]]
# collect all the results for the elements that were successful
# examine the inputs for all those that weren't
# Create a logical vector
# TRUE when errs is NULL
is_ok <- map_lgl(errs, is_null)
# Extract the successful results
# subsetting res with is_ok
res[is_ok]
## $example
## [1] "<!doctype html>"
## [2] "<html>"
## [3] "<head>"
## [4] " <title>Example Domain</title>"
## [5] ""
## [6] " <meta charset=\"utf-8\" />"
## [7] " <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\" />"
## [8] " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\" />"
## [9] " <style type=\"text/css\">"
## [10] " body {"
## [11] " background-color: #f0f0f2;"
## [12] " margin: 0;"
## [13] " padding: 0;"
## [14] " font-family: \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;"
## [15] " "
## [16] " }"
## [17] " div {"
## [18] " width: 600px;"
## [19] " margin: 5em auto;"
## [20] " padding: 50px;"
## [21] " background-color: #fff;"
## [22] " border-radius: 1em;"
## [23] " }"
## [24] " a:link, a:visited {"
## [25] " color: #38488f;"
## [26] " text-decoration: none;"
## [27] " }"
## [28] " @media (max-width: 700px) {"
## [29] " body {"
## [30] " background-color: #fff;"
## [31] " }"
## [32] " div {"
## [33] " width: auto;"
## [34] " margin: 0 auto;"
## [35] " border-radius: 0;"
## [36] " padding: 1em;"
## [37] " }"
## [38] " }"
## [39] " </style> "
## [40] "</head>"
## [41] ""
## [42] "<body>"
## [43] "<div>"
## [44] " <h1>Example Domain</h1>"
## [45] " <p>This domain is established to be used for illustrative examples in documents. You may use this"
## [46] " domain in examples without prior coordination or asking for permission.</p>"
## [47] " <p><a href=\"http://www.iana.org/domains/example\">More information...</a></p>"
## [48] "</div>"
## [49] "</body>"
## [50] "</html>"
##
## $rproj
## [1] "<!DOCTYPE html>"
## [2] "<html lang=\"en\">"
## [3] " <head>"
## [4] " <meta charset=\"utf-8\">"
## [5] " <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">"
## [6] " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">"
## [7] " <title>R: The R Project for Statistical Computing</title>"
## [8] ""
## [9] " <link rel=\"icon\" type=\"image/png\" href=\"/favicon-32x32.png\" sizes=\"32x32\" />"
## [10] " <link rel=\"icon\" type=\"image/png\" href=\"/favicon-16x16.png\" sizes=\"16x16\" />"
## [11] ""
## [12] " <!-- Bootstrap -->"
## [13] " <link href=\"/css/bootstrap.min.css\" rel=\"stylesheet\">"
## [14] " <link href=\"/css/R.css\" rel=\"stylesheet\">"
## [15] ""
## [16] " <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->"
## [17] " <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->"
## [18] " <!--[if lt IE 9]>"
## [19] " <script src=\"https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js\"></script>"
## [20] " <script src=\"https://oss.maxcdn.com/respond/1.4.2/respond.min.js\"></script>"
## [21] " <![endif]-->"
## [22] " </head>"
## [23] " <body>"
## [24] " <div class=\"container page\">"
## [25] " <div class=\"row\">"
## [26] " <div class=\"col-xs-12 col-sm-offset-1 col-sm-2 sidebar\" role=\"navigation\">"
## [27] "<div class=\"row\">"
## [28] "<div class=\"col-xs-6 col-sm-12\">"
## [29] "<p><a href=\"/\"><img src=\"/Rlogo.png\" width=\"100\" height=\"78\" alt = \"R\" /></a></p>"
## [30] "<p><small><a href=\"/\">[Home]</a></small></p>"
## [31] "<h2 id=\"download\">Download</h2>"
## [32] "<p><a href=\"http://cran.r-project.org/mirrors.html\">CRAN</a></p>"
## [33] "<h2 id=\"r-project\">R Project</h2>"
## [34] "<ul>"
## [35] "<li><a href=\"/about.html\">About R</a></li>"
## [36] "<li><a href=\"/logo/\">Logo</a></li>"
## [37] "<li><a href=\"/contributors.html\">Contributors</a></li>"
## [38] "<li><a href=\"/news.html\">Whatâ\200\231s New?</a></li>"
## [39] "<li><a href=\"/bugs.html\">Reporting Bugs</a></li>"
## [40] "<li><a href=\"http://developer.R-project.org\">Development Site</a></li>"
## [41] "<li><a href=\"/conferences.html\">Conferences</a></li>"
## [42] "<li><a href=\"/search.html\">Search</a></li>"
## [43] "</ul>"
## [44] "</div>"
## [45] "<div class=\"col-xs-6 col-sm-12\">"
## [46] "<h2 id=\"r-foundation\">R Foundation</h2>"
## [47] "<ul>"
## [48] "<li><a href=\"/foundation/\">Foundation</a></li>"
## [49] "<li><a href=\"/foundation/board.html\">Board</a></li>"
## [50] "<li><a href=\"/foundation/members.html\">Members</a></li>"
## [51] "<li><a href=\"/foundation/donors.html\">Donors</a></li>"
## [52] "<li><a href=\"/foundation/donations.html\">Donate</a></li>"
## [53] "</ul>"
## [54] "<h2 id=\"help-with-r\">Help With R</h2>"
## [55] "<ul>"
## [56] "<li><a href=\"/help.html\">Getting Help</a></li>"
## [57] "</ul>"
## [58] "<h2 id=\"documentation\">Documentation</h2>"
## [59] "<ul>"
## [60] "<li><a href=\"http://cran.r-project.org/manuals.html\">Manuals</a></li>"
## [61] "<li><a href=\"http://cran.r-project.org/faqs.html\">FAQs</a></li>"
## [62] "<li><a href=\"http://journal.r-project.org\">The R Journal</a></li>"
## [63] "<li><a href=\"/doc/bib/R-books.html\">Books</a></li>"
## [64] "<li><a href=\"/certification.html\">Certification</a></li>"
## [65] "<li><a href=\"/other-docs.html\">Other</a></li>"
## [66] "</ul>"
## [67] "<h2 id=\"links\">Links</h2>"
## [68] "<ul>"
## [69] "<li><a href=\"http://www.bioconductor.org\">Bioconductor</a></li>"
## [70] "<li><a href=\"/other-projects.html\">Related Projects</a></li>"
## [71] "<li><a href=\"/gsoc.html\">GSoC</a></li>"
## [72] "</ul>"
## [73] "</div>"
## [74] "</div>"
## [75] " </div>"
## [76] " <div class=\"col-xs-12 col-sm-7\">"
## [77] " <h1>The R Project for Statistical Computing</h1>"
## [78] "<h2 id=\"getting-started\">Getting Started</h2>"
## [79] "<p>R is a free software environment for statistical computing and graphics. It compiles and runs on a wide variety of UNIX platforms, Windows and MacOS. To <strong><a href=\"https://cran.r-project.org/mirrors.html\">download R</a></strong>, please choose your preferred <a href=\"https://cran.r-project.org/mirrors.html\">CRAN mirror</a>.</p>"
## [80] "<p>If you have questions about R like how to download and install the software, or what the license terms are, please read our <a href=\"https://cran.R-project.org/faqs.html\">answers to frequently asked questions</a> before you send an email.</p>"
## [81] "<h2 id=\"news\">News</h2>"
## [82] "<ul>"
## [83] "<li><p><a href=\"https://cran.r-project.org/src/base/R-3\"><strong>R version 3.5.0 (Joy in Playing)</strong></a> has been released on 2018-04-23.</p></li>"
## [84] "<li><p><a href=\"https://cran.r-project.org/src/base/R-3\"><strong>R version 3.4.4 (Someone to Lean On)</strong></a> has been released on 2018-03-15.</p></li>"
## [85] "<li><p><strong>useR! 2018</strong> (July 10 - 13 in Brisbane) is open for registration at <a href=\"https://user2018.r-project.org\"><strong>https://user2018.r-project.org</strong></a></p></li>"
## [86] "<li><p><a href=\"https://journal.r-project.org/archive/2017-2\"><strong>The R Journal Volume 9/2</strong></a> is available.</p></li>"
## [87] "<li><p><strong>useR! 2017</strong> took place July 4 - 7 in Brussels <a href=\"https://user2017.brussels\"><strong>https://user2017.brussels</strong></a></p></li>"
## [88] "<li><p>The <a href=\"https://www.r-project.org/logo\"><strong>R Logo</strong></a> is available for download in high-resolution PNG or SVG formats.</p></li>"
## [89] "</ul>"
## [90] "<!--- (Boilerplate for release run-in)"
## [91] "- [**R version 3.1.3 (Smooth Sidewalk) prerelease versions**](http://cran.r-project.org/src/base-prerelease) will appear starting February 28. Final release is scheduled for 2015-03-09."
## [92] "-->"
## [93] " </div>"
## [94] " </div>"
## [95] " <div class=\"raw footer\">"
## [96] " © The R Foundation. For queries about this web site, please contact"
## [97] "\t<script type='text/javascript'>"
## [98] "<!--"
## [99] "var s=\"=b!isfg>#nbjmup;xfcnbtufsAs.qspkfdu/psh#?uif!xfcnbtufs=0b?\";"
## [100] "m=\"\"; for (i=0; i<s.length; i++) {if(s.charCodeAt(i) == 28){m+= '&';} else if (s.charCodeAt(i) == 23) {m+= '!';} else {m+=String.fromCharCode(s.charCodeAt(i)-1);}}document.write(m);//-->"
## [101] "\t</script>;"
## [102] " for queries about R itself, please consult the "
## [103] " <a href=\"help.html\">Getting Help</a> section."
## [104] " </div>"
## [105] " </div>"
## [106] " <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->"
## [107] " <script src=\"https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js\"></script>"
## [108] " <!-- Include all compiled plugins (below), or include individual files as needed -->"
## [109] " <script src=\"/js/bootstrap.min.js\"></script>"
## [110] " </body>"
## [111] "</html>"
# Extract the input from the unsuccessful results
# subsetting urls with !is_ok
urls[!is_ok]
## $asdf
## [1] "http://asdfasdasdkfjlda"
# simulate 5 random numbers from a Normal distribution
# with mean zero
rnorm(n = 5)
## [1] -0.7184978 -1.3529178 1.0179859 1.8410276 2.6965366
# repeat three times, but each time with a different sample size
rnorm(5)
## [1] 0.04567558 -0.73672255 -0.08525783 -0.05609098 0.12507143
rnorm(10)
## [1] -0.07671131 0.50262646 0.18372367 -1.01890939 -0.05305828
## [6] -0.61703311 0.30724594 1.51973679 0.08070157 -0.24716776
rnorm(20)
## [1] 0.1349956 -2.9998807 -1.2199234 -0.8803089 -1.1105952 0.4644794
## [7] 0.7982421 0.1093767 -0.4790875 -1.1688789 0.4490483 0.2351042
## [13] -0.7701425 0.3470370 1.8589856 1.4977189 1.8743996 0.2511965
## [19] 0.8226129 -0.6310207
# alternatively:
map(list(5, 10, 20), rnorm)
## [[1]]
## [1] -1.1936940 2.3753668 1.5358066 0.4415677 -0.1527810
##
## [[2]]
## [1] 0.2668639 0.9275570 2.9852474 1.2493899 0.1304897 0.6778199
## [7] 2.3413737 -2.5569637 -0.4942210 -0.6099386
##
## [[3]]
## [1] -0.370147792 1.360196772 -1.638766531 -1.671530675 -1.725087738
## [6] 1.131453269 -0.813178258 -0.166746486 0.002313656 2.692439552
## [11] 0.692029181 0.652989141 -0.357393210 1.013485874 0.534926318
## [16] 0.672021196 -0.094722838 -0.035539840 1.161120123 0.938780186
# i.e.
n <- list(5, 10, 20)
map(n, rnorm)
## [[1]]
## [1] -2.5350166 1.2194725 -0.3878960 -0.7289417 0.3434182
##
## [[2]]
## [1] 2.2038646 -1.7883462 -1.6556874 -0.2474666 0.4234609 -0.6239097
## [7] -1.6380009 -0.5966542 0.2046613 0.1490992
##
## [[3]]
## [1] -0.3740660 0.3592354 1.0839342 -1.5524715 1.8563490 -1.2255496
## [7] -1.7703197 0.5754285 0.8210805 -1.7213611 -0.4213728 -0.2523393
## [13] -1.5009499 0.7682673 0.5812566 -1.4294271 -0.4173622 0.0908106
## [19] 0.7927532 -0.4239674
# rnorm(n, mean = 0, sd = 1)
rnorm(5, mean = 1)
## [1] 1.705974 -0.217439 1.602343 1.029560 0.691539
rnorm(10, mean = 5)
## [1] 6.146697 5.795837 4.755039 5.690232 5.047908 4.220321 4.137719
## [8] 4.098953 4.708845 5.433843
rnorm(20, mean = 10)
## [1] 9.132280 9.416960 11.432265 10.125283 11.300966 9.877106 9.354803
## [8] 10.165987 11.324978 9.868663 10.785895 9.860812 9.142252 8.603288
## [15] 10.696172 10.788850 9.932629 11.236169 8.661722 11.750454
# alternatively:
# map2(.x, .y, .f, ...)
map2(list(5, 10, 20), list(1, 5, 10), rnorm)
## [[1]]
## [1] 1.06990843 1.11426759 1.51754138 -0.01431911 1.35857722
##
## [[2]]
## [1] 4.491806 6.132983 6.020320 4.938282 3.659294 4.029250 4.922837
## [8] 5.056045 5.087867 5.335729
##
## [[3]]
## [1] 10.595610 12.247691 11.176335 7.806663 9.662283 10.496946 9.961202
## [8] 8.100465 12.666928 11.677195 9.677266 8.800581 8.689409 9.576068
## [15] 11.117679 9.524646 7.946224 10.190701 10.188143 9.432270
# i.e.
mu <- list(1, 5, 10)
map2(n,mu, rnorm)
## [[1]]
## [1] 1.0996344 1.0934606 0.5361912 1.4577187 0.4361098
##
## [[2]]
## [1] 5.877185 3.794603 4.505450 5.521853 6.091761 5.255705 4.103266
## [8] 2.679487 5.807915 5.426072
##
## [[3]]
## [1] 11.736617 11.151447 11.270809 11.073334 9.505712 11.415830 9.989987
## [8] 10.079329 9.452842 11.519678 9.954787 11.642103 12.280424 8.187645
## [15] 8.697277 9.867203 9.378562 10.157019 10.327322 9.313307
rnorm(5, mean = 1, sd = 0.1)
## [1] 0.9933568 1.0734781 0.9982540 1.1825412 0.9935860
rnorm(10, mean = 5, sd = 0.5)
## [1] 4.710702 6.329808 4.578451 4.609126 5.015731 5.239718 4.742876
## [8] 5.492697 5.358975 5.671041
rnorm(20, mean = 10, sd = 0.1)
## [1] 9.988266 10.018874 9.995430 9.996626 9.930426 10.059853 9.799281
## [8] 10.046185 9.907805 10.153146 9.890998 9.915099 10.019474 10.043388
## [15] 10.178831 9.866670 10.168416 9.969079 9.981466 10.004589
# alternatively:
# pmap(.l, .f, ...)
pmap(list(n = list(5, 10, 20),
mean = list(1, 5, 10),
sd = list(0.1, 0.5, 0.1)), rnorm)
## [[1]]
## [1] 1.0251159 0.9143462 1.0623979 1.1446805 0.6986038
##
## [[2]]
## [1] 4.634960 4.233074 4.206222 5.077710 5.219386 4.073963 4.777418
## [8] 5.357986 5.107987 5.227026
##
## [[3]]
## [1] 9.826272 9.850642 10.150864 10.069276 10.208196 9.955140 10.014815
## [8] 10.200067 10.065475 9.884640 10.021479 9.900215 9.989499 9.959591
## [15] 10.028232 10.075510 9.845512 9.925291 10.032233 10.002264
# i.e.
sd = list(0.1, 0.5, 0.1)
pmap(list(mean = mu, n = n, sd = sd), rnorm)
## [[1]]
## [1] 0.9719579 0.9079359 1.0821508 1.2720144 1.0640284
##
## [[2]]
## [1] 5.751928 4.907903 5.053093 5.180628 5.135509 5.801066 4.707341
## [8] 5.496773 4.677394 3.765233
##
## [[3]]
## [1] 9.793010 9.972128 9.741273 9.921614 9.840243 10.128178 9.950890
## [8] 9.813654 9.876345 10.053874 10.131098 10.032148 10.105890 9.923602
## [15] 9.963030 9.912236 9.969536 10.041416 9.918537 9.988366
rnorm(5)
## [1] 0.1318228 -0.1430492 1.6102036 -0.5401508 -1.1395560
runif(5)
## [1] 0.3378201 0.9984885 0.3758897 0.1533140 0.9891426
rexp(5)
## [1] 0.2122709 0.2682666 0.3925914 0.9188603 1.5862613
# alternatively:
# invoke_map(.f, .x = list(NULL), ...)
invoke_map(list(rnorm, runif, rexp), n = 5)
## [[1]]
## [1] -1.0460192 -0.3928283 -0.9014461 -0.6139352 0.1757031
##
## [[2]]
## [1] 0.557859152 0.622239655 0.004652569 0.333880262 0.827704524
##
## [[3]]
## [1] 0.8416657 0.5616720 0.7815147 0.1123118 0.2611983
# Define list of functions
f <- list("rnorm", "runif", "rexp")
# Parameter list for rnorm()
rnorm_params <- list(mean = 10)
# Add a min element with value 0 and max element with value 5
runif_params <- list()
runif_params <- list(min = 0, max = 5)
# Add a rate element with value 5
rexp_params <- list()
rexp_params <- list(rate = 5)
# Define params for each function
params <- list(
rnorm_params,
runif_params,
rexp_params
)
# Call invoke_map() on f supplying params as the second argument
invoke_map(f, params, n = 5)
## [[1]]
## [1] 8.678056 11.960764 11.905008 10.915172 7.749312
##
## [[2]]
## [1] 2.7869078 0.7286203 2.7382496 0.7758526 1.4469598
##
## [[3]]
## [1] 0.13281932 0.03883795 0.09731466 0.45826979 0.08146358
designed for functions that don’t return anything e.g. functions with side effects like printing, plotting or saving
x <- list(1, "a", 3)
x %>% walk(print)
## [1] 1
## [1] "a"
## [1] 3
library(ggplot2)
plots <- cyl %>%
map(~ ggplot(., aes(mpg, wt)) + geom_point())
paths <- paste0(names(plots), ".pdf")
walk2(paths, plots, ggsave)
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## Saving 7 x 5 in image
# return value of walk is the input
out <- x %>% walk(print)
## [1] 1
## [1] "a"
## [1] 3
str(out)
## List of 3
## $ : num 1
## $ : chr "a"
## $ : num 3
lengths <- x %>% walk(print) %>% map_dbl(length)
## [1] 1
## [1] "a"
## [1] 3
lengths
## [1] 1 1 1
# Define list of functions
f <- list(Normal = "rnorm", Uniform = "runif", Exp = "rexp")
# Define params
params <- list(
Normal = list(mean = 10),
Uniform = list(min = 0, max = 5),
Exp = list(rate = 5)
)
# Assign the simulated samples to sims
sims <- invoke_map(f, params, n = 50)
# Use walk() to make a histogram of each element in sims
sims %>% walk(hist)
Que: Take a quick look through the three histograms, do they have any problems?
Ans: They really needed better breaks for the bins on the x-axis.
NB1: default value for the breaks argument to hist() is “Sturges”
Sol: Need to vary two arguments to hist(): x and breaks
# Replace "Sturges" with reasonable breaks for each sample
breaks_list <- list(
Normal = "Sturges",
Uniform = "Sturges",
Exp = "Sturges"
)
# default value for the breaks argument to hist() is "Sturges"
breaks_list <- list(
Normal = seq(6, 16, 0.5),
Uniform = seq(0, 5, 0.25),
Exp = seq(0, 1.5, 0.1)
)
# Use walk2() to make histograms with the right breaks
walk2(sims, breaks_list, hist)
# alternatively
sims %>% walk2(breaks_list, hist)
# hard-coded the breaks = no good, if we change the parameters of our simulation
# generate reasonable breaks based on the actual values in our simulated samples
# Turn this snippet into find_breaks()
rng <- range(sims[[1]], na.rm = TRUE)
seq(rng[1], rng[2], length.out = 30)
## [1] 7.898824 8.063608 8.228392 8.393176 8.557959 8.722743 8.887527
## [8] 9.052311 9.217095 9.381879 9.546662 9.711446 9.876230 10.041014
## [15] 10.205798 10.370581 10.535365 10.700149 10.864933 11.029717 11.194501
## [22] 11.359284 11.524068 11.688852 11.853636 12.018420 12.183204 12.347987
## [29] 12.512771 12.677555
# writing our own function:
# takes a single argument x
# and return the sequence of breaks
find_breaks <- function(x) {
rng <- range(x, na.rm = TRUE)
seq(rng[1], rng[2], length.out = 30)
}
# Check that your function works
# by calling find_breaks() on sims[[1]].
find_breaks(sims[[1]])
## [1] 7.898824 8.063608 8.228392 8.393176 8.557959 8.722743 8.887527
## [8] 9.052311 9.217095 9.381879 9.546662 9.711446 9.876230 10.041014
## [15] 10.205798 10.370581 10.535365 10.700149 10.864933 11.029717 11.194501
## [22] 11.359284 11.524068 11.688852 11.853636 12.018420 12.183204 12.347987
## [29] 12.512771 12.677555
# additional checks
find_breaks(sims[[2]])
## [1] 0.01299713 0.18460169 0.35620625 0.52781081 0.69941537 0.87101993
## [7] 1.04262449 1.21422905 1.38583361 1.55743817 1.72904273 1.90064729
## [13] 2.07225185 2.24385641 2.41546097 2.58706553 2.75867010 2.93027466
## [19] 3.10187922 3.27348378 3.44508834 3.61669290 3.78829746 3.95990202
## [25] 4.13150658 4.30311114 4.47471570 4.64632026 4.81792482 4.98952938
find_breaks(sims[[3]])
## [1] 0.001431461 0.038830039 0.076228617 0.113627195 0.151025773
## [6] 0.188424351 0.225822929 0.263221507 0.300620085 0.338018663
## [11] 0.375417241 0.412815819 0.450214397 0.487612975 0.525011553
## [16] 0.562410131 0.599808709 0.637207287 0.674605865 0.712004444
## [21] 0.749403022 0.786801600 0.824200178 0.861598756 0.898997334
## [26] 0.936395912 0.973794490 1.011193068 1.048591646 1.085990224
# Use map() to iterate find_breaks() over sims
nice_breaks <- map(sims, find_breaks)
# Use nice_breaks as the second argument to walk2()
# iterate over both
# the simulations
# and calculated breaks to plot histograms
walk2(sims, nice_breaks, hist)
# walk2(sims, nice_breaks, hist)
walk2(sims, nice_breaks, hist, xlab = "")
# Increase sample size to 1000
sims <- invoke_map(f, params, n = 50)
sims <- invoke_map(f, params, n = 1000)
# nice_breaks [given]
nice_breaks <- map(sims, find_breaks)
# Create a vector nice_titles
nice_titles <- c("Normal(10, 1)", "Uniform(0, 5)", "Exp(5)")
# Use pwalk() instead of walk2()
# walk2(sims, nice_breaks, hist, xlab = "")
# NB: keep xlab = ""
# outside of the list of arguments
# being iterated over
# since it's the same value
# for all three histograms
pwalk(list(x = sims, breaks = nice_breaks, main = nice_titles), hist, xlab = "")
walk() return the object you passed to it easy use in pipeline (pipeline = “a statement with lots of pipes”)
# e.g.
str(sims)
## List of 3
## $ Normal : num [1:1000] 10.48 10.01 9.76 10.7 10.6 ...
## $ Uniform: num [1:1000] 3.027 0.441 4.359 0.908 1.56 ...
## $ Exp : num [1:1000] 0.0105 0.2805 0.1745 0.0419 0.1122 ...
tmp <- walk(sims, hist)
str(tmp)
## List of 3
## $ Normal : num [1:1000] 10.48 10.01 9.76 10.7 10.6 ...
## $ Uniform: num [1:1000] 3.027 0.441 4.359 0.908 1.56 ...
## $ Exp : num [1:1000] 0.0105 0.2805 0.1745 0.0419 0.1122 ...
# can pipe the sims object along to other functions
# e.g. want some basic summary statistics on each sample
# Pipe this along to map(), using summary() as .f
sims %>%
walk(hist) %>%
map(summary)
## $Normal
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.886 9.304 10.007 10.013 10.661 13.261
##
## $Uniform
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.006835 1.245027 2.437147 2.480203 3.703678 4.994127
##
## $Exp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0002914 0.0604708 0.1430724 0.2045928 0.2845496 1.5235408
# from chapter 2:
# finds the number of entries where vectors x and y both have missing values
both_na <- function(x, y) {
sum(is.na(x) & is.na(y))
}
# Define troublesome x and y
x <- c(NA, NA, NA)
y <- c( 1, NA, NA, NA)
# function works and returns 3
both_na(x, y)
## Warning in is.na(x) & is.na(y): longer object length is not a multiple of
## shorter object length
## [1] 3
# NOT COOL to pass in different length arguments
both_na <- function(x, y) {
stopifnot( length(x) == length(y) )
sum(is.na(x) & is.na(y))
}
# # function works and returns 3
both_na(x, y)
## Error in both_na(x, y): length(x) == length(y) is not TRUE
both_na <- function(x, y) {
# Replace condition with logical
# if (condition) {
if ( length(x) != length(y) ) {
# Replace "Error" with better message
# stop("Error", call. = FALSE)
stop("x and y must have the same length", call. = FALSE)
}
sum(is.na(x) & is.na(y))
}
# Call both_na()
# verify it returns a more informative error
both_na(x, y)
## Error: x and y must have the same length
i.e. when you run a function that alters the state of your R session
e.g.
show_missings <- function(x) {
n <- sum(is.na(x))
cat("Missing values: ", n, "\n", sep = "")
x
}
# prints output to the console
plot_missings <- function(x) {
plot(seq_along(x), is.na(x))
x
}
# creates a plot
exclude_missings <- function() {
options(na.action = "na.exclude")
}
# changes a global option
Pure if:
e.g.
replace_missings <- function(x, replacement) {
x[is.na(x)] <- replacement
x
}
functions with side effects are crucial for data analysis
You need to be aware of them, and deliberate in their usage.
Type-inconsistent: the type of the return object depends on the input
Surprises occur when you’ve used a type-inconsistent function inside your own function
Source of surprises, type-inconsistent f:
in your functions: PoA: use type consistent functions instead (e.g. Purr funcitons) or use tests to ensure type
1.1745604, 0.2802373, 0.1575448, 0.3131434 returns first row:
df <- data.frame(z = 1:3, y = 2:4)
str(df[1, ]) # df
## 'data.frame': 1 obs. of 2 variables:
## $ z: int 1
## $ y: int 2
df <- data.frame(z = 1:3)
str(df[1, ]) # vector
## int 1
last_row <- function(df) {
df[nrow(df), ]
}
df <- data.frame(x = 1:3) # one col df
# Not a row, just a vector
str(last_row(df))
## int 3
use drop = FALSE: df[x, , drop = FALSE]
last_row <- function(df) {
df[nrow(df), , drop = FALSE]
}
df <- data.frame(x = 1:3) # one col df
str(last_row(df)) # now a df
## 'data.frame': 1 obs. of 1 variable:
## $ x: int 3
Subset the data frame like a list: df[x]
The type of output returned from sapply() depends on the type of input
df <- data.frame(
a = 1L,
b = 1.5,
y = Sys.time(),
z = ordered(1)
)
A <- sapply(df[1:4], class)
B <- sapply(df[3:4], class)
What type of objects will be A and B be?
# A will be a list
A
## $a
## [1] "integer"
##
## $b
## [1] "numeric"
##
## $y
## [1] "POSIXct" "POSIXt"
##
## $z
## [1] "ordered" "factor"
str(A)
## List of 4
## $ a: chr "integer"
## $ b: chr "numeric"
## $ y: chr [1:2] "POSIXct" "POSIXt"
## $ z: chr [1:2] "ordered" "factor"
# B will be a character matrix.
B
## y z
## [1,] "POSIXct" "ordered"
## [2,] "POSIXt" "factor"
str(B)
## chr [1:2, 1:2] "POSIXct" "POSIXt" "ordered" "factor"
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "y" "z"
unpredictable behaviour => shouldn’t rely on sapply() inside your own functions
call class() on the columns of df
=> expect: character output
=> map_chr()
df <- data.frame(
a = 1L,
b = 1.5,
y = Sys.time(),
z = ordered(1)
)
A <- map_chr(df[1:4], class)
## Error: Result 3 is not a length 1 atomic vector
B <- map_chr(df[3:4], class)
## Error: Result 1 is not a length 1 atomic vector
ERROR alerts us that our assumption is wrong (that class() would return purely character output)
# sapply calls
A <- sapply(df[1:4], class)
B <- sapply(df[3:4], class)
C <- sapply(df[1:2], class)
# Demonstrate type inconsistency
str(A)
## List of 4
## $ a: chr "integer"
## $ b: chr "numeric"
## $ y: chr [1:2] "POSIXct" "POSIXt"
## $ z: chr [1:2] "ordered" "factor"
str(B)
## chr [1:2, 1:2] "POSIXct" "POSIXt" "ordered" "factor"
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "y" "z"
str(C)
## Named chr [1:2] "integer" "numeric"
## - attr(*, "names")= chr [1:2] "a" "b"
# Use map() to define X, Y and Z
X <- map(df[1:4], class)
Y <- map(df[3:4], class)
Z <- map(df[1:2], class)
# Use str() to check type consistency
str(X)
## List of 4
## $ a: chr "integer"
## $ b: chr "numeric"
## $ y: chr [1:2] "POSIXct" "POSIXt"
## $ z: chr [1:2] "ordered" "factor"
str(Y)
## List of 2
## $ y: chr [1:2] "POSIXct" "POSIXt"
## $ z: chr [1:2] "ordered" "factor"
str(Z)
## List of 2
## $ a: chr "integer"
## $ b: chr "numeric"
# always return a list
# becuase used map() a type consistent function
col_classes <- function(df) {
map(df, class)
}
# Want this function to return a character string
col_classes <- function(df) {
# Assign list output to class_list
class_list <- map(df, class)
# Use map_chr()
# along with the numeric subsetting shortcut
# to extract first element in class_list
map_chr(class_list, 1)
}
# Check that our new function is:
# type consistent
# always returns a character vector.
df %>% col_classes() %>% str()
## Named chr [1:4] "integer" "numeric" "POSIXct" "ordered"
## - attr(*, "names")= chr [1:4] "a" "b" "y" "z"
df[3:4] %>% col_classes() %>% str()
## Named chr [1:2] "POSIXct" "ordered"
## - attr(*, "names")= chr [1:2] "y" "z"
df[1:2] %>% col_classes() %>% str()
## Named chr [1:2] "integer" "numeric"
## - attr(*, "names")= chr [1:2] "a" "b"
col_classes <- function(df) {
map_chr(df, class)
}
df %>% col_classes() %>% str()
## Error: Result 3 is not a length 1 atomic vector
class_list <- map(df, class)
class_list
## $a
## [1] "integer"
##
## $b
## [1] "numeric"
##
## $y
## [1] "POSIXct" "POSIXt"
##
## $z
## [1] "ordered" "factor"
# vector of lengths for the elements in class_list
map_dbl(class_list, length)
## a b y z
## 1 1 2 2
# one and only argument to any() is a conditional statement
any(map_dbl(class_list, length) > 1)
## [1] TRUE
col_classes <- function(df) {
class_list <- map(df, class)
# Add a check that no element of class_list has length > 1
if (any(map_dbl(class_list, length) > 1)) {
stop("Some columns have more than one class", call. = FALSE)
}
# Use flatten_chr() to return a character vector
flatten_chr(class_list)
}
# Check that our new function is type consistent
df %>% col_classes() %>% str()
## Error: Some columns have more than one class
df[3:4] %>% col_classes() %>% str()
## Error: Some columns have more than one class
df[1:2] %>% col_classes() %>% str()
## chr [1:2] "integer" "numeric"
e.g. filter() - from the dplyr package
# return all rows in df
# where the x column exceeds a certain threshold
big_x <- function(df, threshold) {
dplyr::filter(df, x > threshold)
}
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(diamonds)
diamonds_sub <- head(diamonds,20)
diamonds_sub <- diamonds[sample(1:length(diamonds$price), 20), ]
# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 2 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1.76 Ideal I SI2 61.2 55 9714 7.79 7.76 4.76
## 2 2.07 Ideal J VVS2 62.7 54 16617 8.12 8.17 5.11
unexpected outputs & NO indication (i.e. error message)
# Remove the x column from diamonds
diamonds_sub$x <- NULL
# Create variable x with value 1
x <- 1
# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 0 x 9
## # ... with 9 variables: carat <dbl>, cut <ord>, color <ord>,
## # clarity <ord>, depth <dbl>, table <dbl>, price <int>, y <dbl>, z <dbl>
# Create a threshold column with value 100
diamonds_sub$threshold <- 100
# Use big_x() to find rows in diamonds_sub where x > 7
big_x(diamonds_sub, 7)
## # A tibble: 0 x 10
## # ... with 10 variables: carat <dbl>, cut <ord>, color <ord>,
## # clarity <ord>, depth <dbl>, table <dbl>, price <int>, y <dbl>,
## # z <dbl>, threshold <dbl>
Hadley’s vignette: http://rpubs.com/hadley/157957
Provide protection against the problem cases
big_x <- function(df, threshold) {
# Write a check for x not being in df
# If x is not in names(df),
# if ( x !in names(df) ) { - INCORRECT
if (!"x" %in% names(df)) {
stop("df must contain variable called x", call. = FALSE)
}
# Write a check for threshold being in df
# If threshold is in names(df),
# if ( threshold in names(df) ) {
if ("threshold" %in% names(df)) {
stop("df must not contain variable called threshold", call. = FALSE)
}
dplyr::filter(df, x > threshold)
big_x(diamonds_sub, 7)
}