intermediate - 3 - apply family

# lapply(X, FUN, ...)
# takes a vector or list X
# applies the function FUN to each of its members

# If "FUN" requires additional arguments, 
# you pass them after you've specified "X" and "FUN (...)"
# output = list
#          same length as X, 
#          where each element = result of 
#                               applying FUN 
#                               on the corresponding element of X.

# The vector pioneers has already been created for you
pioneers <- c("GAUSS:1777", "BAYES:1702", "PASCAL:1623", "PEARSON:1857")

# Split names from birth year
split_math <- strsplit(pioneers, split = ":")

# Convert to lowercase strings: split_low
split_low <- lapply(split_math, tolower)

# Take a look at the structure of split_low
str(split_low)
## List of 4
##  $ : chr [1:2] "gauss" "1777"
##  $ : chr [1:2] "bayes" "1702"
##  $ : chr [1:2] "pascal" "1623"
##  $ : chr [1:2] "pearson" "1857"


# Write function select_first()
select_first <- function(x) {
  x[1]
}

# Apply select_first() over split_low: names
names <- lapply(split_low, select_first)

# Write function select_second()
select_second <- function(x) {
  x[2]
}

# Apply select_second() over split_low: years
years <- lapply(split_low, select_second)

names
## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"
years
## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"


# defining functions to use them only once = overkill

# lapply and anonymous functions

# Named function
triple <- function(x) { 3 * x }

# Anonymous function with same implementation
function(x) { 3 * x }
## function(x) { 3 * x }

# Use anonymous function inside lapply()
lapply(list(1,2,3), function(x) { 3 * x })
## [[1]]
## [1] 3
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 9

# Transform: use anonymous function inside lapply
names <- lapply(split_low, function(k) {k[1]})

# Transform: use anonymous function inside lapply
years <- lapply(split_low, function(k) {k[2]})

names
## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"
years
## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"

# lapply with additional arguments

multiply <- function(x, factor) {
  x * factor
}
lapply(list(1,2,3), multiply, factor = 3)
## [[1]]
## [1] 3
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 9

# Generic select function
select_el <- function(x, index) {
  x[index]
}

# Use lapply() twice on split_low: names and years
names <- lapply(split_low,select_el,1)
years <- lapply(split_low,select_el,2)

names
## [[1]]
## [1] "gauss"
## 
## [[2]]
## [1] "bayes"
## 
## [[3]]
## [1] "pascal"
## 
## [[4]]
## [1] "pearson"
years
## [[1]]
## [1] "1777"
## 
## [[2]]
## [1] "1702"
## 
## [[3]]
## [1] "1623"
## 
## [[4]]
## [1] "1857"



# sapply(X, FUN, ...)
# simplifies the list that lapply() would return 
# by turning it into a vector

temp <- list(c(3, 7, 9, 6, -1), c(6, 9, 12, 13, 5), c(4, 8, 3, -1, -3), c(1, 4, 7, 2, -2), 
             c(5, 7, 9, 4, 2), c(-3, 5, 8, 9, 4), c(3, 6, 9, 4, 1))

# Use lapply() to find each day's minimum temperature
lapply(temp, min)
## [[1]]
## [1] -1
## 
## [[2]]
## [1] 5
## 
## [[3]]
## [1] -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## [1] 2
## 
## [[6]]
## [1] -3
## 
## [[7]]
## [1] 1

# Use sapply() to find each day's minimum temperature
sapply(temp, min)
## [1] -1  5 -3 -2  2 -3  1

# Use lapply() to find each day's maximum temperature
lapply(temp, max)
## [[1]]
## [1] 9
## 
## [[2]]
## [1] 13
## 
## [[3]]
## [1] 8
## 
## [[4]]
## [1] 7
## 
## [[5]]
## [1] 9
## 
## [[6]]
## [1] 9
## 
## [[7]]
## [1] 9

# Use sapply() to find each day's maximum temperature
sapply(temp, max)
## [1]  9 13  8  7  9  9  9

# self-written function
extremes_avg <- function(x) {
  ( min(x) + max(x) ) / 2
}

# Apply extremes_avg() over temp using sapply()
sapply(temp, extremes_avg)
## [1] 4.0 9.0 2.5 2.5 5.5 3.0 5.0

# Apply extremes_avg() over temp using lapply()
lapply(temp, extremes_avg)
## [[1]]
## [1] 4
## 
## [[2]]
## [1] 9
## 
## [[3]]
## [1] 2.5
## 
## [[4]]
## [1] 2.5
## 
## [[5]]
## [1] 5.5
## 
## [[6]]
## [1] 3
## 
## [[7]]
## [1] 5

# what if the function 
# you're applying over a list or a vector
# returns a vector of length greater than 1? 
# function that returns min and max of a vector: extremes
extremes <- function(x) {
  c(min = min(x), max = max(x))
}

# Apply extremes() over temp with sapply()
sapply(temp, extremes)
##     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## min   -1    5   -3   -2    2   -3    1
## max    9   13    8    7    9    9    9

# Apply extremes() over temp with lapply()
lapply(temp, extremes)
## [[1]]
## min max 
##  -1   9 
## 
## [[2]]
## min max 
##   5  13 
## 
## [[3]]
## min max 
##  -3   8 
## 
## [[4]]
## min max 
##  -2   7 
## 
## [[5]]
## min max 
##   2   9 
## 
## [[6]]
## min max 
##  -3   9 
## 
## [[7]]
## min max 
##   1   9

# sapply can't simplify, now what?

# Definition of below_zero()
below_zero <- function(x) {
  return(x[x < 0])
}

# Apply below_zero over temp using sapply(): freezing_s
freezing_s <- sapply(temp, below_zero)
freezing_s
## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## numeric(0)
## 
## [[6]]
## [1] -3
## 
## [[7]]
## numeric(0)
# Apply below_zero over temp using lapply(): freezing_l
freezing_l <- lapply(temp, below_zero)
freezing_l
## [[1]]
## [1] -1
## 
## [[2]]
## numeric(0)
## 
## [[3]]
## [1] -1 -3
## 
## [[4]]
## [1] -2
## 
## [[5]]
## numeric(0)
## 
## [[6]]
## [1] -3
## 
## [[7]]
## numeric(0)
# Are freezing_s and freezing_l identical?
# freezing_s == freezing_l
identical(freezing_s,freezing_l)
## [1] TRUE

# sapply with functions that return NULL

# Definition of print_info()
print_info <- function(x) {
  cat("The average temperature is", mean(x), "\n")
}

# Apply print_info() over temp using sapply()
sapply(temp, print_info)
## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL

# Apply print_info() over temp using lapply()
lapply(temp, print_info)
## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL

identical(sapply(temp, print_info),lapply(temp, print_info))
## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6 
## The average temperature is 4.8 
## The average temperature is 9 
## The average temperature is 2.2 
## The average temperature is 2.4 
## The average temperature is 5.4 
## The average temperature is 4.6 
## The average temperature is 4.6
## [1] TRUE
# 'vector-version' of a list of NULL's 
# would simply be a NULL, 
# which is no longer a vector 
# with the same length as the input.


# vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)

# FUN.VALUE argument expects a template 
# for the return argument of FUN
# USE.NAMES => generate a named array (if possible)

# i.e. restrict the output of the function
# i.e. more robust version of sapply()

# Definition of basics()
# takes a vector, and returns a named vector of length 3
basics <- function(x) {
  c(min = min(x), mean = mean(x), max = max(x))
}

# Apply basics() over temp using vapply()
### vapply(temp, basics, numeric(3))

# structure of the output of FUN
# does not correspond to the template you specify in FUN.VALUE
# vapply() will throw an error 
#                        misalignment between expected and actual output

basics <- function(x) {
  c(min = min(x), mean = mean(x), median = median(x), max = max(x))
}

# error, because FUN(X[[1]]) result is length 4
### vapply(temp, basics, numeric(3)) # => values must be length 3

# fixed
vapply(temp, basics, numeric(4))
##        [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## min    -1.0    5 -3.0 -2.0  2.0 -3.0  1.0
## mean    4.8    9  2.2  2.4  5.4  4.6  4.6
## median  6.0    9  3.0  2.0  5.0  5.0  4.0
## max     9.0   13  8.0  7.0  9.0  9.0  9.0


# add robustness

# Convert from sapply to vapply() expression
sapply(temp, max)
## [1]  9 13  8  7  9  9  9
vapply(temp, max, numeric(1))
## [1]  9 13  8  7  9  9  9

# Convert from sapply to vapply() expression
sapply(temp, function(x, y) { mean(x) > y }, y = 5)
## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
vapply(temp, function(x, y) { mean(x) > y }, y = 5, logical(1))
## [1] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE