Functions covered:
References
Before explaining the apply family functions, here are four related functions that will save you lots of time and will help you understand the apply family functions. They are rowSums, colSums, rowMeans, and colMeans.
dat <- matrix(rnorm(n = 15, mean = 1000, sd = 300), nrow = 3, ncol = 5) #create a 3 by 5 matrix
dat
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1063.3860 828.3269 1389.4636 890.3890 1155.828
## [2,] 855.7374 956.7276 871.5523 350.2771 1042.304
## [3,] 896.1662 1280.5432 830.1753 1172.5095 1400.434
Sum of each row:
rowSums(dat) #3 rows
## [1] 5327.393 4076.598 5579.828
Sum of each column:
colSums(dat) #5 columns
## [1] 2815.290 3065.598 3091.191 2413.176 3598.566
Mean of each row:
rowMeans(dat) #3 rows
## [1] 1065.4786 815.3196 1115.9657
Mean of each column:
colMeans(dat) #5 columns
## [1] 938.4299 1021.8659 1030.3971 804.3919 1199.5219
apply
returns a vector
dat <- matrix(rnorm(n = 15, mean = 100, sd = 30), nrow = 3, ncol = 5) #create a 3 by 5 matrix
dat
## [,1] [,2] [,3] [,4] [,5]
## [1,] 164.1750 86.9395 106.95664 107.49572 117.00086
## [2,] 106.6604 100.4944 39.89724 86.59982 20.47244
## [3,] 124.9766 137.9387 120.85706 141.07982 88.77707
Applying function to each row (row = margin 1):
apply(dat, 1, mean) #find mean of each row in dat
## [1] 116.51353 70.82487 122.72586
rowMeans(dat) #same output as above
## [1] 116.51353 70.82487 122.72586
Applying function to each column (column = margin 2):
apply(dat, 2, mean) #find mean of each column in dat
## [1] 131.93735 108.45753 89.23698 111.72512 75.41679
colMeans(dat) #same output as above
## [1] 131.93735 108.45753 89.23698 111.72512 75.41679
#create character vector with city names
cities <- c("New York", "London", "Cape Town")
#for each element in X, apply function nchar() to count number of characters
lapply(X = cities, FUN = nchar)
## [[1]]
## [1] 8
##
## [[2]]
## [1] 6
##
## [[3]]
## [1] 9
To return the values as a vector, use unlist:
unlist(lapply(X = cities, FUN = nchar))
## [1] 8 6 9
lapply
is similar to the verbose for loop:
#Verbose for-loop equivalent of lapply(cities, nchar), collected into a vector.
#Preallocate the result (nchar returns integers) instead of growing it from c().
numChars <- integer(length(cities))
#seq_along() yields an empty sequence when cities is empty, whereas
#1:length(cities) would iterate over c(1, 0) and produce a spurious value.
for (i in seq_along(cities)) {
  numChars[i] <- nchar(cities[i]) #number of characters in the i-th city name
}
numChars
## [1] 8 6 9
num <- list(a = c(1:3), b = c(2:4), c = c(3:5))
num
## $a
## [1] 1 2 3
##
## $b
## [1] 2 3 4
##
## $c
## [1] 3 4 5
#for each of the three elements in the list, calculate the sum
lapply(num, sum)
## $a
## [1] 6
##
## $b
## [1] 9
##
## $c
## [1] 12
#for each element in the list, select the third element
lapply(num, function(z) {z[3]})
## $a
## [1] 3
##
## $b
## [1] 4
##
## $c
## [1] 5
#unlist output
unlist(lapply(num, function(z) {z[3]}))
## a b c
## 3 4 5
#a list of temperature measurements for 5 days
temp <- list(
c(3, 7, 9, 6, -1), #temperatures for day 1
c(6, 9, 12, 13, 5), #temperatures for day 2
c(4, 8, 3, -1, -3), #temperatures for day 3
c(1, 4, 7, 2, -2), #temperatures for day 4
c(5, 7, 9, 4, 2) #temperatures for day 5
)
sapply(temp, min) #find min for each day (returns a vector)
## [1] -1 5 -3 -2 2
Compare the output of sapply
with lapply
:
lapply(temp, min) #find min for each day (returns a list)
## [[1]]
## [1] -1
##
## [[2]]
## [1] 5
##
## [[3]]
## [1] -3
##
## [[4]]
## [1] -2
##
## [[5]]
## [1] 2
To get a vector when we use lapply, we need to use unlist:
unlist(lapply(temp, min)) #sapply returns exactly the same output!
## [1] -1 5 -3 -2 2
What if the function supplied to sapply
returns more than one value?
#Return the smallest and largest value of x as a named vector.
#  x: a vector accepted by min() and max()
#  returns: a length-2 vector with elements named "min" and "max"
minAndMax <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  c(min = lowest, max = highest)
}
sapply(temp, minAndMax) #see how sapply returns a nicely formatted matrix
## [,1] [,2] [,3] [,4] [,5]
## min -1 5 -3 -2 2
## max 9 13 8 7 9
Compare sapply
with lapply
:
lapply(temp, minAndMax)
## [[1]]
## min max
## -1 9
##
## [[2]]
## min max
## 5 13
##
## [[3]]
## min max
## -3 8
##
## [[4]]
## min max
## -2 7
##
## [[5]]
## min max
## 2 9
Even if you use unlist
on the output from lapply
, it won’t be nice:
unlist(lapply(temp, minAndMax))
## min max min max min max min max min max
## -1 9 5 13 -3 8 -2 7 2 9
What happens when the function supplied to sapply returns outputs of different lengths?
#Keep only the elements of x that are below 0.
#  x: a numeric vector
#  returns: the elements of x for which x < 0 (length 0 when there are none)
belowZero <- function(x) {
  isNegative <- x < 0 #logical mask marking the elements to keep
  x[isNegative]
}
sapply
returns a list:
sapply(temp, belowZero)
## [[1]]
## [1] -1
##
## [[2]]
## numeric(0)
##
## [[3]]
## [1] -1 -3
##
## [[4]]
## [1] -2
##
## [[5]]
## numeric(0)
which is the same as the output from lapply
identical(sapply(temp, belowZero), lapply(temp, belowZero))
## [1] TRUE
lapply(temp, belowZero) #exactly the same as sapply
## [[1]]
## [1] -1
##
## [[2]]
## numeric(0)
##
## [[3]]
## [1] -1 -3
##
## [[4]]
## [1] -2
##
## [[5]]
## numeric(0)
sapply
in terms of functionality
cities <- c("New York", "London", "Cape Town")
vapply(cities, nchar, numeric(1)) #tells vapply we're expecting 1 number for each element
## New York London Cape Town
## 8 6 9
If you specify numeric(2) instead, R will return an error message, because nchar returns only one value for each element.
#named list holding two integer sequences
dat <- list(a = 1:10, b = 11:20)
#returns minimum and maximum of each list element as one column per element;
#FUN.VALUE is the template each result must match (two numbers)
vapply(
  X = dat,
  FUN = function(x) c(min = min(x), max = max(x)),
  FUN.VALUE = c(min = 0, max = 0)
)
## a b
## min 1 11
## max 10 20
sapply
mapply
) to each element
mapply(sum, 1:5, 1:5, 1:5)
## [1] 3 6 9 12 15
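What mapply(sum, 1:5, 1:5, 1:5) does is apply a function to the first elements of each vector, then the second elements of each, then the third elements of each, and so on. So mapply(sum, 1:5, 1:5, 1:5) is equivalent to c(sum(1, 1, 1), sum(2, 2, 2), sum(3, 3, 3), sum(4, 4, 4), sum(5, 5, 5)).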
Another example:
mapply(rep, 1:4, 4:1)
## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
Repeat 1 four times, 2 three times, 3 two times, and 4 one time. mapply(rep, 1:4, 4:1)
is equivalent to:
list(rep(1, 4), rep(2, 3), rep(3, 2), rep(4, 1))
## [[1]]
## [1] 1 1 1 1
##
## [[2]]
## [1] 2 2 2
##
## [[3]]
## [1] 3 3
##
## [[4]]
## [1] 4
rapply
because recursive
dat <- list(a = list("A", "B", "C"), b = c(1, 100), c = list("Hey"))
dat
## $a
## $a[[1]]
## [1] "A"
##
## $a[[2]]
## [1] "B"
##
## $a[[3]]
## [1] "C"
##
##
## $b
## [1] 1 100
##
## $c
## $c[[1]]
## [1] "Hey"
#custom function to supply to rapply
#  x: a character or numeric vector
#  returns: x with "!" appended when x is character, otherwise x + 1000
myFunction <- function(x) {
  #anything that is not a character gets 1000 added to it
  if (!is.character(x)) {
    return(x + 1000)
  }
  #character input gets an exclamation mark appended
  paste0(x, "!")
}
rapply(dat, myFunction)
## a1 a2 a3 b1 b2 c
## "A!" "B!" "C!" "1001" "1100" "Hey!"
x <- c(1, 2, 3, 10, 20, 30, 100, 200, 300) #create vector
groups <- c("a", "a", "a", "b", "b", "b", "c", "c", "c") #create grouping variable (3 groups)
tapply(x, groups, mean)
## a b c
## 2 20 200
In essence, tapply
has done this:
c(a = mean(c(1, 2, 3)), #mean of group a
b = mean(c(10, 20, 30)), #mean of group b
c = mean(c(100, 200, 300)) #mean of group c
)
## a b c
## 2 20 200
Another example with the ChickWeight
dataset included in R:
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
table(ChickWeight$Diet) #four types of diets (1, 2, 3, 4)
##
## 1 2 3 4
## 220 120 120 118
Use tapply
to find mean weight for each type of diet:
tapply(ChickWeight$weight, ChickWeight$Diet, mean)
## 1 2 3 4
## 102.6455 122.6167 142.9500 135.2627