This session is for practising the apply family of functions.

There are several functions in the apply family, but we will focus on the four main ones: apply, lapply, sapply and tapply.

Before we start using the apply family, we will create a dataset (a matrix called mt).

# Fix the random seed so the simulated values are reproducible
set.seed(122)

# 10 x 10 matrix of standard-normal draws, filled row by row.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
mt <- matrix(rnorm(100), ncol = 10, nrow = 10, byrow = TRUE)

# Preview the first six rows
head(mt)
##            [,1]       [,2]       [,3]       [,4]       [,5]        [,6]
## [1,]  1.3107009 -0.8758531  0.1995237  0.4659544 -1.8020568  1.44874346
## [2,] -0.1321288 -0.8416347  0.2742522 -2.4625904 -2.1533960  1.15917575
## [3,] -0.5378254 -0.5226487 -1.0529219  0.1015172  1.8635011 -0.01960909
## [4,]  1.4113547  0.6013653 -0.9139976  1.5804259 -0.4699156  1.60183889
## [5,] -0.6580097 -1.6085024 -1.3440913 -0.1966739  1.3756627  0.17331500
## [6,]  2.5567545 -0.3615805 -0.4385265 -1.0592134  1.0109921  0.05498478
##            [,7]        [,8]       [,9]      [,10]
## [1,]  0.2988538  0.36181165 -1.0069404 -0.2569991
## [2,]  0.6897401  0.88025132 -0.9433702  0.7190729
## [3,] -1.0783075  0.17260428  0.5330541  1.9522645
## [4,]  0.3727399  0.03387786  0.1041034 -2.0183743
## [5,] -0.4392776 -1.59532290 -0.7442776  1.8152081
## [6,]  0.4523293 -0.17415183  0.8398331  0.8454829

We should give the matrix column and row names, shouldn't we?

# Label matrix columns as Col_1, Col_2, ...
# seq_len() is used instead of 1:ncol(mt) so a zero-column matrix yields no
# labels rather than the sequence c(1, 0)

colnames(mt) <- paste("Col", seq_len(ncol(mt)), sep = "_")

# Label matrix rows as Row_1, Row_2, ...

rownames(mt) <- paste("Row", seq_len(nrow(mt)), sep = "_")

# Check the newly labeled matrix

head(mt)
##            Col_1      Col_2      Col_3      Col_4      Col_5       Col_6
## Row_1  1.3107009 -0.8758531  0.1995237  0.4659544 -1.8020568  1.44874346
## Row_2 -0.1321288 -0.8416347  0.2742522 -2.4625904 -2.1533960  1.15917575
## Row_3 -0.5378254 -0.5226487 -1.0529219  0.1015172  1.8635011 -0.01960909
## Row_4  1.4113547  0.6013653 -0.9139976  1.5804259 -0.4699156  1.60183889
## Row_5 -0.6580097 -1.6085024 -1.3440913 -0.1966739  1.3756627  0.17331500
## Row_6  2.5567545 -0.3615805 -0.4385265 -1.0592134  1.0109921  0.05498478
##            Col_7       Col_8      Col_9     Col_10
## Row_1  0.2988538  0.36181165 -1.0069404 -0.2569991
## Row_2  0.6897401  0.88025132 -0.9433702  0.7190729
## Row_3 -1.0783075  0.17260428  0.5330541  1.9522645
## Row_4  0.3727399  0.03387786  0.1041034 -2.0183743
## Row_5 -0.4392776 -1.59532290 -0.7442776  1.8152081
## Row_6  0.4523293 -0.17415183  0.8398331  0.8454829

We will start with the first function, apply.

# apply() takes three main arguments: X (the matrix, here mt), MARGIN (1 for rows, 2 for columns) and FUN (the function to apply)

# Mean of every column (MARGIN = 2 applies FUN column by column)
apply(mt, MARGIN = 2, FUN = mean)
##       Col_1       Col_2       Col_3       Col_4       Col_5       Col_6 
##  0.07389782 -0.28224671 -0.22575555 -0.35505915  0.10132837  0.50260884 
##       Col_7       Col_8       Col_9      Col_10 
##  0.09821195  0.05646223 -0.19066573  0.34130073
# Standard deviation of every column

apply(mt, MARGIN = 2, FUN = sd)
##     Col_1     Col_2     Col_3     Col_4     Col_5     Col_6     Col_7 
## 1.2488349 0.8685868 0.7180313 1.4720801 1.3703386 0.8003884 0.9481447 
##     Col_8     Col_9    Col_10 
## 0.8103711 0.9292266 1.5184065
# Median of every column

apply(mt, MARGIN = 2, FUN = median)
##       Col_1       Col_2       Col_3       Col_4       Col_5       Col_6 
## -0.36679628 -0.44211458 -0.14176502 -0.06962635  0.45663236  0.20510032 
##       Col_7       Col_8       Col_9      Col_10 
##  0.37193682  0.12832925 -0.19719723  0.57398151
# Smallest value in every column

apply(mt, MARGIN = 2, FUN = min)
##      Col_1      Col_2      Col_3      Col_4      Col_5      Col_6 
## -1.1521364 -1.6085024 -1.3440913 -2.5177170 -2.1533960 -0.8038404 
##      Col_7      Col_8      Col_9     Col_10 
## -1.8190443 -1.5953229 -1.7766103 -2.0183743
# Largest value in every column
apply(mt, MARGIN = 2, FUN = max)
##     Col_1     Col_2     Col_3     Col_4     Col_5     Col_6     Col_7 
## 2.5567545 1.4023026 0.6753913 1.7159783 1.8635011 1.6018389 1.3827442 
##     Col_8     Col_9    Col_10 
## 1.2739718 0.8765213 2.3340020

The other apply family functions (lapply, sapply) work directly on lists and data frames, so we don’t need to convert the data to a matrix first.

–lapply function: This function returns its result as a list

library(ggplot2)

# Mean of each column of trees, returned as a named list
lapply(X = trees, FUN = mean)
## $Girth
## [1] 13.24839
## 
## $Height
## [1] 76
## 
## $Volume
## [1] 30.17097

Define a function

# Ratio of the mean of a numeric vector to its minimum value

my_func <- function(x) {
  avg <- mean(x)
  smallest <- min(x)
  avg / smallest
}

# Apply the ratio to every column of trees; the result is a named list
lapply(X = trees, FUN = my_func)
## $Girth
## [1] 1.596191
## 
## $Height
## [1] 1.206349
## 
## $Volume
## [1] 2.957938
# Column means with sapply: same values as lapply, simplified to a named vector

sapply(X = trees, FUN = mean)
##    Girth   Height   Volume 
## 13.24839 76.00000 30.17097
# NOTE(review): no object named `df` is created in the visible part of this
# document, so `df` here appears to resolve to the F-distribution density
# function stats::df(). sapply() then walks over that function's arguments
# (x, df1, df2, ncp, log) and its body, which explains the warnings and the
# mostly-NA result below. This was presumably meant to be a data frame, e.g.
# trees -- confirm the intended input.
sapply(df,mean)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
##   x df1 df2 ncp log     
##  NA  NA  NA  NA   0  NA
# Standard deviation of each column of trees as a named vector
sapply(X = trees, FUN = sd)
##     Girth    Height    Volume 
##  3.138139  6.371813 16.437846

Define an anonymous function to calculate the mean of every column of trees

# Hand-written mean (sum divided by count) passed as an anonymous function
sapply(X = trees, FUN = function(column) sum(column) / length(column))
##    Girth   Height   Volume 
## 13.24839 76.00000 30.17097

If the dataset is stored in a list, use lapply to get a summary of each element

# Wrap the trees data frame in a one-element list
mylist <- list(trees)

# summary() is called once per list element, i.e. once on the whole data frame
lapply(X = mylist, FUN = summary)
## [[1]]
##      Girth           Height       Volume     
##  Min.   : 8.30   Min.   :63   Min.   :10.20  
##  1st Qu.:11.05   1st Qu.:72   1st Qu.:19.40  
##  Median :12.90   Median :76   Median :24.20  
##  Mean   :13.25   Mean   :76   Mean   :30.17  
##  3rd Qu.:15.25   3rd Qu.:80   3rd Qu.:37.30  
##  Max.   :20.60   Max.   :87   Max.   :77.00

If you want to multiply every value in your dataset by 3, you can use sapply to do that (note that sapply returns a matrix here, not a data frame)

# Copy of the built-in trees data set
mytrees <- trees

# Triple every value; sapply() simplifies the per-column results to a matrix
my_tree <- sapply(X = mytrees, FUN = function(x) x * 3)

# Preview the first six rows of the result
head(my_tree)
##      Girth Height Volume
## [1,]  24.9    210   30.9
## [2,]  25.8    195   30.9
## [3,]  26.4    189   30.6
## [4,]  31.5    216   49.2
## [5,]  32.1    243   56.4
## [6,]  32.4    249   59.1

Some other interesting features of sapply and lapply

# First six values of every column, simplified to a matrix
sapply(X = trees, FUN = head)
##      Girth Height Volume
## [1,]   8.3     70   10.3
## [2,]   8.6     65   10.3
## [3,]   8.8     63   10.2
## [4,]  10.5     72   16.4
## [5,]  10.7     81   18.8
## [6,]  10.8     83   19.7
# colnames() is called on each column separately; the result below is NULL
# for every column
sapply(X = trees, FUN = colnames)
## $Girth
## NULL
## 
## $Height
## NULL
## 
## $Volume
## NULL

tapply is used for calculating the mean, sd, median, etc. for each group

library(caret)
## Loading required package: lattice
# Mean sepal length within each species (INDEX defines the groups)

tapply(X = iris$Sepal.Length, INDEX = iris$Species, FUN = mean)
##     setosa versicolor  virginica 
##      5.006      5.936      6.588