The tapply and by commands in R

This page is based on Mike Marin’s Statslectures video ‘tApply Function in R’. It is part of ‘My notes on R programming’ on my site: https://dataz4s.com/

The tapply and by commands apply a function to subsets of a vector or variable, where the subsets are defined by one or more grouping variables.

Reading in the data, checking it and attaching it

# Import the lung-capacity workbook with read_excel() from the readxl package
library(readxl)
# col_types: the first three columns are read as numeric, the last three as text
LungCapData <- read_excel(
  "C:/Users/Usuario/Documents/dataZ4s/R/MarinLectures/LungCapData.xlsx",
  col_types = c(rep("numeric", 3), rep("text", 3))
)

# Smoke, Gender and Caesarean were read in as "text" columns.
# Convert all three to factors in one assignment by running as.factor()
# over the selected columns.
LungCapData[c("Smoke", "Gender", "Caesarean")] <- lapply(
  LungCapData[c("Smoke", "Gender", "Caesarean")],
  as.factor
)


# Inspect every column with summary() before working with the data
summary(LungCapData)
##     LungCap            Age            Height      Smoke        Gender   
##  Min.   : 0.507   Min.   : 3.00   Min.   :45.30   no :648   female:358  
##  1st Qu.: 6.150   1st Qu.: 9.00   1st Qu.:59.90   yes: 77   male  :367  
##  Median : 8.000   Median :13.00   Median :65.40                         
##  Mean   : 7.863   Mean   :12.33   Mean   :64.84                         
##  3rd Qu.: 9.800   3rd Qu.:15.00   3rd Qu.:70.30                         
##  Max.   :14.675   Max.   :19.00   Max.   :81.80                         
##  Caesarean
##  no :561  
##  yes:164  
##           
##           
##           
## 
# Attach the dataset so its columns (Age, Smoke, Gender, ...) can be
# referred to directly by name in the examples that follow
attach(LungCapData)

Mean age of smokers/non-smokers

# Subsetting with [ ] gives the mean age of each group, one call per group
mean(Age[Smoke == "no"])
## [1] 12.03549
mean(Age[Smoke == "yes"])
## [1] 14.77922
# tapply() produces both group means in a single call:
#   X     - the variable the function is applied to (Age)
#   INDEX - the grouping variable used to split X (smoking status)
#   FUN   - the function applied to each group (mean)
# na.rm = TRUE is passed on to mean() so that NA values are removed.
# TRUE is written out in full because T is an ordinary variable that
# can be reassigned, whereas TRUE is a reserved word.
tapply(X = Age, INDEX = Smoke, FUN = mean, na.rm = TRUE)
##       no      yes 
## 12.03549 14.77922
# Save the group means in the object m
m <- tapply(X = Age, INDEX = Smoke, FUN = mean, na.rm = TRUE)

The "simplify" argument

# The 'simplify' argument is TRUE by default.
# With simplify = FALSE the same group means are returned in list format.
tapply(Age, Smoke, mean, simplify = FALSE)
## $no
## [1] 12.03549
## 
## $yes
## [1] 14.77922

“quantile” and “summary” functions

# Apply quantile() to Age within each Smoke group;
# probs asks for the 20% and 80% quantiles
tapply(Age, Smoke, quantile, probs = c(0.2, 0.8))
## $no
## 20% 80% 
##   8  16 
## 
## $yes
## 20% 80% 
##  12  17
# Apply summary() to Age for smokers and non-smokers
tapply(X = Age, INDEX = Smoke, FUN = summary)
## $no
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00    9.00   12.00   12.04   15.00   19.00 
## 
## $yes
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.00   13.00   15.00   14.78   17.00   19.00
# We can group by multiple variables/vectors by passing them as a list:
# mean Age for each combination of smoker/non-smoker and male/female.
# na.rm = TRUE (spelled out rather than the reassignable T) is passed to mean().
tapply(X = Age, INDEX = list(Smoke, Gender), FUN = mean, na.rm = TRUE)
##       female     male
## no  12.12739 11.94910
## yes 14.75000 14.81818

Subsetting based on multiple variables/vectors

# Mean age of smokers and non-smokers, broken down by gender.
# INDEX takes a list of grouping variables (Smoke and Gender); the result
# has one row per Smoke level and one column per Gender level.
# na.rm = TRUE (spelled out rather than the reassignable T) is passed to mean().
tapply(X = Age, INDEX = list(Smoke, Gender), mean, na.rm = TRUE)
##       female     male
## no  12.12739 11.94910
## yes 14.75000 14.81818
# Subsetting with [ ] gives the same numbers, but needs one call per combination
mean(Age[Smoke == "no" & Gender == "female"])
## [1] 12.12739
mean(Age[Smoke == "no" & Gender == "male"])
## [1] 11.9491
mean(Age[Smoke == "yes" & Gender == "female"])
## [1] 14.75
mean(Age[Smoke == "yes" & Gender == "male"])
## [1] 14.81818

The same operations with the ‘by’ command

# The "by" command does the same calculation as the "tapply" command, but
# prints the result for each group in its own labelled section instead of
# as a table.
# na.rm = TRUE (spelled out rather than the reassignable T) is passed to mean().
by(Age, list(Smoke, Gender), mean, na.rm = TRUE)
## : no
## : female
## [1] 12.12739
## ------------------------------------------------------------ 
## : yes
## : female
## [1] 14.75
## ------------------------------------------------------------ 
## : no
## : male
## [1] 11.9491
## ------------------------------------------------------------ 
## : yes
## : male
## [1] 14.81818
# Save the "by" result in an object and extract a single element by position
temp <- by(Age, list(Smoke, Gender), mean, na.rm = TRUE)
temp
## : no
## : female
## [1] 12.12739
## ------------------------------------------------------------ 
## : yes
## : female
## [1] 14.75
## ------------------------------------------------------------ 
## : no
## : male
## [1] 11.9491
## ------------------------------------------------------------ 
## : yes
## : male
## [1] 14.81818
temp[4]
## [1] 14.81818
# Converting the result to a vector
c(temp)
## [1] 12.12739 14.75000 11.94910 14.81818
temp2 <- c(temp)
temp2
## [1] 12.12739 14.75000 11.94910 14.81818
# Checking its class
class(temp2)
## [1] "numeric"

View this content on my page: https://dataz4s.com/r-statistical-programming/tapply-by-command-r/