Why preprocess?

# Packages: caret for partitioning/preprocessing, kernlab for the spam data.
library(caret)
library(kernlab)
data(spam)

# Stratified 75/25 split on the outcome; list = FALSE returns row indices
# as a matrix that can be used directly for subsetting.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Distribution of the average capital-letter run length in the training set.
hist(
  training$capitalAve,
  main = "",
  xlab = "ave. capital run length"
)

Why preprocess?

# Centre and spread of the raw predictor: the standard deviation is several
# times the mean, which is the motivation for preprocessing it.
with(training, mean(capitalAve))
## [1] 5.121469
with(training, sd(capitalAve))
## [1] 31.6629

Standardizing

# Standardize by hand: subtract the training mean, divide by the training sd.
trainCapAve <- training[["capitalAve"]]
trainCapAveS <- local({
  centre <- mean(trainCapAve)
  spread <- sd(trainCapAve)
  (trainCapAve - centre) / spread
})
# After standardizing, the mean is ~0 (up to floating point) and the sd is 1.
mean(trainCapAveS)
## [1] -4.391447e-18
sd(trainCapAveS)
## [1] 1

Standardizing - test set

# The test set is standardized with the TRAINING mean and sd, never its own,
# so the result is only approximately mean 0 / sd 1.
testCapAve <- testing[["capitalAve"]]
testCapAveS <- local({
  trainCentre <- mean(trainCapAve)
  trainSpread <- sd(trainCapAve)
  (testCapAve - trainCentre) / trainSpread
})
mean(testCapAveS)
## [1] 0.00885096
sd(testCapAveS)
## [1] 1.00879

Standardizing - preProcess function

# Let caret estimate the centring/scaling parameters from the training
# predictors. Column 58 is dropped before fitting (presumably the `type`
# outcome, given the 57 predictors reported by train() -- confirm).
preObj <- preProcess(training[, -58], method = c("center", "scale"))

# Apply the fitted transformation and pull out the column of interest.
trainCapAveS <- predict(preObj, newdata = training[, -58])[["capitalAve"]]
mean(trainCapAveS)
## [1] -4.391447e-18
sd(trainCapAveS)
## [1] 1

Standardizing - preProcess function

# Reuse the preprocessing object fitted on the training set, so the test
# predictors are transformed with the training parameters.
testCapAveS <- predict(preObj, newdata = testing[, -58])[["capitalAve"]]
mean(testCapAveS)
## [1] 0.00885096
sd(testCapAveS)
## [1] 1.00879

Standardizing - preProcess argument

# Fix the RNG so the bootstrap resamples drawn by train() are repeatable.
set.seed(32343)

# Centring and scaling are passed to train() directly via `preProcess`,
# rather than being applied to the data beforehand.
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)
print(modelFit)
## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9181332  0.8274945
## 
## 

Standardizing - Box-Cox transforms

# Fit a Box-Cox transformation on the training predictors and apply it.
preObj <- preProcess(training[, -58], method = "BoxCox")
trainCapAveS <- predict(preObj, newdata = training[, -58])[["capitalAve"]]

# Side-by-side diagnostics: histogram and normal Q-Q plot of the
# transformed variable.
par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)

# Standardizing - Imputing data

# Fix the RNG so the same rows are blanked out on every run.
set.seed(13343)
# library() stops with an error if RANN is not installed; require() would
# only warn and return FALSE, postponing the failure to a less obvious place.
library(RANN)

# Make some values NA
# Work on a copy (capAve) so the untouched capitalAve remains available as
# the ground truth for the comparison below.
training$capAve <- training$capitalAve
# One Bernoulli(0.05) draw per row: TRUE marks a value to be removed.
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA

# Impute and standardize
# NOTE(review): training[, -58] drops only column 58, so the complete
# capitalAve column is still among the predictors handed to knnImpute --
# confirm that this is intended for the demonstration.
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, training[, -58])$capAve

# Standardize true values
# Put the ground truth on the same standardized scale so that the imputed
# and true values can be compared.
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)

Standardizing - Imputing data

# Imputed/standardized values minus the standardized truth, over all rows.
quantile(capAve - capAveTruth, probs = seq(0, 1, 0.25))
##            0%           25%           50%           75%          100% 
## -0.4780907248  0.0004790646  0.0011232362  0.0014329089  0.6938164145
# Only the rows whose value was blanked out and then imputed.
quantile(capAve[selectNA] - capAveTruth[selectNA])
##           0%          25%          50%          75%         100% 
## -0.352427698 -0.005650978  0.003686965  0.024096109  0.693816414
# Only the rows that were never set to NA.
quantile(capAve[!selectNA] - capAveTruth[!selectNA])
##            0%           25%           50%           75%          100% 
## -0.4780907248  0.0004933287  0.0011147430  0.0014132005  0.0016624413

Notes and further reading