Why preprocess?

# Attach the packages used below and load the spam data set.
library(caret)
library(kernlab)
data(spam)

# Put 75% of the rows in the training set, partitioning on the `type`
# column; the remaining rows form the test set.
# NOTE(review): no seed is set before this call, so the split (and the
# numbers printed further down) may differ from run to run.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Distribution of capitalAve in the training set.
hist(training[["capitalAve"]], main = "", xlab = "ave. capital run length")

Why preprocess?

# Average of capitalAve over the training rows.
mean(training[["capitalAve"]])

## [1] 5.121469

# Standard deviation of the same column, for comparison with the mean.
sd(training[["capitalAve"]])

## [1] 31.6629

Standardizing

# Standardize the training values of capitalAve by hand: subtract the
# mean, then divide by the standard deviation.
trainCapAve <- training[["capitalAve"]]
trainCapAveS <- trainCapAve - mean(trainCapAve)
trainCapAveS <- trainCapAveS / sd(trainCapAve)

# Check: the mean should be (numerically) zero ...
mean(trainCapAveS)

## [1] -4.391447e-18

# ... and the standard deviation one.
sd(trainCapAveS)

## [1] 1

Standardizing - test set

# Standardize the test values with the TRAINING mean and standard
# deviation, so that no statistic is estimated from the test set.
testCapAve <- testing[["capitalAve"]]
testCapAveS <- testCapAve - mean(trainCapAve)
testCapAveS <- testCapAveS / sd(trainCapAve)

# The results are only approximately 0 and 1, because the parameters
# were computed on different rows.
mean(testCapAveS)

## [1] 0.00885096

sd(testCapAveS)

## [1] 1.00879

Standardizing - preProcess function

# Estimate centering and scaling parameters from the training columns
# (column 58 is dropped -- presumably the outcome `type`; confirm), then
# apply them to those same columns and pull out capitalAve.
preObj <- preProcess(training[, -58], method = c("center", "scale"))
trainCapAveS <- predict(preObj, newdata = training[, -58])[["capitalAve"]]
mean(trainCapAveS)

## [1] -4.391447e-18

sd(trainCapAveS)

## [1] 1

Standardizing - preProcess function

# Apply the preprocessing object to the test columns (column 58 dropped)
# and pull out the transformed capitalAve.
testCapAveS <- predict(preObj, newdata = testing[, -58])[["capitalAve"]]
mean(testCapAveS)

## [1] 0.00885096

sd(testCapAveS)

## [1] 1.00879

Standardizing - preProcess argument

# Fit a generalized linear model of `type` on every other column, asking
# train() to center and scale the predictors as part of the fit.
set.seed(32343)
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)
modelFit

## Generalized Linear Model 
## 
## 3451 samples
##   57 predictor
##    2 classes: 'nonspam', 'spam' 
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9181332  0.8274945
## 
##

Standardizing - Box-Cox transforms

# Estimate a Box-Cox transformation from the training columns (column 58
# is dropped) and apply it, keeping the transformed capitalAve.
preObj <- preProcess(training[, -58], method = "BoxCox")
trainCapAveS <- predict(preObj, training[, -58])$capitalAve

# Draw the histogram and the normal Q-Q plot side by side. par() returns
# the previous settings, which are restored afterwards so that the
# two-panel layout does not leak into later plots.
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
par(oldPar)

# Standardizing - Imputing data

set.seed(13343)
# library() stops with an error when the package is missing, whereas
# require() would only warn and let the script fail later.
# (RANN is presumably needed by the knnImpute method below -- confirm.)
library(RANN)

# Make some values NA
# Work on a copy of capitalAve and blank out a random ~5% of its entries;
# selectNA records which rows were blanked.
training$capAve <- training$capitalAve
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA

# Impute and standardize
# NOTE(review): only column 58 is dropped here, so the untouched
# capitalAve column is still among the columns passed to the imputation
# alongside its masked copy capAve -- confirm this is intended.
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, training[, -58])$capAve

# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)

Standardizing - Imputing data

# Imputed minus true standardized values, over every training row.
quantile(capAve - capAveTruth)

##            0%           25%           50%           75%          100% 
## -0.4780907248  0.0004790646  0.0011232362  0.0014329089  0.6938164145

# The same differences, restricted to the entries that were set to NA.
quantile(capAve[selectNA] - capAveTruth[selectNA])

##           0%          25%          50%          75%         100% 
## -0.352427698 -0.005650978  0.003686965  0.024096109  0.693816414

# ... and restricted to the entries that were left untouched.
quantile(capAve[!selectNA] - capAveTruth[!selectNA])

##            0%           25%           50%           75%          100% 
## -0.4780907248  0.0004933287  0.0011147430  0.0014132005  0.0016624413

Notes and further reading

Training and test sets must be processed in the same way
Test transformations will likely be imperfect, because they reuse parameters estimated from the training set
Especially if the test and training sets were collected at different times
Be careful when transforming factor variables!
Preprocessing with caret

Preprocessing

Duc Nguyen

Why preprocess?

Standardizing

Standardizing - test set

Standardizing - preProcess function

Standardizing - preProcess function

Standardizing - preProcess argument

Standardizing - Box-Cox transforms

Standardizing - Imputing data

Notes and further reading