# Load the modelling packages and the spam data set.
library(caret)
library(kernlab)
data(spam)

# Split on the outcome column `type`: p = 0.75 of the rows go to the training
# set and the remaining rows form the test set.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Histogram of the average capital run length in the training rows.
hist(
  training$capitalAve,
  main = "",
  xlab = "ave. capital run length"
)
# Why preprocess?
# Raw scale of capitalAve in the training set: the standard deviation is
# several times larger than the mean.
capAveMu <- mean(training$capitalAve)
capAveMu
## [1] 5.121469
capAveSigma <- sd(training$capitalAve)
capAveSigma
## [1] 31.6629

# Standardize the training values with their own mean and standard deviation.
trainCapAve <- training$capitalAve
trainCapAveS <- (trainCapAve - capAveMu) / capAveSigma
mean(trainCapAveS)
## [1] -4.391447e-18
sd(trainCapAveS)
## [1] 1

# The test values are standardized with the TRAINING mean and sd, so the
# result is only approximately mean 0 / sd 1.
testCapAve <- testing$capitalAve
testCapAveS <- (testCapAve - capAveMu) / capAveSigma
mean(testCapAveS)
## [1] 0.00885096
sd(testCapAveS)
## [1] 1.00879
# Same standardization through caret: the centring/scaling parameters are
# learned from the training predictors only (column 58 is dropped; presumably
# that is the `type` outcome -- confirm against the data set).
trainPredictors <- training[, -58]
preObj <- preProcess(trainPredictors, method = c("center", "scale"))

# Apply the fitted transformation to the training predictors.
trainCapAveS <- predict(preObj, trainPredictors)[["capitalAve"]]
mean(trainCapAveS)
## [1] -4.391447e-18
sd(trainCapAveS)
## [1] 1

# Apply the SAME fitted object (training parameters) to the test predictors.
testCapAveS <- predict(preObj, testing[, -58])[["capitalAve"]]
mean(testCapAveS)
## [1] 0.00885096
sd(testCapAveS)
## [1] 1.00879
# Fit a generalized linear model for `type` on all other columns, letting
# train() centre and scale the predictors itself. The seed is fixed first
# because the fit is evaluated with bootstrap resampling (see output below).
set.seed(32343)
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm",
  preProcess = c("center", "scale")
)
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9181332 0.8274945
##
##
# Box-Cox transform the training predictors (column 58 dropped) and pull out
# the transformed capitalAve for inspection.
preObj <- preProcess(training[, -58], method = "BoxCox")
trainCapAveS <- predict(preObj, training[, -58])$capitalAve

# Draw the histogram and the normal Q-Q plot side by side. par() returns the
# previous settings, which are restored afterwards so that later plots are not
# left in the 1x2 layout.
oldPar <- par(mfrow = c(1, 2))
hist(trainCapAveS)
qqnorm(trainCapAveS)
par(oldPar)
# Standardizing - Imputing data
set.seed(13343)
# library() stops with an error if RANN is not installed, whereas require()
# would only warn and let the script fail later.
# NOTE(review): RANN is presumably needed by the "knnImpute" step below.
library(RANN)

# Make some values NA: copy capitalAve to capAve and blank out a random
# subset of rows (each row selected with probability 0.05).
training$capAve <- training$capitalAve
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA

# Impute and standardize
# Column 58 is dropped; the untouched capitalAve column remains among the
# columns passed to preProcess alongside the masked capAve copy.
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, training[, -58])$capAve

# Standardize true values
capAveTruth <- training$capitalAve
capAveTruth <- (capAveTruth - mean(capAveTruth)) / sd(capAveTruth)

# Difference between imputed/standardized values and the standardized truth:
# over all rows, over the rows that were blanked out, and over the rest.
quantile(capAve - capAveTruth)
## 0% 25% 50% 75% 100%
## -0.4780907248 0.0004790646 0.0011232362 0.0014329089 0.6938164145
quantile((capAve - capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -0.352427698 -0.005650978 0.003686965 0.024096109 0.693816414
quantile((capAve - capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.4780907248 0.0004933287 0.0011147430 0.0014132005 0.0016624413