Preprocessing Data

# Keep warnings out of the rendered chunk output.
knitr::opts_chunk$set(warning = FALSE)

# Attach the modelling packages, then load the spam dataset.
library(caret)
library(kernlab)
data(spam)

## Loading required package: lattice
## Loading required package: ggplot2

# Put 75% of the rows in the training set, partitioning on spam$type; the
# remaining rows form the test set.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Histogram of capitalAve on the training rows, with a rug of the raw values.
hist(training[["capitalAve"]], main = "", xlab = "ave. capital run length")
rug(training[["capitalAve"]])

mean(training[["capitalAve"]])

## [1] 5.340146

sd(training[["capitalAve"]])

## [1] 34.47517

The capitalAve feature of the training subset of spam is highly skewed, has multiple outliers, and has a very high standard deviation. It is thus a good candidate for preprocessing.

Preprocessing via standardization

# Standardize capitalAve by hand: subtract the training mean and divide by the
# training standard deviation.
trainCapAve <- training$capitalAve
trainMean <- mean(trainCapAve)
trainSd <- sd(trainCapAve)
trainCapAveS <- (trainCapAve - trainMean) / trainSd
mean(trainCapAveS)

## [1] -4.940212e-19

sd(trainCapAveS)

## [1] 1

testCapAve <- testing$capitalAve
testCapAveS <- (testCapAve - trainMean) / trainSd
mean(testCapAveS)

## [1] -0.01724877

sd(testCapAveS)

## [1] 0.6231694

# Note that testing$capitalAve is standardized with the mean and sd of the
# training set, not with its own.

We can accomplish the same normalization with the preProcess function in the caret package.

# Learn centering and scaling parameters from the training data with column 58
# dropped (presumably the `type` outcome), then apply them to the training set.
preObj <- preProcess(training[, -58], method = c("center", "scale"))
trainCapAveS <- predict(preObj, newdata = training[, -58])[["capitalAve"]]
mean(trainCapAveS)

## [1] -4.940212e-19

sd(trainCapAveS)

## [1] 1

# Apply the same training-derived transformation to the test set.
testCapAveS <- predict(preObj, newdata = testing[, -58])[["capitalAve"]]
mean(testCapAveS)

## [1] -0.01724877

sd(testCapAveS)

## [1] 0.6231694

Standardizing: Imputing Data

set.seed(13343)

# Copy capitalAve into capAve and set a random subset of the copy to NA; each
# row is selected with probability 0.05.
training$capAve <- training$capitalAve
selectNA <- rbinom(nrow(training), size = 1, prob = 0.05) == 1
training$capAve[selectNA] <- NA

# Impute and standardize
preObj <- preProcess(training[, -58], method = "knnImpute")
capAve <- predict(preObj, newdata = training[, -58])[["capAve"]]

# Standardize the untouched capitalAve values to serve as the reference.
capAveTruth <- with(training, (capitalAve - mean(capitalAve)) / sd(capitalAve))

# Differences over all rows, then over the rows set to NA, then over the rest.
quantile(capAve - capAveTruth)

##            0%           25%           50%           75%          100% 
## -6.0285284620 -0.0005132252  0.0001545048  0.0015654873  1.0134646906

quantile((capAve - capAveTruth)[selectNA])

##           0%          25%          50%          75%         100% 
## -6.028528462 -0.011882272  0.002922897  0.016200973  0.188186813

quantile((capAve - capAveTruth)[!selectNA])

##            0%           25%           50%           75%          100% 
## -0.0010243840 -0.0004842135  0.0001512813  0.0014595715  1.0134646906

Preprocessing via PCA

# Start the PCA section from a fresh split. The workspace is deliberately not
# wiped with rm(list = ls()): the objects used from here on (spam, inTrain,
# training, testing) are all re-created by the statements below.
library(caret)
library(kernlab)
data(spam)
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Absolute pairwise correlations between the training columns (column 58
# dropped). The diagonal is zeroed so that a column's correlation with itself
# is not reported.
M <- abs(cor(training[, -58]))
diag(M) <- 0

# Row/column indices of the pairs whose absolute correlation exceeds 0.8.
which(M > 0.8, arr.ind = TRUE)

##        row col
## num415  34  32
## direct  40  32
## num857  32  34
## direct  40  34
## num857  32  40
## num415  34  40

# Column names behind each highly correlated pair of indices.
names(spam)[c(34, 32)]
names(spam)[c(40, 32)]
names(spam)[c(40, 34)]

## [1] "num415" "num857"

## [1] "direct" "num857"

## [1] "direct" "num415"

# Lay out three panels in one row; par() returns the previous settings, which
# are kept in p. One scatterplot is drawn per correlated pair of columns.
p <- par(mfrow = c(1, 3))
plot(spam[, 34], spam[, 32])
plot(spam[, 40], spam[, 32])
plot(spam[, 40], spam[, 34])

We apply principal components analysis to the three highly correlated features and plot the resulting components against each other pairwise.

# Run PCA on just columns 32, 34 and 40, then plot every pair of the three
# resulting components.
miniSpam <- spam[, c(32, 34, 40)]
PCA <- prcomp(miniSpam)
plot(PCA$x[, 1], PCA$x[, 2])
plot(PCA$x[, 1], PCA$x[, 3])
plot(PCA$x[, 2], PCA$x[, 3])

PCA is valuable because it can be applied not just to a few features at a time but to all of the features in the dataset simultaneously.

# Colour index per row: 2 where type is "spam", 1 otherwise.
typeCol <- ifelse(spam$type == "spam", 2, 1)

# PCA over every column except the 58th, after a log10(x + 1) transform.
prComp <- prcomp(log10(spam[-58] + 1))

# Back to a single-panel layout before plotting the first two components.
p <- par(mfrow = c(1, 1))
plot(prComp$x[, 1], prComp$x[, 2], col = typeCol, xlab = "PC1", ylab = "PC2")

Finally, we use PCA as a preprocessing step and fit a model to the principal components of the training data.

# Fit a two-component PCA on the log10(x + 1)-transformed training data
# (column 58 dropped), then project the same data onto those components.
preProc <- preProcess(
  log10(training[, -58] + 1),
  method = "pca",
  pcaComp = 2
)
trainPC <- predict(preProc, newdata = log10(training[, -58] + 1))

# Fit a glm on the principal-component scores. The outcome is supplied through
# train()'s x/y interface instead of the formula `training$type ~ .`, whose
# response term refers to an object outside `data` and is therefore fragile.
modelFit <- train(x = trainPC, y = training$type, method = "glm")

We observe the performance of the model when applied to the test data.

# Project the test data with the PCA that was fitted on the training set.
testPC <- predict(preProc, log10(testing[, -58] + 1))

# confusionMatrix() takes the predictions as `data` and the true labels as
# `reference`.
# NOTE: the printed output that follows was generated with these two arguments
# swapped, so its "Prediction" rows are the true classes and its "Reference"
# columns are the model's predictions (Sensitivity/Specificity and
# Pos/Neg Pred Value are exchanged accordingly).
confusionMatrix(data = predict(modelFit, testPC), reference = testing$type)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     636   61
##    spam         54  399
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8812, 0.9167)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7911          
##  Mcnemar's Test P-Value : 0.5758          
##                                           
##             Sensitivity : 0.9217          
##             Specificity : 0.8674          
##          Pos Pred Value : 0.9125          
##          Neg Pred Value : 0.8808          
##              Prevalence : 0.6000          
##          Detection Rate : 0.5530          
##    Detection Prevalence : 0.6061          
##       Balanced Accuracy : 0.8946          
##                                           
##        'Positive' Class : nonspam         
##

Preprocessing Data

MSL

October 25, 2015

Preprocessing via standardization

Standardizing: Imputing Data

Preprocessing via PCA