knitr::opts_chunk$set(warning=FALSE)
library(caret);library(kernlab);data(spam)
## Loading required package: lattice
## Loading required package: ggplot2
inTrain<-createDataPartition(y=spam$type,
p=0.75,list=FALSE)
training<-spam[inTrain,]
testing<-spam[-inTrain,]
hist(training$capitalAve,main="",xlab="ave. capital run length")
rug(x = training$capitalAve)
mean(training$capitalAve)
## [1] 5.340146
sd(training$capitalAve)
## [1] 34.47517
The capitalAve feature of the training subset of spam is highly skewed, has multiple outliers, and has a very high standard deviation. It is thus a good candidate for preprocessing.
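To put a number on the skew, we can compute the sample skewness; a quick check, assuming the e1071 package is installed:
library(e1071)
skewness(training$capitalAve)  # a large positive value confirms the long right tail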
trainCapAve<-training$capitalAve
trainCapAveS<-(trainCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(trainCapAveS)
## [1] -4.940212e-19
sd(trainCapAveS)
## [1] 1
testCapAve<-testing$capitalAve
testCapAveS<-(testCapAve-mean(trainCapAve))/sd(trainCapAve)
mean(testCapAveS)
## [1] -0.01724877
sd(testCapAveS)
## [1] 0.6231694
# Note: testing$capitalAve is standardized with the mean and sd of the training set, so the test-set mean and sd are close to, but not exactly, 0 and 1
We can accomplish the same standardization with the preProcess() function from the caret package.
preObj<-preProcess(training[,-58],method=c("center","scale"))
trainCapAveS<-predict(preObj,training[,-58])$capitalAve
mean(trainCapAveS)
## [1] -4.940212e-19
sd(trainCapAveS)
## [1] 1
testCapAveS<-predict(preObj,testing[,-58])$capitalAve
mean(testCapAveS)
## [1] -0.01724877
sd(testCapAveS)
## [1] 0.6231694
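The same centering and scaling can also be requested inside train() itself, which then applies it to any new data passed to predict(). A minimal sketch, not run here; the glm method and the modelFitStd name are purely illustrative:
modelFitStd <- train(type ~ ., data = training, method = "glm",
                     preProcess = c("center", "scale"))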
set.seed(13343)
training$capAve <- training$capitalAve
selectNA <- rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA]<-NA
# Impute and standardize
preObj<-preProcess(training[,-58],method="knnImpute")
capAve<-predict(preObj,training[,-58])$capAve
capAveTruth<-training$capitalAve
capAveTruth<-(capAveTruth-mean(capAveTruth))/sd(capAveTruth)
quantile(capAve-capAveTruth)
## 0% 25% 50% 75% 100%
## -6.0285284620 -0.0005132252 0.0001545048 0.0015654873 1.0134646906
quantile((capAve-capAveTruth)[selectNA])
## 0% 25% 50% 75% 100%
## -6.028528462 -0.011882272 0.002922897 0.016200973 0.188186813
quantile((capAve-capAveTruth)[!selectNA])
## 0% 25% 50% 75% 100%
## -0.0010243840 -0.0004842135 0.0001512813 0.0014595715 1.0134646906
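Apart from a few extreme values, the imputed observations differ from the standardized truth by only a few hundredths and the untouched observations by a few thousandths, so the k-nearest-neighbour imputation works reasonably well here.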
rm(list=ls())
library(caret);library(kernlab);data(spam)
inTrain<-createDataPartition(y=spam$type,
p=0.75,list=FALSE)
training<-spam[inTrain,]
testing<-spam[-inTrain,]
M<-abs(cor(training[,-58]))
diag(M)<-0
which(M>0.8,arr.ind = T)
## row col
## num415 34 32
## direct 40 32
## num857 32 34
## direct 40 34
## num857 32 40
## num415 34 40
names(spam)[c(34,32)]; names(spam)[c(40,32)]; names(spam)[c(40,34)]
## [1] "num415" "num857"
## [1] "direct" "num857"
## [1] "direct" "num415"
p<-par(mfrow=c(1,3))
plot(spam[,34],spam[,32])
plot(spam[,40],spam[,32])
plot(spam[,40],spam[,34])
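Rather than scanning the correlation matrix by hand, caret's findCorrelation() returns the columns it would drop at a given cutoff. A quick sketch on the same training predictors (not run here):
highCor <- findCorrelation(cor(training[,-58]), cutoff = 0.8)
names(training)[highCor]  # candidate columns to remove before modelling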
We apply principal component analysis to these highly correlated features and plot the resulting principal component scores pairwise.
miniSpam<-spam[,c(32,34,40)]
PCA<-prcomp(miniSpam)
plot(PCA$x[,1],PCA$x[,2]); plot(PCA$x[,1],PCA$x[,3]); plot(PCA$x[,2],PCA$x[,3])
PCA is valuable because it is not limited to small sets of correlated features; it can be applied to all of the predictors at once.
typeCol<-((spam$type=="spam")*1+1)  # 2 for spam, 1 for nonspam, used as plot colours
prComp<-prcomp(log10(spam[-58]+1))  # log10(x+1) tames the skew before PCA
p<-par(mfrow=c(1,1))
plot(prComp$x[,1],prComp$x[,2],col=typeCol,xlab="PC1",ylab="PC2")
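How much of the total variance each component captures can be read from the prcomp summary; a quick check:
summary(prComp)$importance[, 1:5]  # sd, proportion of variance, cumulative proportion for PC1-PC5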
Finally, we use the principal components as predictors and fit a model on the training data.
preProc<-preProcess(log10(training[,-58]+1),method="pca",pcaComp=2)
trainPC<-predict(preProc,log10(training[,-58]+1))
trainPC$type<-training$type
modelFit<-train(type~.,method="glm",data=trainPC)
We evaluate the model on the test set, applying the PCA transformation learned from the training data to the test predictors.
testPC<-predict(preProc,log10(testing[,-58]+1))
confusionMatrix(testing$type,predict(modelFit,testPC))
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 636 61
## spam 54 399
##
## Accuracy : 0.9
## 95% CI : (0.8812, 0.9167)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7911
## Mcnemar's Test P-Value : 0.5758
##
## Sensitivity : 0.9217
## Specificity : 0.8674
## Pos Pred Value : 0.9125
## Neg Pred Value : 0.8808
## Prevalence : 0.6000
## Detection Rate : 0.5530
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8946
##
## 'Positive' Class : nonspam
##
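As an aside, the whole pipeline can be collapsed into a single call by letting train() perform the PCA itself. A sketch of this one-step alternative (not run here; it omits the log10 transform used above, so the results will differ slightly):
modelFit2 <- train(type ~ ., data = training, method = "glm", preProcess = "pca")
confusionMatrix(testing$type, predict(modelFit2, testing))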