Correlated predictors

# Attach caret (modelling helpers) and kernlab, then load the `spam` data set.
library(caret)
library(kernlab)
data(spam)

# Put 75% of the rows in the training set, partitioned on the outcome `type`;
# the remaining rows become the test set.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

# Absolute pairwise correlations between the predictors. Column 58 is dropped
# (presumably the outcome `type` -- confirm against the data set).
M <- abs(cor(training[, -58]))
# Zero the diagonal so each variable's correlation with itself is ignored.
diag(M) <- 0
# Report the (row, col) positions of pairs whose correlation exceeds 0.8.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
which(M > 0.8, arr.ind = TRUE)
       row col
num415  34  32
num857  32  34

Correlated predictors

# Look up the names of columns 34 and 32.
names(spam)[c(34, 32)]
[1] "num415" "num857"
# Scatter plot of column 34 against column 32.
plot(spam[, 34], spam[, 32])
plot of chunk unnamed-chunk-1

Basic PCA idea


We could rotate the plot

\[ X = 0.71 \times {\rm num415} + 0.71 \times {\rm num857}\]

\[ Y = 0.71 \times {\rm num415} - 0.71 \times {\rm num857}\]

# Rotate the two predictors by hand: X is their scaled sum and Y their scaled
# difference.
X <- with(training, 0.71 * num415 + 0.71 * num857)
Y <- with(training, 0.71 * num415 - 0.71 * num857)
plot(X, Y)
plot of chunk unnamed-chunk-2

Principal components in R - prcomp

# Run PCA on just columns 34 and 32 of the full data set.
smallSpam <- spam[, c(34, 32)]
prComp <- prcomp(smallSpam)
# First principal component on the x-axis, second on the y-axis.
plot(prComp$x[, 1], prComp$x[, 2])
plot of chunk prcomp

Principal components in R - prcomp

# Print the rotation matrix: one row per original variable, one column per
# principal component.
prComp$rotation
          PC1     PC2
num415 0.7081  0.7061
num857 0.7061 -0.7081

PCA on SPAM data

# Colour index per message: 2 where type is "spam", 1 otherwise.
typeColor <- as.numeric(spam$type == "spam") + 1
# Apply log10(x + 1) to every column except column 58 before running PCA.
prComp <- prcomp(log10(spam[, -58] + 1))
plot(
  prComp$x[, 1], prComp$x[, 2],
  col = typeColor, xlab = "PC1", ylab = "PC2"
)
plot of chunk spamPC

PCA with caret

# Fit a two-component PCA preprocessing step on the log10(x + 1) transformed
# columns (column 58 excluded), then apply it to the same data.
preProc <- preProcess(log10(spam[, -58] + 1), method = "pca", pcaComp = 2)
spamPC <- predict(preProc, newdata = log10(spam[, -58] + 1))
plot(spamPC[, 1], spamPC[, 2], col = typeColor)
plot of chunk unnamed-chunk-4

Preprocessing with PCA

# Learn the PCA rotation from the training predictors only (log10(x + 1)
# transformed, column 58 excluded) and project the training set onto 2 PCs.
preProc <- preProcess(log10(training[, -58] + 1), method = "pca", pcaComp = 2)
trainPC <- predict(preProc, newdata = log10(training[, -58] + 1))
# Fit a GLM on the principal components. The outcome is not a column of
# `trainPC`, so it is passed through the x/y interface rather than as
# `training$type ~ .`: a `$` expression on the formula's left-hand side refers
# to names that are not columns of `data`.
modelFit <- train(x = trainPC, y = training$type, method = "glm")

Preprocessing with PCA

# Project the test set with the preprocessing object `preProc`, applying the
# same log10(x + 1) transform to the test predictors first.
testPC <- predict(preProc, newdata = log10(testing[, -58] + 1))
# NOTE(review): confusionMatrix() documents `data` as the predicted classes and
# `reference` as the true classes; here the true labels are passed as `data`,
# so the "Prediction"/"Reference" labels in the printed table are swapped.
# Confirm before relying on sensitivity/specificity.
confusionMatrix(
  data = testing$type,
  reference = predict(modelFit, newdata = testPC)
)
Confusion Matrix and Statistics

          Reference
Prediction nonspam spam
   nonspam     646   51
   spam         64  389
                                        
               Accuracy : 0.9           
                 95% CI : (0.881, 0.917)
    No Information Rate : 0.617         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.79          
 Mcnemar's Test P-Value : 0.263         
                                        
            Sensitivity : 0.910         
            Specificity : 0.884         
         Pos Pred Value : 0.927         
         Neg Pred Value : 0.859         
             Prevalence : 0.617         
         Detection Rate : 0.562         
   Detection Prevalence : 0.606         
                                        
       'Positive' Class : nonspam       
                                        

Alternative (sets # of PCs)

# Let train() run the PCA preprocessing itself. The response is written as the
# bare column name `type` so that `.` expands to the remaining columns only;
# with `training$type` on the left-hand side the `type` column is not
# recognised as the response and stays among the predictors.
modelFit <- train(type ~ ., method = "glm", preProcess = "pca", data = training)
# NOTE(review): confusionMatrix() documents `data` as the predicted classes and
# `reference` as the true classes; the original argument order is kept here, so
# the "Prediction"/"Reference" labels in the printed table are swapped.
confusionMatrix(
  data = testing$type,
  reference = predict(modelFit, newdata = testing)
)
Confusion Matrix and Statistics

          Reference
Prediction nonspam spam
   nonspam     660   37
   spam         54  399
                                        
               Accuracy : 0.921         
                 95% CI : (0.904, 0.936)
    No Information Rate : 0.621         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.833         
 Mcnemar's Test P-Value : 0.0935        
                                        
            Sensitivity : 0.924         
            Specificity : 0.915         
         Pos Pred Value : 0.947         
         Neg Pred Value : 0.881         
             Prevalence : 0.621         
         Detection Rate : 0.574         
   Detection Prevalence : 0.606         
                                        
       'Positive' Class : nonspam       
                                        

Final thoughts on PCs