Correlated Predictors

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(kernlab)
data(spam)
# Fix the RNG seed so the random train/test split below is reproducible
set.seed(32343)
# Split on the class label: 75% of the rows go to the training set
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
# Absolute pairwise correlations between the predictors. Column 58 is left out
# (presumably the `type` label -- it is the column excluded wherever the
# predictors are used). abs() is applied because only the strength of the
# relationship matters here, not its sign.
M <- abs(cor(training[, -58]))
# Every variable correlates perfectly with itself, so zero out the diagonal
diag(M) <- 0
# Row/column positions of every predictor pair with |correlation| > 0.8
which(M > 0.8, arr.ind = TRUE)
##        row col
## num857  32  31
## num415  34  31
## direct  40  31
## telnet  31  32
## num415  34  32
## direct  40  32
## telnet  31  34
## num857  32  34
## direct  40  34
## telnet  31  40
## num857  32  40
## num415  34  40
#Returns a matrix of row/column indices for every pair of predictors whose absolute correlation is > 0.8

Correlated Predictors

Let’s take num415 and num857 as the example

# Look up the names of predictors no. 34 and no. 32
colnames(spam)[c(34, 32)]
## [1] "num415" "num857"
# Scatter the two predictors against each other
plot(spam[, 34], spam[, 32])

#The points fall almost exactly on a straight line

Now two solutions for dimension reduction – SVD and PCA

PCA

# Keep only the two highly correlated predictors (columns 34 and 32)
smallSpam <- spam[c(34, 32)]
# Principal component analysis of the two-column subset
prComp <- prcomp(smallSpam)
# Scores on the first component against scores on the second
plot(prComp$x[, 1], prComp$x[, 2])

# Rotation matrix: the weight of each original variable in each component
prComp$rotation
##              PC1        PC2
## num415 0.7080625  0.7061498
## num857 0.7061498 -0.7080625

PCA on SPAM data

# Numeric colour code per message: 2 where type is "spam", 1 otherwise
typeColor <- ifelse(spam$type == "spam", 2, 1)
# PCA on the log10(x + 1)-transformed predictors (everything except column 58)
prComp <- prcomp(log10(1 + spam[, -58]))
# First two principal components, coloured by message type
plot(
  x = prComp$x[, "PC1"],
  y = prComp$x[, "PC2"],
  xlab = "PC1",
  ylab = "PC2",
  col = typeColor
)

#From the plot, we can see that the two classes are separated mostly along PC1

PCA with caret (use the caret package to compute and plot the principal components)

# log10(x + 1) transform of the predictors (column 58 excluded), computed once
# and reused for both fitting and applying the pre-processing. log10 is used
# for consistency with the other PCA sections of this script.
logSpam <- log10(spam[, -58] + 1)
# Estimate a PCA pre-processing step that keeps the first two components
preProc <- preProcess(logSpam, method = "pca", pcaComp = 2)
# Apply the fitted pre-processing to obtain the component scores
spamPC <- predict(preProc, logSpam)
# First two components, coloured with the typeColor codes
plot(spamPC[,1],spamPC[,2],col=typeColor)

##Preprocessing with PCA (using the principal components as predictors to get the prediction result)

# Estimate the PCA pre-processing (two components) on the training
# predictors only, after a log10(x + 1) transform; column 58 is excluded
preProc <- preProcess(log10(training[, -58] + 1), method = "pca", pcaComp = 2)
# Component scores for the training set
trainPC <- predict(preProc, log10(training[, -58] + 1))
# Fit a glm on the component scores. The outcome is passed through the x/y
# interface because `type` is not a column of trainPC: a formula such as
# `training$type ~ .` with data = trainPC refers to a variable outside `data`,
# which is fragile and is rejected by recent versions of caret.
modelFit <- train(x = trainPC, y = training$type, method = "glm")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Apply the pre-processing estimated on the training set to the test
# predictors (same log10(x + 1) transform, column 58 excluded)
testPC <- predict(preProc, log10(testing[, -58] + 1))
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`; the arguments are named so the table is labelled correctly.
# NOTE: the output pasted below was produced with the two arguments in the
# opposite order, so its Prediction/Reference labels (and the statistics that
# depend on them, e.g. sensitivity and specificity) are swapped relative to
# what this call prints; accuracy and kappa do not depend on the order.
confusionMatrix(data = predict(modelFit, testPC), reference = testing$type)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction nonspam spam
##    nonspam     652   45
##    spam         67  386
##                                          
##                Accuracy : 0.9026         
##                  95% CI : (0.884, 0.9191)
##     No Information Rate : 0.6252         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.7943         
##  Mcnemar's Test P-Value : 0.04722        
##                                          
##             Sensitivity : 0.9068         
##             Specificity : 0.8956         
##          Pos Pred Value : 0.9354         
##          Neg Pred Value : 0.8521         
##              Prevalence : 0.6252         
##          Detection Rate : 0.5670         
##    Detection Prevalence : 0.6061         
##       Balanced Accuracy : 0.9012         
##                                          
##        'Positive' Class : nonspam        
##