# Show the names of predictors 34 and 32, the pair examined below.
names(spam)[c(34, 32)]
## [1] "num415" "num857"

# Scatter one predictor against the other; the points lie close to a
# straight line.
plot(spam[, 34], spam[, 32])

# Keep only those two columns and run a principal component analysis on them.
smallSpam <- spam[, c(34, 32)]
prComp <- prcomp(smallSpam)

# Scores of every observation on the first two principal components.
plot(prComp$x[, 1], prComp$x[, 2])

# Loadings: how each original variable contributes to each component.
prComp$rotation
## PC1 PC2
## num415 0.7080625 0.7061498
## num857 0.7061498 -0.7080625
# Numeric colour code per observation: 1 when type is not "spam", 2 when it
# is (black and red under R's default palette).
typeColor <- (spam$type == "spam") * 1 + 1

# PCA on all columns except column 58 (presumably the `type` label -- confirm).
# A log10(x + 1) transform is applied first to make the data look more normal.
prComp <- prcomp(log10(spam[, -58] + 1))
plot(prComp$x[, 1], prComp$x[, 2], col = typeColor, xlab = "PC1", ylab = "PC2")
# In the plot, the two colours separate mainly along PC1.

# Same analysis through caret: fit a two-component PCA preprocessor.
# log10 is used here (instead of natural log) to match the other transforms in
# this script; caret's PCA preprocessing centers and scales the data, so the
# choice of log base should not change the resulting component scores.
preProc <- preProcess(log10(spam[, -58] + 1), method = "pca", pcaComp = 2)
# Apply the fitted preprocessing to obtain the principal component scores.
spamPC <- predict(preProc, log10(spam[, -58] + 1))
plot(spamPC[, 1], spamPC[, 2], col = typeColor)
## Preprocessing with PCA: fit the PCA on the training predictors, then train
## a model on the resulting components.
preProc <- preProcess(log10(training[, -58] + 1), method = "pca", pcaComp = 2)
trainPC <- predict(preProc, log10(training[, -58] + 1))
# Pass predictors and outcome explicitly instead of the formula
# `training$type ~ .` with `data = trainPC`: that formula refers to a response
# that is not a column of `data`, which is fragile and reported to fail in
# newer caret versions.
modelFit <- train(x = trainPC, y = training$type, method = "glm")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Project the test predictors with the preprocessing fitted earlier (preProc),
# using the same log10(x + 1) transform.
testPC <- predict(preProc, log10(testing[, -58] + 1))
# confusionMatrix() takes the predicted classes as `data` and the true classes
# as `reference`; name both so they cannot be swapped.
# NOTE: the recorded output below was produced with the two arguments in the
# opposite order, so its table is transposed and the per-class statistics are
# swapped (sensitivity <-> pos pred value, specificity <-> neg pred value,
# prevalence <-> detection prevalence). Accuracy and kappa are unaffected.
confusionMatrix(data = predict(modelFit, testPC), reference = testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 652 45
## spam 67 386
##
## Accuracy : 0.9026
## 95% CI : (0.884, 0.9191)
## No Information Rate : 0.6252
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.7943
## Mcnemar's Test P-Value : 0.04722
##
## Sensitivity : 0.9068
## Specificity : 0.8956
## Pos Pred Value : 0.9354
## Neg Pred Value : 0.8521
## Prevalence : 0.6252
## Detection Rate : 0.5670
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.9012
##
## 'Positive' Class : nonspam
##