#Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method="glm" in the train function.
#What is the accuracy of each method in the test set? Which is more accurate?
library(caret)
library(AppliedPredictiveModeling)
# Load the data set; the script expects it to provide the `diagnosis` and
# `predictors` objects that are combined into one data frame below.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)

# Seed the RNG right before the only random step so the split is reproducible.
set.seed(3433)

# 75% of the rows go to training; the remaining rows form the test set.
inTrain <- createDataPartition(y = adData$diagnosis, p = 0.75, list = FALSE)
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
# Positions of the columns whose names begin with "IL" (either letter case);
# the pattern is anchored at the start of the name.
grepIL <- grep("^[Ii][Ll].*", names(training))

# Reduced training/test sets: the IL columns followed by the outcome column.
new_training <- cbind(training[, grepIL], diagnosis = training$diagnosis)
new_testing <- cbind(testing[, grepIL], diagnosis = testing$diagnosis)
# Model 1: glm fitted on principal components of the predictors.
#
# Select the predictor columns by name instead of by a hard-coded position
# (previously `[-13]`), so the outcome is excluded correctly regardless of
# how many predictor columns the reduced data sets contain.
predictor_cols <- setdiff(names(new_training), "diagnosis")

# Learn the PCA transformation on the training predictors only; thresh = 0.8
# keeps the components needed to explain 80% of the predictor variance.
preProc <- preProcess(new_training[predictor_cols], method = "pca",
                      thresh = 0.8)

# Apply the training-derived transformation to both sets and re-attach the
# outcome column.
trainPC <- predict(preProc, new_training[predictor_cols])
trainPC <- cbind(trainPC, diagnosis = new_training$diagnosis)
testPC <- predict(preProc, new_testing[predictor_cols])
testPC <- cbind(testPC, diagnosis = new_testing$diagnosis)

# Fit on the component scores and evaluate on the held-out test set.
pcFit <- train(diagnosis ~ ., data = trainPC, method = "glm")
predpcFit <- predict(pcFit, newdata = testPC)
confusionMatrix(predpcFit, testPC$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 3 4
## Control 19 56
##
## Accuracy : 0.7195
## 95% CI : (0.6094, 0.8132)
## No Information Rate : 0.7317
## P-Value [Acc > NIR] : 0.651780
##
## Kappa : 0.0889
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.13636
## Specificity : 0.93333
## Pos Pred Value : 0.42857
## Neg Pred Value : 0.74667
## Prevalence : 0.26829
## Detection Rate : 0.03659
## Detection Prevalence : 0.08537
## Balanced Accuracy : 0.53485
##
## 'Positive' Class : Impaired
##
# Model 2: glm fitted on the selected predictors as they are (no PCA step).
NonPcFit <- train(
  diagnosis ~ .,
  data = new_training,
  method = "glm"
)

# Predict the held-out test set and compare against the true diagnosis.
predNonPcFit <- predict(NonPcFit, newdata = new_testing)
confusionMatrix(data = predNonPcFit, reference = new_testing$diagnosis)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Impaired Control
## Impaired 2 9
## Control 20 51
##
## Accuracy : 0.6463
## 95% CI : (0.533, 0.7488)
## No Information Rate : 0.7317
## P-Value [Acc > NIR] : 0.96637
##
## Kappa : -0.0702
## Mcnemar's Test P-Value : 0.06332
##
## Sensitivity : 0.09091
## Specificity : 0.85000
## Pos Pred Value : 0.18182
## Neg Pred Value : 0.71831
## Prevalence : 0.26829
## Detection Rate : 0.02439
## Detection Prevalence : 0.13415
## Balanced Accuracy : 0.47045
##
## 'Positive' Class : Impaired
##
# Results: test-set accuracy is 0.65 for the model without preprocessing and
# 0.72 for the PCA-preprocessed model, so the PCA model is more accurate.