Question 1:

Load the Alzehiemer’s disease data using the commands:

library(AppliedPredictiveModeling)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
data(AlzheimerDisease)

Which of the following commands will create training and test sets with about 50% of the observations assigned to each?

adData = data.frame(diagnosis,predictors)
testIndex = createDataPartition(diagnosis, p = 0.50,list=FALSE)
training = adData[-testIndex,]
testing = adData[testIndex,]

answer third choice.

Question 2:

Load the cement data using the commands:

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]

Make a histogram and confirm the SuperPlasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?

hist(training$Superplasticizer, breaks=20)

hist(log(training$Superplasticizer+1), breaks=20)

*Answer: There are a large number of values that are the same and even if you took the log(SuperPlasticizer + 1) they would still all be identical so the distribution would not be symmetric.

Question 3:

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]

Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?

IL_col_idx <- grep("^[Ii][Ll].*", names(training))
preObj <- preProcess(training[, IL_col_idx], method=c("center", "scale", "pca"), thresh=0.9)
preObj
## 
## Call:
## preProcess.default(x = training[, IL_col_idx], method =
##  c("center", "scale", "pca"), thresh = 0.9)
## 
## Created from 251 samples and 12 variables
## Pre-processing: centered, scaled, principal component signal extraction 
## 
## PCA needed 9 components to capture 90 percent of the variance
names(preObj)
##  [1] "call"       "dim"        "bc"         "yj"         "et"        
##  [6] "mean"       "std"        "ranges"     "rotation"   "method"    
## [11] "thresh"     "pcaComp"    "numComp"    "ica"        "k"         
## [16] "knnSummary" "bagImp"     "median"     "data"

*Answer 9

Question 4:

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]

Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method=“glm” in the train function. What is the accuracy of each method in the test set? Which is more accurate?

# extract new training and testing sets
IL_col_idx <- grep("^[Ii][Ll].*", names(training))
suppressMessages(library(dplyr))
new_training <- training[, c(names(training)[IL_col_idx], "diagnosis")]
names(new_training)
##  [1] "IL_11"         "IL_13"         "IL_16"         "IL_17E"       
##  [5] "IL_1alpha"     "IL_3"          "IL_4"          "IL_5"         
##  [9] "IL_6"          "IL_6_Receptor" "IL_7"          "IL_8"         
## [13] "diagnosis"
IL_col_idx <- grep("^[Ii][Ll].*", names(testing))
suppressMessages(library(dplyr))
new_testing <- testing[, c(names(testing)[IL_col_idx], "diagnosis")]
names(new_testing)
##  [1] "IL_11"         "IL_13"         "IL_16"         "IL_17E"       
##  [5] "IL_1alpha"     "IL_3"          "IL_4"          "IL_5"         
##  [9] "IL_6"          "IL_6_Receptor" "IL_7"          "IL_8"         
## [13] "diagnosis"
# compute the model with non_pca predictors
non_pca_model <- train(diagnosis ~ ., data=new_training, method="glm")
# apply the non pca model on the testing set and check the accuracy
non_pca_result <- confusionMatrix(new_testing[, 13], predict(non_pca_model, new_testing[, -13]))
non_pca_result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        2      20
##   Control         9      51
##                                          
##                Accuracy : 0.6463         
##                  95% CI : (0.533, 0.7488)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1.00000        
##                                          
##                   Kappa : -0.0702        
##  Mcnemar's Test P-Value : 0.06332        
##                                          
##             Sensitivity : 0.18182        
##             Specificity : 0.71831        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.85000        
##              Prevalence : 0.13415        
##          Detection Rate : 0.02439        
##    Detection Prevalence : 0.26829        
##       Balanced Accuracy : 0.45006        
##                                          
##        'Positive' Class : Impaired       
## 
# perform PCA extraction on the new training and testing sets
pc_training_obj <- preProcess(new_training[, -13], method=c('center', 'scale', 'pca'), thresh=0.8)
pc_training_preds <- predict(pc_training_obj, new_training[, -13])
pc_testing_preds <- predict(pc_training_obj, new_testing[, -13])
# compute the model with pca predictors
pca_model <- train(new_training$diagnosis ~ ., data=pc_training_preds, method="glm")
# apply the PCA model on the testing set
pca_result <- confusionMatrix(new_testing[, 13], predict(pca_model, pc_testing_preds))
pca_result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
## 

*Answer: Non-PCA accuracy: 0.65 PCA accuracy: 0.72