Practical Machine Learning Week 2 Quiz

Course - https://www.coursera.org/learn/practical-machine-learning/home/week/2

Discussion - https://github.com/topepo/caret/issues/521

Quiz 1

Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?

## Split the Alzheimer data into two non-overlapping halves.
library(caret)
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
## p = 0.50 puts about half of the rows in the training index;
## list = FALSE returns the index as a matrix rather than a list.
trainIndex <- createDataPartition(diagnosis, p = 0.50, list = FALSE)
## Rows not selected for training become the test set.
training <- adData[trainIndex, ]
testing <- adData[-trainIndex, ]

Quiz 2

Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?

##suppressPackageStartupMessages(library(caret))
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
## Take the first (only) resample returned by createDataPartition as the
## training row index; 3/4 of the rows go to training.
inTrain <- createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

library(Hmisc)
## Bin the outcome into 10-unit intervals for colouring, but keep the bins in
## their own variable so the numeric CompressiveStrength column in the
## training set is not overwritten by a factor.
strengthBins <- cut2(training$CompressiveStrength, seq(0, 100, by = 10))
## featurePlot() documents x as continuous feature data, so the outcome is
## left out of x and its binned version is passed as the grouping variable y.
predictorCols <- setdiff(names(training), "CompressiveStrength")
featurePlot(x = training[, predictorCols], y = strengthBins, plot = "pairs")

Quiz 3

Make a histogram and confirm the Superplasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
## Same 3/4 training split as in the previous question.
inTrain <- createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

## Scatter of log(Superplasticizer + 1) against the outcome.
qplot(CompressiveStrength, log(Superplasticizer + 1), data = training)

## Histogram of the log-transformed variable; the + 1 keeps zero values finite.
hist(log(training$Superplasticizer + 1))

Quiz 4

Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 80% of the variance. How many are there?

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
## Combine the outcome and the predictors into a single data frame.
adData <- data.frame(diagnosis, predictors)
## 3/4 of the rows, partitioned on diagnosis, form the training set.
inTrain <- createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

## Peek at the first few column names.
head(colnames(training))

## [1] "diagnosis"                       "ACE_CD143_Angiotensin_Converti" 
## [3] "ACTH_Adrenocorticotropic_Hormon" "AXL"                            
## [5] "Adiponectin"                     "Alpha_1_Antichymotrypsin"

## Logical mask of the columns whose names start with "IL_", then preview them.
ilCols <- grepl("^IL_", names(training))
head(training[ilCols])

##      IL_11    IL_13    IL_16   IL_17E IL_1alpha      IL_3     IL_4
## 2 4.936704 1.269463 2.876338 6.705891 -8.047190 -3.912023 2.397895
## 4 6.223931 1.307549 2.441056 4.695848 -7.600902 -4.268698 1.481605
## 5 7.070709 1.309980 4.736472 4.204987 -6.943657 -2.995732 2.708050
## 6 6.103215 1.282549 2.671032 3.637051 -8.180721 -3.863233 1.208960
## 7 2.031412 1.286356 3.476091 6.705891 -6.907755 -3.296837 1.871802
## 8 5.180840 1.293295 3.593860 4.037285 -7.418581 -2.956512 2.397895
##          IL_5        IL_6 IL_6_Receptor     IL_7     IL_8
## 2  0.69314718  0.09622438    0.43115645 3.705506 1.675557
## 4  0.78845736 -0.37116408    0.57519641 2.336211 1.719944
## 5  1.16315081 -0.07204658    0.09668586 4.287562 1.764298
## 6 -0.40047757  0.18568645   -0.51727788 2.776394 1.708270
## 7  0.83290912  0.09622438    0.43115645 4.009916 1.698489
## 8 -0.09431068  1.00562217   -0.60969274 3.705506 1.701858

## Non-PCA
## Logistic regression (glm) on the twelve IL_ predictors, centred and scaled
## by train() before fitting.
modelFit <- train(
  diagnosis ~ IL_11 + IL_13 + IL_16 + IL_17E + IL_1alpha + IL_3 + IL_4 +
    IL_5 + IL_6 + IL_6_Receptor + IL_7 + IL_8,
  data = training,
  preProcess = c("center", "scale"),
  method = "glm"
)
modelFit

## Generalized Linear Model 
## 
## 251 samples
##  12 predictor
##   2 classes: 'Impaired', 'Control' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 251, 251, 251, 251, 251, 251, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7048613  0.0725562

## Position of the outcome column within the training data frame.
which(names(training) == "diagnosis")

## [1] 1

## PCA
## Centre, scale and run PCA on the IL_ columns only, keeping enough
## components to reach the 80% variance threshold.
ilPredictors <- training[grepl("^IL_", names(training))]
preProc <- preProcess(
  ilPredictors,
  method = c("center", "scale", "pca"),
  thresh = 0.8
)
preProc

## Created from 251 samples and 12 variables
## 
## Pre-processing:
##   - centered (12)
##   - ignored (0)
##   - principal component signal extraction (12)
##   - scaled (12)
## 
## PCA needed 7 components to capture 80 percent of the variance

Quiz 5

Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method="glm" in the train function.

What is the accuracy of each method in the test set? Which is more accurate?

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
## 3/4 of the rows, partitioned on diagnosis, form the training set.
inTrain <- createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

## Keep only the outcome plus the columns whose names start with "IL".
trainILNames <- grep("^IL", colnames(training), value = TRUE)
training_IL <- training[c("diagnosis", trainILNames)]
testILNames <- grep("^IL", colnames(testing), value = TRUE)
testing_IL <- testing[c("diagnosis", testILNames)]

## PCA
## Fit centring, scaling and PCA on the training predictors only (the first
## column, diagnosis, is dropped), keeping components up to 80% of the variance.
preProc <- preProcess(training_IL[, -1], method = c("center", "scale", "pca"), thresh = 0.8)
## Apply the transformation learned on the training set to both sets.
trainPC <- predict(preProc, training_IL)
testPC <- predict(preProc, testing_IL)
modelFit <- train(diagnosis ~ ., method = "glm", data = trainPC)
## confusionMatrix() takes the predicted classes first and the true classes
## second. The recorded output below was produced with the two swapped:
## Accuracy, Kappa and McNemar's test are unaffected, but its table is
## transposed and Sensitivity/Specificity, the predictive values, Prevalence
## and the No Information Rate there describe the reversed roles.
confusionMatrix(predict(modelFit, testPC), testing_IL$diagnosis) ## Accuracy : 0.7195

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
##

## Non-PCA
## Logistic regression on the untransformed IL predictors.
modelFit2 <- train(diagnosis ~ ., method = "glm", data = training_IL)
## confusionMatrix() takes the predicted classes first and the true classes
## second. The recorded output below was produced with the two swapped:
## Accuracy and Kappa are unaffected, but its table is transposed and the
## class-wise statistics there describe the reversed roles.
confusionMatrix(predict(modelFit2, testing_IL), testing_IL$diagnosis) ## Accuracy : 0.6463

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        2      20
##   Control         9      51
##                                          
##                Accuracy : 0.6463         
##                  95% CI : (0.533, 0.7488)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1.00000        
##                                          
##                   Kappa : -0.0702        
##  Mcnemar's Test P-Value : 0.06332        
##                                          
##             Sensitivity : 0.18182        
##             Specificity : 0.71831        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.85000        
##              Prevalence : 0.13415        
##          Detection Rate : 0.02439        
##    Detection Prevalence : 0.26829        
##       Balanced Accuracy : 0.45006        
##                                          
##        'Positive' Class : Impaired       
##