Question 1

Load the Alzheimer’s disease data using the commands:

library(AppliedPredictiveModeling)
data(AlzheimerDisease)

Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?

Answer 1

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
adData = data.frame(diagnosis, predictors)
# p = 0.50 assigns roughly half of the observations to the training set
trainIndex = createDataPartition(diagnosis, p = 0.50, list = FALSE)
training = adData[trainIndex,]
testing = adData[-trainIndex,]
dim(adData)
## [1] 333 131
dim(training)
## [1] 167 131
dim(testing)
## [1] 166 131
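
As a quick sanity check (a minimal sketch using only base R), the two sets should share no rows and together cover the full data set:

# Sanity check: no row appears in both sets, and the split exhausts the data
any(rownames(training) %in% rownames(testing))   # expect FALSE
nrow(training) + nrow(testing) == nrow(adData)   # expect TRUE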

Question 2

Load the cement data using the commands:

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]

Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?

Answer 2

library(Hmisc)    # provides cut2() for binning continuous covariates
library(ggplot2)
library(ggpubr)   # provides ggarrange() for arranging the panels
head(training)
##       Cement BlastFurnaceSlag FlyAsh      Water Superplasticizer
## 1 0.22309440       0.00000000      0 0.06692832      0.001032844
## 2 0.22172039       0.00000000      0 0.06651612      0.001026483
## 3 0.14917003       0.06393001      0 0.10228802      0.000000000
## 4 0.14917003       0.06393001      0 0.10228802      0.000000000
## 5 0.08534961       0.05689974      0 0.08251322      0.000000000
## 6 0.12036199       0.05158371      0 0.10316742      0.000000000
##   CoarseAggregate FineAggregate Age CompressiveStrength
## 1       0.4296633     0.2792811  28               79.99
## 2       0.4331759     0.2775611  28               61.89
## 3       0.4181247     0.2664872 270               40.27
## 4       0.4181247     0.2664872 365               41.05
## 5       0.4204736     0.3547638 360               44.30
## 6       0.4217195     0.3031674  90               47.03
g1 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(Cement, g = 3))) + geom_point()

g2 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(BlastFurnaceSlag, g = 3))) + geom_point()

g3 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(FlyAsh, g = 3))) + geom_point()

g4 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(Water, g = 3))) + geom_point()

g5 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(Superplasticizer, g = 3))) + geom_point()

g6 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(CoarseAggregate, g = 3))) + geom_point()

g7 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(FineAggregate, g = 3))) + geom_point()

g8 <- ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength, colour = cut2(Age, g = 3))) + geom_point()

ggarrange(g1, g2, g3, g4, g5, g6, g7, g8,
          labels = c("A", "B", "C", "D", "E", "F", "G", "H"),
          ncol = 2, nrow = 4)

There is a non-random pattern in the plot of the outcome versus index, and colouring by each predictor shows that no single predictor explains it, suggesting that a variable is missing from the data set.
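
For reference, the eight near-identical calls can also be generated in a loop; a minimal sketch, assuming only the packages already loaded above:

# Build one index-vs-outcome panel per predictor, coloured by its tertiles
vars  <- setdiff(names(training), "CompressiveStrength")
plots <- lapply(vars, function(v) {
  ggplot(training, aes(seq_along(CompressiveStrength), CompressiveStrength,
                       colour = cut2(.data[[v]], g = 3))) +
    geom_point() + labs(x = "Index", colour = v)
})
ggarrange(plotlist = plots, labels = LETTERS[seq_along(plots)], ncol = 2, nrow = 4)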

Question 3

Load the cement data using the commands:

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]

Make a histogram and confirm the Superplasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?

Answer 3

summary(concrete$Superplasticizer)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   6.400   6.205  10.200  32.200
# Raw histogram: strongly right-skewed, with a large spike at zero
g1 <- ggplot(concrete, aes(x = Superplasticizer)) +
  geom_histogram(color="red", fill="salmon")

# log() drops the zeros entirely, since log(0) = -Inf
g2 <- ggplot(concrete, aes(x = log(Superplasticizer))) +
  geom_histogram(color="blue", fill="lightblue")

# log(x + 1) keeps the zeros but leaves the spike at zero intact
g3 <- ggplot(concrete, aes(x = log(Superplasticizer+1))) +
  geom_histogram(color="blue", fill="lightblue")


ggarrange(g1, g2, g3,
          labels = c("A", "B", 'C'),
          ncol = 3, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 379 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The variable contains many zeros, so log() maps those values to -Inf (these are the 379 non-finite rows removed in the warning above); even log(x + 1) leaves them piled up at zero, so the distribution stays skewed.
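
A quick count against the loaded concrete data confirms this:

# Zeros in Superplasticizer: each becomes -Inf under log()
# (should match the 379 non-finite rows reported in the warning above)
sum(concrete$Superplasticizer == 0)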

Question 4

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis, predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[inTrain,]
testing = adData[-inTrain,]

Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?

Answer 4

# Subset the training set to the predictor variables whose names begin with IL
trainingIL <- training[, grep("^IL", names(training))]
# Compute the principal components with prcomp on the scaled IL predictors
PCA <- prcomp(trainingIL, scale. = TRUE)
# Set the desired proportion of variance to capture
varThreshold <- 0.9
# which.max on a logical vector returns the index of the first TRUE,
# i.e. the first component whose cumulative variance exceeds the threshold
which.max(summary(PCA)$importance['Cumulative Proportion',] > varThreshold)
## PC9 
##   9
# Repeat with caret's preProcess (method = "pca"), using the same variance threshold
PCA <- preProcess(trainingIL, 
                  method = "pca", 
                  thresh = varThreshold)
PCA
## Created from 251 samples and 12 variables
## 
## Pre-processing:
##   - centered (12)
##   - ignored (0)
##   - principal component signal extraction (12)
##   - scaled (12)
## 
## PCA needed 9 components to capture 90 percent of the variance
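
Applying the preProcess object with predict() returns the rotated data; a quick check that it has the reported number of component columns:

# The transformed training set should have 9 PC columns
trainingPC <- predict(PCA, trainingIL)
ncol(trainingPC)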

Question 5

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[inTrain,]
testing = adData[-inTrain,]

Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method = "glm" in the train function.

What is the accuracy of each method in the test set? Which is more accurate?

Answer 5

# Keep diagnosis plus the IL predictors in both the training and test sets
trainingIL <- training[, grep("^IL|diagnosis", names(training))]
#head(trainingIL)
testingIL <- testing[, grep("^IL|diagnosis", names(testing))]
#head(testingIL)
# Model 1: glm on the IL predictors as they are
fitAll <- train(diagnosis ~ .,
                method = 'glm',
                data = trainingIL)
confusionMatrix(testingIL$diagnosis, predict(fitAll, testingIL))$overall['Accuracy']
##  Accuracy 
## 0.7560976
#----
# Optional check: centering and scaling alone leave the glm accuracy unchanged
preObj <- preProcess(trainingIL[,-1],
                     method = c('center', 'scale'))
trainingILS <- predict(preObj, trainingIL[,-1])
trainingILS <- cbind(trainingIL[,1],trainingILS)
colnames(trainingILS)[1] <- 'diagnosis'
testingILS <- predict(preObj, testingIL[,-1])
testingILS <- cbind(testingIL[,1],testingILS)
colnames(testingILS)[1] <- 'diagnosis'
fitAllS <- train(diagnosis ~ .,
                method = 'glm',
                data = trainingILS)
confusionMatrix(testingILS$diagnosis, predict(fitAllS, testingILS))$overall['Accuracy']
##  Accuracy 
## 0.7560976
#----
# Model 2: glm on principal components capturing 80% of the variance
preProc <- preProcess(trainingIL,
                      method = 'pca', 
                      thresh =  0.8)
trainingILPCA08 <- predict(preProc,
                           trainingIL)
#head(trainingILPCA08)
testingILPCA08 <- predict(preProc,
                  testingIL)
fitPCA <- train(diagnosis ~ .,
                method = 'glm',
                data = trainingILPCA08)
confusionMatrix(testingIL$diagnosis, predict(fitPCA, testingILPCA08))$overall['Accuracy']
##  Accuracy 
## 0.7195122

On the test set the non-PCA model (accuracy 0.756) is more accurate than the PCA model (accuracy 0.720).
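
Equivalently, train() can perform the PCA step itself via its preProcess argument; a minimal sketch, passing the 0.8 threshold through trainControl's preProcOptions:

# Let train() handle the PCA preprocessing internally
fitPCA2 <- train(diagnosis ~ .,
                 method = 'glm',
                 preProcess = 'pca',
                 trControl = trainControl(preProcOptions = list(thresh = 0.8)),
                 data = trainingIL)
confusionMatrix(testingIL$diagnosis, predict(fitPCA2, testingIL))$overall['Accuracy']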