Week two Quiz

Question 1.

Load the Alzheimer’s disease data using the commands:

library(AppliedPredictiveModeling)
data(AlzheimerDisease)

Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?

# Split the Alzheimer's data into two disjoint halves of roughly equal size.
# createDataPartition() comes from caret, which is not attached by the
# commands above, so attach it here.
library(caret)
adData <- data.frame(diagnosis, predictors)
# list = FALSE makes the sampled row numbers come back as a matrix instead of
# a list, so they can be used directly as a row index.
testIndex <- createDataPartition(diagnosis, p = 0.50, list = FALSE)
# Negative indexing keeps exactly the rows that were not sampled, so the two
# sets cannot overlap.
training <- adData[-testIndex, ]
testing <- adData[testIndex, ]

Question 2.

Load the cement data using the commands:

#install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
# Fix the RNG state so the split below is reproducible.
set.seed(1000)
# Sample about three quarters of the rows for training; the first element of
# the returned list holds the selected row numbers.
inTrain <- createDataPartition(
  y = mixtures$CompressiveStrength,
  p = 3 / 4
)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?

# Open the help page for the mixtures data set.
help("mixtures")
## starting httpd help server ... done
# Per-column summary of the training rows.
summary(object = training)
##      Cement        BlastFurnaceSlag       FlyAsh            Water        
##  Min.   :0.04482   Min.   :0.000000   Min.   :0.00000   Min.   :0.05139  
##  1st Qu.:0.08179   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.06972  
##  Median :0.11462   Median :0.009993   Median :0.00000   Median :0.07862  
##  Mean   :0.11782   Mean   :0.032051   Mean   :0.02247   Mean   :0.07774  
##  3rd Qu.:0.14793   3rd Qu.:0.061968   3rd Qu.:0.04999   3rd Qu.:0.08384  
##  Max.   :0.22541   Max.   :0.150339   Max.   :0.08884   Max.   :0.11222  
##  Superplasticizer   CoarseAggregate  FineAggregate         Age        
##  Min.   :0.000000   Min.   :0.3459   Min.   :0.2480   Min.   :  1.00  
##  1st Qu.:0.000000   1st Qu.:0.3986   1st Qu.:0.3113   1st Qu.: 14.00  
##  Median :0.002726   Median :0.4213   Median :0.3305   Median : 28.00  
##  Mean   :0.002608   Mean   :0.4167   Mean   :0.3306   Mean   : 47.46  
##  3rd Qu.:0.004351   3rd Qu.:0.4389   3rd Qu.:0.3542   3rd Qu.: 56.00  
##  Max.   :0.013149   Max.   :0.4798   Max.   :0.4141   Max.   :365.00  
##  CompressiveStrength
##  Min.   : 2.33      
##  1st Qu.:23.71      
##  Median :34.48      
##  Mean   :35.64      
##  3rd Qu.:46.13      
##  Max.   :82.60
# Show the first six rows of the training set (six is head()'s default).
head(training, n = 6L)
##       Cement BlastFurnaceSlag FlyAsh      Water Superplasticizer
## 1 0.22309440       0.00000000      0 0.06692832      0.001032844
## 3 0.14917003       0.06393001      0 0.10228802      0.000000000
## 5 0.08534961       0.05689974      0 0.08251322      0.000000000
## 7 0.17048004       0.04262001      0 0.10228802      0.000000000
## 8 0.17048004       0.04262001      0 0.10228802      0.000000000
## 9 0.12036199       0.05158371      0 0.10316742      0.000000000
##   CoarseAggregate FineAggregate Age CompressiveStrength
## 1       0.4296633     0.2792811  28               79.99
## 3       0.4181247     0.2664872 270               40.27
## 5       0.4204736     0.3547638 360               44.30
## 7       0.4181247     0.2664872 365               43.70
## 8       0.4181247     0.2664872  28               36.45
## 9       0.4217195     0.3031674  28               45.85
# cut2() lives in Hmisc and ggpairs() in GGally; neither package is attached
# by the setup commands above, so load them here.
library(Hmisc)
library(GGally)

# Outcome versus sample index, coloured in turn by each predictor binned into
# (up to) four quantile groups, which is the plot the question asks for.
index_data <- data.frame(
  index = seq_len(nrow(training)),
  CompressiveStrength = training$CompressiveStrength
)
for (predictor in setdiff(names(training), "CompressiveStrength")) {
  index_data$group <- cut2(training[[predictor]], g = 4)
  # ggplot objects are not auto-printed inside a loop, so print explicitly.
  print(
    ggplot2::ggplot(
      index_data,
      ggplot2::aes(x = index, y = CompressiveStrength, colour = group)
    ) +
      ggplot2::geom_point() +
      ggplot2::labs(x = "Index", colour = predictor)
  )
}

# Pairwise overview of all variables. The outcome is binned into three
# quantile groups on a copy of the data so it can be used as the colour
# grouping in ggpairs() without altering `training`.
training2 <- training
training2$CompressiveStrength <- cut2(training2$CompressiveStrength, g = 3)
ggpairs(
  data = training2,
  mapping = ggplot2::aes(colour = CompressiveStrength),
  progress = FALSE,
  axisLabels = "internal"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • There is a non-random pattern in the plot of the outcome versus index.
  • There is a non-random pattern in the plot of the outcome versus index that is perfectly explained by the FlyAsh variable.
  • There is a non-random pattern in the plot of the outcome versus index that is perfectly explained by the Age variable.
  • There is a non-random pattern in the plot of the outcome versus index that does not appear to be perfectly explained by any predictor suggesting a variable may be missing.

Question 3.

Load the cement data using the commands:

library(AppliedPredictiveModeling)
data(concrete)
library(caret)
# Seed the RNG so the partition is the same on every run.
set.seed(1000)
# Row numbers for the training portion (about 75% of the data); the result of
# createDataPartition() is a list, of which the first element is taken.
inTrain <- createDataPartition(
  y = mixtures$CompressiveStrength,
  p = 3 / 4
)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

Make a histogram and confirm the SuperPlasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?

# Lay out three panels side by side: raw values, log, and log(x + 1).
par(mfrow = c(1L, 3L))
# Raw distribution.
hist(training$Superplasticizer)
# Plain log transform of the same column.
hist(log(training$Superplasticizer))
# Shifted log transform, log(x + 1).
hist(log(training$Superplasticizer + 1))

  • The log transform is not a monotone transformation of the data.
  • The SuperPlasticizer data include negative values so the log transform can not be performed.
  • The log transform does not reduce the skewness of the non-zero values of SuperPlasticizer
  • There are a large number of values that are the same and even if you took the log(SuperPlasticizer + 1) they would still all be identical so the distribution would not be symmetric.

Question 4.

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
# Load diagnosis and predictors, then combine them into one data frame.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# Seed the RNG immediately before the random split so it is reproducible.
set.seed(3433)
# About three quarters of the rows go to training; the rest form the test set.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?

# Keep only the columns whose names start with "IL".
trainingIL <- training[, grepl("^IL", names(training))]
# PCA pre-processing, retaining enough components to reach the 90% variance
# threshold.
procTrain <- preProcess(
  trainingIL,
  method = "pca",
  thresh = 0.9
)
# Printing the object reports how many components were needed.
print(procTrain)
## Created from 251 samples and 12 variables
## 
## Pre-processing:
##   - centered (12)
##   - ignored (0)
##   - principal component signal extraction (12)
##   - scaled (12)
## 
## PCA needed 9 components to capture 90 percent of the variance
  1. 5
  2. 7
  3. 10
  4. 9

Question 5.

Load the Alzheimer’s disease data using the commands:

library(caret)
library(AppliedPredictiveModeling)
# Load diagnosis and predictors, then combine them into one data frame.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)
# Seed the RNG immediately before the random split so it is reproducible.
set.seed(3433)
# About three quarters of the rows go to training; the rest form the test set.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method = "glm" in the train() function.

What is the accuracy of each method in the test set? Which is more accurate?

# Keep the IL* predictors together with the diagnosis column, using the same
# name pattern for both the training and the testing set.
trainingIL <- training[, grepl("^IL|diagnosis", names(training))]
testingIL <- testing[, grepl("^IL|diagnosis", names(testing))]

# Model 1: logistic regression on the IL predictors as they are (no PCA).
model <- train(
  diagnosis ~ .,
  data = trainingIL,
  method = "glm"
)
# Predict on the held-out rows and compare against the true diagnosis.
predict_model <- predict(model, newdata = testingIL)
matrix_model <- confusionMatrix(
  data = predict_model,
  reference = testingIL$diagnosis
)
# Test-set accuracy.
matrix_model$overall["Accuracy"]
##  Accuracy 
## 0.6463415
# Model 2: the same glm, but with the predictors replaced by the principal
# components that explain 80% of their variance.
modelPCA <- train(
  diagnosis ~ .,
  data = trainingIL,
  method = "glm",
  preProcess = "pca",
  trControl = trainControl(preProcOptions = list(thresh = 0.8))
)
# Pass predictions as `data` and the true labels as `reference`, matching the
# non-PCA call. The original had them swapped, which leaves accuracy unchanged
# but transposes the table and the sensitivity/specificity figures.
matrix_modelPCA <- confusionMatrix(
  data = predict(modelPCA, newdata = testingIL),
  reference = testingIL$diagnosis
)
# Test-set accuracy.
matrix_modelPCA$overall["Accuracy"]
##  Accuracy 
## 0.7195122
  1. Non-PCA Accuracy: 0.72, PCA Accuracy: 0.71
  2. Non-PCA Accuracy: 0.91, PCA Accuracy: 0.93
  3. Non-PCA Accuracy: 0.65, PCA Accuracy: 0.72
  4. Non-PCA Accuracy: 0.72, PCA Accuracy: 0.65