Question 1

Load the Alzheimer’s disease data using the commands below. Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?

# Question 1: split the Alzheimer's data into two disjoint halves.
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2

# Combine the outcome and the predictors into a single data frame.
adData <- data.frame(diagnosis, predictors)

# Row indices for the training half; list = FALSE returns them as a matrix
# that can be used directly for row subsetting.
trainIndex <- createDataPartition(y = diagnosis, p = 0.5, list = FALSE)

# Rows selected by trainIndex form the training set; all remaining rows form
# the test set, so the two sets cannot overlap.
training <- adData[trainIndex, ]
testing <- adData[-trainIndex, ]

Question 2

Load the cement data using the commands:

# Question 2 setup: 75/25 split of the concrete mixtures data.
library(AppliedPredictiveModeling)
data(concrete)
library(caret)

# Fix the RNG state so the partition below is reproducible.
set.seed(1000)

# createDataPartition() returns a list by default; take its first element
# to get the vector of training row indices.
inTrain <- createDataPartition(y = mixtures$CompressiveStrength, p = 0.75)[[1]]

training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.2.5
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

# Outcome against the original row index, coloured by groups of the outcome
# itself (cut2 with g = 10 bins the values into up to 10 quantile groups).
plot(x = inTrain, y = training$CompressiveStrength, col = cut2(training$CompressiveStrength, g = 10))

# Every column of the training set except the outcome is a predictor.
predictor_names <- setdiff(colnames(training), "CompressiveStrength")

# Position of each sample within the training set.
index <- seq_len(nrow(training))

# One scatter plot of the outcome versus sample index per predictor, coloured
# by that predictor's cut2() groups.
for (predictor in predictor_names) {
  # Compute the grouping outside aes() and carry it in the plot data, rather
  # than referencing training$... from inside the aesthetic mapping.
  plot_data <- data.frame(
    index = index,
    CompressiveStrength = training$CompressiveStrength,
    group = cut2(training[[predictor]], g = 10)
  )

  # print() is required: ggplot objects are not auto-printed inside a loop.
  print(
    ggplot(data = plot_data, aes(x = index, y = CompressiveStrength)) +
      geom_point(aes(colour = group)) +
      labs(colour = predictor)
  )
}

Question 3

Load the cement data using the commands:

# Question 3: distribution of Superplasticizer in the training set.
library(AppliedPredictiveModeling)
data(concrete)
library(caret)

# Same seed and 75/25 partition as used for the other concrete questions.
set.seed(1000)
inTrain <- createDataPartition(y = mixtures$CompressiveStrength, p = 0.75)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]

# Histogram of the raw values.
hist(training$Superplasticizer, col = "blue", breaks = 20)

# Histogram after a log transform; 1 is added before taking the log.
hist(log(training$Superplasticizer + 1), col = "red", breaks = 20)

Question 4

Load the Alzheimer’s disease data using the commands:

# Question 4 setup: 75/25 split of the Alzheimer's data.
library(caret)
library(AppliedPredictiveModeling)

# Load the data and join the outcome with its predictors.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)

# Seed the RNG immediately before the only random step.
set.seed(3433)
inTrain <- createDataPartition(y = adData$diagnosis, p = 0.75)[[1]]

training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 80% of the variance. How many are there?

# Keep only the training columns whose names start with "IL".
subset4 <- training[, grepl("^IL", names(training))]

# Fit the PCA pre-processing with an 80% variance threshold and show its
# summary, which reports how many components were needed.
print(preProcess(subset4, method = "pca", thresh = 0.8))
## Created from 251 samples and 12 variables
## 
## Pre-processing:
##   - centered (12)
##   - ignored (0)
##   - principal component signal extraction (12)
##   - scaled (12)
## 
## PCA needed 7 components to capture 80 percent of the variance

Question 5

Load the Alzheimer’s disease data using the commands:

# Question 5 setup: rebuild the same 75/25 Alzheimer's split.
library(caret)
library(AppliedPredictiveModeling)

data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)

# Seeding right before the partition keeps the split reproducible.
set.seed(3433)
inTrain <- createDataPartition(y = adData$diagnosis, p = 0.75)[[1]]

# Partitioned rows go to training; everything else goes to testing.
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method = "glm" in the train function. What is the accuracy of each method in the test set? Which is more accurate?

# Predictors whose names start with "IL", for both the training and test sets.
subsettrain5 <- training[, grep("^IL", names(training))]
subsettest5 <- testing[, grep("^IL", names(testing))]

# Fit the PCA transform (80% variance threshold) on the training predictors
# only, then apply that same fitted transform to both sets.
pcaData <- preProcess(subsettrain5, thresh = 0.8, method = "pca")
trainTransformed <- predict(pcaData, subsettrain5)
testTransformed <- predict(pcaData, subsettest5)

# Attach the outcome after the transform so it is not part of the PCA input.
subsettrain5$diagnosis <- training$diagnosis
subsettest5$diagnosis <- testing$diagnosis
trainTransformed$diagnosis <- training$diagnosis
testTransformed$diagnosis <- testing$diagnosis

# Two glm models: one on the principal components, one on the raw predictors.
# NOTE(review): the object name `glm` is the same as the stats::glm function;
# it is kept so existing references to it continue to work.
glmpca <- train(diagnosis ~ ., data = trainTransformed, method = "glm")
glm <- train(diagnosis ~ ., data = subsettrain5, method = "glm")

# Test-set accuracy: proportion of test samples whose predicted class equals
# the recorded diagnosis.
accuracy_pca <- mean(predict(glmpca, newdata = testTransformed) == testing$diagnosis)
accuracy_non_pca <- mean(predict(glm, newdata = subsettest5) == testing$diagnosis)

# Show both accuracies side by side for comparison.
c(pca = accuracy_pca, non_pca = accuracy_non_pca)