Load the Alzheimer’s disease data using the commands:
# Attach AppliedPredictiveModeling and load its AlzheimerDisease data set.
# NOTE(review): the code below uses `diagnosis` and `predictors`, presumably
# created by this data() call -- confirm against the package documentation.
library(AppliedPredictiveModeling)
data(AlzheimerDisease)
Which of the following commands will create non-overlapping training and test sets with about 50% of the observations assigned to each?
# Combine the outcome (diagnosis) and the predictors into one data frame.
adData <- data.frame(diagnosis, predictors)
# Draw the row indices for roughly half of the observations (p = 0.50).
# createDataPartition() lives in caret, which is not attached at this point
# in the file, so it is called namespace-qualified.
# list = FALSE returns the indices as a matrix rather than a list.
testIndex <- caret::createDataPartition(diagnosis, p = 0.50, list = FALSE)
# Rows listed in testIndex form the test set; all remaining rows form the
# training set, so the two sets cannot share an observation.
training <- adData[-testIndex, ]
testing <- adData[testIndex, ]
Load the cement data using the commands:
# One-time setup, if the package is missing:
# install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
library(caret)
data(concrete)

# Fix the RNG seed so the partition below is reproducible.
set.seed(1000)

# Row indices of the training partition (p = 3/4 of the rows), drawn on
# CompressiveStrength; [[1]] extracts the index vector from the returned list.
# NOTE(review): `mixtures` is presumably provided by data(concrete) -- confirm.
inTrain <- createDataPartition(
  y = mixtures$CompressiveStrength,
  p = 3 / 4
)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]
Make a plot of the outcome (CompressiveStrength) versus the index of the samples. Color by each of the variables in the data set (you may find the cut2() function in the Hmisc package useful for turning continuous covariates into factors). What do you notice in these plots?
# Open the help page for `mixtures` (interactive use; the next line is the
# echoed console output).
?mixtures
## starting httpd help server ... done
# Per-column summary statistics of the training set; the output follows.
summary(training)
## Cement BlastFurnaceSlag FlyAsh Water
## Min. :0.04482 Min. :0.000000 Min. :0.00000 Min. :0.05139
## 1st Qu.:0.08179 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.06972
## Median :0.11462 Median :0.009993 Median :0.00000 Median :0.07862
## Mean :0.11782 Mean :0.032051 Mean :0.02247 Mean :0.07774
## 3rd Qu.:0.14793 3rd Qu.:0.061968 3rd Qu.:0.04999 3rd Qu.:0.08384
## Max. :0.22541 Max. :0.150339 Max. :0.08884 Max. :0.11222
## Superplasticizer CoarseAggregate FineAggregate Age
## Min. :0.000000 Min. :0.3459 Min. :0.2480 Min. : 1.00
## 1st Qu.:0.000000 1st Qu.:0.3986 1st Qu.:0.3113 1st Qu.: 14.00
## Median :0.002726 Median :0.4213 Median :0.3305 Median : 28.00
## Mean :0.002608 Mean :0.4167 Mean :0.3306 Mean : 47.46
## 3rd Qu.:0.004351 3rd Qu.:0.4389 3rd Qu.:0.3542 3rd Qu.: 56.00
## Max. :0.013149 Max. :0.4798 Max. :0.4141 Max. :365.00
## CompressiveStrength
## Min. : 2.33
## 1st Qu.:23.71
## Median :34.48
## Mean :35.64
## 3rd Qu.:46.13
## Max. :82.60
# Show the first rows of the training set; the output follows.
head(training)
## Cement BlastFurnaceSlag FlyAsh Water Superplasticizer
## 1 0.22309440 0.00000000 0 0.06692832 0.001032844
## 3 0.14917003 0.06393001 0 0.10228802 0.000000000
## 5 0.08534961 0.05689974 0 0.08251322 0.000000000
## 7 0.17048004 0.04262001 0 0.10228802 0.000000000
## 8 0.17048004 0.04262001 0 0.10228802 0.000000000
## 9 0.12036199 0.05158371 0 0.10316742 0.000000000
## CoarseAggregate FineAggregate Age CompressiveStrength
## 1 0.4296633 0.2792811 28 79.99
## 3 0.4181247 0.2664872 270 40.27
## 5 0.4204736 0.3547638 360 44.30
## 7 0.4181247 0.2664872 365 43.70
## 8 0.4181247 0.2664872 28 36.45
## 9 0.4217195 0.3031674 28 45.85
# Work on a copy so `training` keeps its numeric CompressiveStrength column.
training2 <- training
# Cut CompressiveStrength into 3 groups (g = 3) so it can be used as a
# discrete colour aesthetic in ggpairs.
# cut2() (Hmisc) and ggpairs() (GGally) are namespace-qualified because no
# library() call for either package is visible in this file.
training2$CompressiveStrength <- Hmisc::cut2(
  training2$CompressiveStrength,
  g = 3
)
# Pairwise plot of all columns, coloured by the binned outcome.
GGally::ggpairs(
  data = training2,
  mapping = ggplot2::aes(colour = CompressiveStrength),
  progress = FALSE,
  axisLabels = "internal"
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Load the cement data using the commands:
library(AppliedPredictiveModeling)
library(caret)
data(concrete)

# Fix the RNG seed so the partition below is reproducible.
set.seed(1000)

# Row indices of the training partition (p = 3/4 of the rows), drawn on
# CompressiveStrength; [[1]] extracts the index vector from the returned list.
# NOTE(review): `mixtures` is presumably provided by data(concrete) -- confirm.
inTrain <- createDataPartition(
  y = mixtures$CompressiveStrength,
  p = 3 / 4
)[[1]]
training <- mixtures[inTrain, ]
testing <- mixtures[-inTrain, ]
Make a histogram and confirm the Superplasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?
# Lay out three histograms side by side. par() returns the previous settings,
# which are kept so the layout can be restored afterwards instead of leaking
# into later plots.
old_par <- par(mfrow = c(1, 3))
# Raw values.
hist(training$Superplasticizer)
# Plain log transform. Superplasticizer contains zeros (its minimum is 0 in
# the summary output earlier in this file) and log(0) is -Inf, so this
# transform is undefined for those observations.
hist(log(training$Superplasticizer))
# log(x + 1) stays finite at zero.
hist(log(training$Superplasticizer + 1))
# Restore the graphics settings that were active before this block.
par(old_par)
Load the Alzheimer’s disease data using the commands:
library(caret)
library(AppliedPredictiveModeling)

# Seed the RNG before partitioning so the split is reproducible.
set.seed(3433)
data(AlzheimerDisease)

# Outcome and predictors in a single data frame.
adData <- data.frame(diagnosis, predictors)

# Training partition drawn on diagnosis with p = 3/4; the remaining rows
# become the test set. [[1]] extracts the index vector from the returned list.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?
# Keep only the training columns whose names start with "IL".
trainingIL <- training[, grepl("^IL", names(training))]
# PCA pre-processing with a variance threshold of 0.9.
procTrain <- preProcess(x = trainingIL, method = "pca", thresh = 0.9)
# Print the pre-processing summary; it reports how many components were needed.
procTrain
## Created from 251 samples and 12 variables
##
## Pre-processing:
## - centered (12)
## - ignored (0)
## - principal component signal extraction (12)
## - scaled (12)
##
## PCA needed 9 components to capture 90 percent of the variance
Load the Alzheimer’s disease data using the commands:
library(caret)
library(AppliedPredictiveModeling)

# Seed the RNG before partitioning so the split is reproducible.
set.seed(3433)
data(AlzheimerDisease)

# Outcome and predictors in a single data frame.
adData <- data.frame(diagnosis, predictors)

# Training partition drawn on diagnosis with p = 3/4; the remaining rows
# become the test set. [[1]] extracts the index vector from the returned list.
inTrain <- createDataPartition(y = adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]
Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method = "glm" in the train function.
What is the accuracy of each method in the test set? Which is more accurate?
# Restrict the training and testing sets to the IL* columns plus diagnosis.
trainingIL <- training[, grepl("^IL|diagnosis", names(training))]
testingIL <- testing[, grepl("^IL|diagnosis", names(testing))]

# Model 1: glm on the predictors as they are (no PCA).
model <- train(diagnosis ~ ., data = trainingIL, method = "glm")
predict_model <- predict(model, newdata = testingIL)

# Compare test-set predictions against the observed diagnosis.
matrix_model <- confusionMatrix(
  data = predict_model,
  reference = testingIL$diagnosis
)
# Test-set accuracy (the first element of `overall`).
matrix_model$overall["Accuracy"]
## Accuracy
## 0.6463415
# Model 2: glm on principal components. preProcess = "pca" applies PCA inside
# train(), and preProcOptions sets the variance threshold to 0.8.
modelPCA <- train(
  diagnosis ~ .,
  data = trainingIL,
  method = "glm",
  preProcess = "pca",
  trControl = trainControl(preProcOptions = list(thresh = 0.8))
)
predict_modelPCA <- predict(modelPCA, newdata = testingIL)
# Predictions are passed first and the observed diagnosis second, matching the
# non-PCA model's confusionMatrix() call. With the arguments reversed the
# accuracy is the same, but the table is transposed and the class-wise
# statistics are computed against the wrong reference.
matrix_modelPCA <- confusionMatrix(predict_modelPCA, testingIL$diagnosis)
# Test-set accuracy (the first element of `overall`).
matrix_modelPCA$overall[1]
## Accuracy
## 0.7195122