Machine Learning Quiz 2

library(AppliedPredictiveModeling)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(Hmisc)

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:caret':
## 
##     cluster

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

Load the AlzheimerDisease and concrete data sets

# Load both example data sets in a single call.
# NOTE(review): AlzheimerDisease presumably supplies the `diagnosis` and
# `predictors` objects used further down, and `concrete` the mixture data
# frame -- confirm against the AppliedPredictiveModeling documentation.
data(AlzheimerDisease, concrete)

# Bin each concrete column into quantile groups with Hmisc::cut2(); the
# resulting factors are used as colour groupings in the plots that follow.
# Cement is split into 10 groups, every other column into 5.
v1 <- cut2(concrete[["Cement"]], g = 10)
v2 <- cut2(concrete[["BlastFurnaceSlag"]], g = 5)
v3 <- cut2(concrete[["FlyAsh"]], g = 5)
v4 <- cut2(concrete[["Water"]], g = 5)
v5 <- cut2(concrete[["Superplasticizer"]], g = 5)
# NOTE(review): v6 bins Superplasticizer a second time, so it is identical to
# v5 -- confirm whether a different column was intended here.
v6 <- cut2(concrete[["Superplasticizer"]], g = 5)
v7 <- cut2(concrete[["CoarseAggregate"]], g = 5)
v8 <- cut2(concrete[["FineAggregate"]], g = 5)
v9 <- cut2(concrete[["Age"]], g = 5)

# Exploratory plots: CompressiveStrength against row index, coloured by each
# binned variable in turn (v9 down to v1), to see whether any of them explains
# a pattern along the row order.
#
# The row index is computed once with seq_len(), which is safe for zero-row
# data and replaces the redundant seq(1:nrow(concrete)) repeated in every call.
row_index <- seq_len(nrow(concrete))

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v9)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v8)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v7)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v6)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v5)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v4)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v3)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v2)) +
  geom_point()

ggplot(concrete, aes(x = row_index, y = CompressiveStrength, colour = v1)) +
  geom_point()

# Fix the RNG seed so the train/test partition below is reproducible.
set.seed(3433)

# Combine the outcome and the predictors into a single data frame.
adData <- data.frame(diagnosis, predictors)

# Put 3/4 of the rows in the training set; the remaining rows form the test
# set.
inTrain <- createDataPartition(adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

# Keep the outcome plus every column whose name starts with "IL".
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
IL <- grep("^IL.*", names(adData), value = TRUE)
trainingIL <- training[, c("diagnosis", IL)]
testingIL <- testing[, c("diagnosis", IL)]


## Including all the predictors
# Fit a glm on the untransformed IL predictors.
model1 <- train(diagnosis ~ ., method = "glm", data = trainingIL)

# confusionMatrix() expects the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two arguments swapped,
# so its table is transposed and its class-wise statistics (sensitivity,
# specificity, prevalence, ...) are computed against the wrong reference.
# Overall accuracy is not affected by the swap.
confusionMatrix(predict(model1, testingIL), testingIL$diagnosis)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        2      20
##   Control         9      51
##                                          
##                Accuracy : 0.6463         
##                  95% CI : (0.533, 0.7488)
##     No Information Rate : 0.8659         
##     P-Value [Acc > NIR] : 1.00000        
##                                          
##                   Kappa : -0.0702        
##  Mcnemar's Test P-Value : 0.06332        
##                                          
##             Sensitivity : 0.18182        
##             Specificity : 0.71831        
##          Pos Pred Value : 0.09091        
##          Neg Pred Value : 0.85000        
##              Prevalence : 0.13415        
##          Detection Rate : 0.02439        
##    Detection Prevalence : 0.26829        
##       Balanced Accuracy : 0.45006        
##                                          
##        'Positive' Class : Impaired       
##

# Including PCA processed predictors
# Fit the PCA on the training predictors only (column 1, diagnosis, is
# dropped), with thresh = 0.8 as the cumulative-variance cutoff.
preProc <- preProcess(trainingIL[, -1], method = "pca", thresh = 0.8)
trainPC <- predict(preProc, trainingIL[, -1])

modelfit <- train(x = trainPC, y = trainingIL$diagnosis, method = "glm")

# Apply the same transformation to the test predictors, dropping the outcome
# column exactly as was done for the training set.
testPC <- predict(preProc, testingIL[, -1])

# confusionMatrix() expects the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two arguments swapped,
# so its table is transposed and its class-wise statistics are computed
# against the wrong reference. Overall accuracy is not affected by the swap.
confusionMatrix(predict(modelfit, testPC), testingIL$diagnosis)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
##

# Alternative way: let train() run the PCA pre-processing itself, with the
# same 0.8 variance threshold passed through trainControl().
model2 <- train(
  diagnosis ~ .,
  method = "glm",
  data = trainingIL,
  preProcess = "pca",
  trControl = trainControl(preProcOptions = list(thresh = 0.8))
)

# confusionMatrix() expects the predicted classes first (`data`) and the true
# classes second (`reference`).
# NOTE: the recorded output below was produced with the two arguments swapped,
# so its table is transposed and its class-wise statistics are computed
# against the wrong reference. Overall accuracy is not affected by the swap.
confusionMatrix(predict(model2, testingIL), testingIL$diagnosis)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Impaired Control
##   Impaired        3      19
##   Control         4      56
##                                           
##                Accuracy : 0.7195          
##                  95% CI : (0.6094, 0.8132)
##     No Information Rate : 0.9146          
##     P-Value [Acc > NIR] : 1.000000        
##                                           
##                   Kappa : 0.0889          
##  Mcnemar's Test P-Value : 0.003509        
##                                           
##             Sensitivity : 0.42857         
##             Specificity : 0.74667         
##          Pos Pred Value : 0.13636         
##          Neg Pred Value : 0.93333         
##              Prevalence : 0.08537         
##          Detection Rate : 0.03659         
##    Detection Prevalence : 0.26829         
##       Balanced Accuracy : 0.58762         
##                                           
##        'Positive' Class : Impaired        
##

Machine Learning Quiz 2

Xiaoping Li

January 27, 2017