Machine learning with linear and quadratic discriminant analyses, binary logistic regression,cross validations, model selection, ROC curves and AUC

library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)

## [1] 768   9

levels(PimaIndiansDiabetes$diabetes)

## [1] "neg" "pos"

head(PimaIndiansDiabetes)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

str(PimaIndiansDiabetes)

## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...

summary(PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
##  diabetes 
##  neg:500  
##  pos:268  
##           
##           
##           
##

library(reshape2)
library(ggplot2)
pima.m = melt(PimaIndiansDiabetes, id.var="diabetes")
ggplot(data=pima.m, aes(x=diabetes, y=value)) + geom_boxplot() +facet_wrap(~variable,ncol = 3)

library(corrplot)
pc = cor(PimaIndiansDiabetes[ ,1:8])
corrplot.mixed(pc)

set.seed(123)
ind = sample(2, nrow(PimaIndiansDiabetes), replace=TRUE, prob=c(0.7, 0.3))
train = PimaIndiansDiabetes[ind==1,]
test = PimaIndiansDiabetes[ind==2,]
str(test)

## 'data.frame':    229 obs. of  9 variables:
##  $ pregnant: num  1 1 0 10 4 7 1 3 9 10 ...
##  $ glucose : num  85 89 137 115 110 100 115 126 119 125 ...
##  $ pressure: num  66 66 40 0 92 0 70 88 80 70 ...
##  $ triceps : num  29 23 35 0 0 0 30 41 35 26 ...
##  $ insulin : num  0 94 168 0 0 0 96 235 0 115 ...
##  $ mass    : num  26.6 28.1 43.1 35.3 37.6 30 34.6 39.3 29 31.1 ...
##  $ pedigree: num  0.351 0.167 2.288 0.134 0.191 ...
##  $ age     : num  31 21 33 29 30 32 32 27 29 41 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 2 1 2 2 ...

table(train$diabetes)

## 
## neg pos 
## 355 184

table(test$diabetes)

## 
## neg pos 
## 145  84

full.fit = glm(diabetes~., family=binomial, data=train)
summary(full.fit)

## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3792  -0.7097  -0.4091   0.7182   2.9452  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.685620   0.875844  -9.917  < 2e-16 ***
## pregnant     0.092897   0.040677   2.284   0.0224 *  
## glucose      0.035882   0.004530   7.921 2.35e-15 ***
## pressure    -0.013330   0.006799  -1.960   0.0499 *  
## triceps     -0.001551   0.008283  -0.187   0.8515    
## insulin     -0.001467   0.001093  -1.343   0.1793    
## mass         0.094193   0.018798   5.011 5.42e-07 ***
## pedigree     0.823112   0.357394   2.303   0.0213 *  
## age          0.023416   0.011679   2.005   0.0450 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 692.01  on 538  degrees of freedom
## Residual deviance: 499.21  on 530  degrees of freedom
## AIC: 517.21
## 
## Number of Fisher Scoring iterations: 5

confint(full.fit)

## Waiting for profiling to be done...

##                     2.5 %        97.5 %
## (Intercept) -10.480952137 -7.0421224282
## pregnant      0.013635114  0.1734387562
## glucose       0.027303306  0.0450950219
## pressure     -0.026893788 -0.0001258701
## triceps      -0.017698961  0.0147872351
## insulin      -0.003619024  0.0006882973
## mass          0.058463785  0.1322401014
## pedigree      0.127911990  1.5307339152
## age           0.000498035  0.0464727915

exp(coef(full.fit))

##  (Intercept)     pregnant      glucose     pressure      triceps 
## 0.0001689986 1.0973486809 1.0365338256 0.9867587189 0.9984506507 
##      insulin         mass     pedigree          age 
## 0.9985339169 1.0987714667 2.2775766549 1.0236919341

library(car)
vif(full.fit)

## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
## 1.485457 1.256421 1.199338 1.512798 1.572336 1.257257 1.036254 1.526423

train$probs = predict(full.fit, type="response")
train$probs[1:5]

## [1] 0.73498784 0.78040585 0.14604768 0.06177953 0.73368873

contrasts(train$diabetes)

##     pos
## neg   0
## pos   1

train$predict = rep("neg", 539)
train$predict[train$probs>0.5]="pos"
table(train$predict, train$diabetes)

##      
##       neg pos
##   neg 320  76
##   pos  35 108

mean(train$predict==train$diabetes)

## [1] 0.7940631

test$prob = predict(full.fit, newdata=test, type="response")
test$predict = rep("neg", 229)
test$predict[test$prob>0.5]="pos"
table(test$predict, test$diabetes)

##      
##       neg pos
##   neg 124  37
##   pos  21  47

mean(test$predict==test$diabetes)

## [1] 0.7467249

#Cross-validation coupled with logistic regression
library(bestglm)

## Loading required package: leaps

train$y=rep(0,539)
train$y[train$diabetes=="pos"]=1
head(train[ ,12])

## [1] 1 1 0 1 1 1

pima.cv = train[ ,-9:-11]
head(pima.cv)

##    pregnant glucose pressure triceps insulin mass pedigree age y
## 1         6     148       72      35       0 33.6    0.627  50 1
## 3         8     183       64       0       0 23.3    0.672  32 1
## 6         5     116       74       0       0 25.6    0.201  30 0
## 7         3      78       50      32      88 31.0    0.248  26 1
## 9         2     197       70      45     543 30.5    0.158  53 1
## 10        8     125       96       0       0  0.0    0.232  54 1

bestglm(Xy = pima.cv, IC="CV", CVArgs=list(Method="HTF", K=10,
REP=1), family=binomial)

## Morgan-Tatar search since family is non-gaussian.

## CV(K = 10, REP = 1)
## BICq equivalent for q in (9.38531232528295e-05, 0.0116666667889647)
## Best Model:
##                Estimate  Std. Error    z value     Pr(>|z|)
## (Intercept) -7.63788926 0.722301655 -10.574376 3.917769e-26
## glucose      0.03610392 0.003972768   9.087850 1.010164e-19
## mass         0.07588215 0.016303809   4.654259 3.251480e-06

reduce.fit = glm(diabetes~glucose+mass, family=binomial, data=train)
train$cv.probs = predict(reduce.fit, type="response")
train$cv.predict = rep("neg", 539)
train$cv.predict[train$cv.probs>0.5]="pos"
table(train$cv.predict, train$diabetes)

##      
##       neg pos
##   neg 320  83
##   pos  35 101

test$cv.probs = predict(reduce.fit, newdata=test, type="response")
test$predict = rep("neg", 229)
test$predict[test$cv.probs>0.5]="pos"
table(test$predict, test$diabetes)

##      
##       neg pos
##   neg 127  43
##   pos  18  41

bestglm(Xy= pima.cv, IC="BIC", family=binomial)

## Morgan-Tatar search since family is non-gaussian.

## BIC
## BICq equivalent for q in (0.0116666667889647, 0.714450728909061)
## Best Model:
##                Estimate  Std. Error    z value     Pr(>|z|)
## (Intercept) -8.12683206 0.757115665 -10.733937 7.052573e-27
## pregnant     0.12694198 0.032944520   3.853205 1.165818e-04
## glucose      0.03480252 0.003975587   8.754057 2.058167e-18
## mass         0.07985700 0.016718599   4.776537 1.783402e-06

bic.fit=glm(diabetes~pregnant+glucose+mass, family=binomial, data=train)
test$bic.probs = predict(bic.fit, newdata=test, type="response")
test$bic.predict = rep("neg", 229)
test$bic.predict[test$bic.probs>0.5]="pos"
table(test$bic.predict, test$diabetes)

##      
##       neg pos
##   neg 125  40
##   pos  20  44

#Discriminant Analyses
#LDA

lda.train = train[ ,-10:-14]
lda.train[1:3,]

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

lda.test = test[ ,-10:-14]
lda.test[1:3,]

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos

library(MASS)
lda.fit = lda(diabetes~., data=lda.train)
lda.fit

## Call:
## lda(diabetes ~ ., data = lda.train)
## 
## Prior probabilities of groups:
##       neg       pos 
## 0.6586271 0.3413729 
## 
## Group means:
##     pregnant  glucose pressure  triceps   insulin     mass  pedigree
## neg 3.329577 108.9437 67.91831 19.71268  69.05915 30.25915 0.4315606
## pos 4.858696 141.6196 71.66848 22.61413 108.35326 35.18913 0.5354891
##          age
## neg 30.76056
## pos 37.30435
## 
## Coefficients of linear discriminants:
##                    LD1
## pregnant  6.877065e-02
## glucose   2.749623e-02
## pressure -1.116674e-02
## triceps  -7.530528e-05
## insulin  -1.036254e-03
## mass      6.146317e-02
## pedigree  6.138762e-01
## age       1.867235e-02

plot(lda.fit, type="both")

lda.predict = predict(lda.fit)
train$lda = lda.predict$class
table(train$lda, train$diabetes)

##      
##       neg pos
##   neg 319  76
##   pos  36 108

lda.test = predict(lda.fit, newdata = test)
test$lda = lda.test$class
table(test$lda, test$diabetes)

##      
##       neg pos
##   neg 126  38
##   pos  19  46

mean(test$lda==test$diabetes)

## [1] 0.7510917

#Quadratic Discriminant Analysis
#QDA

qda.fit = qda(diabetes~., data=lda.train)
qda.fit

## Call:
## qda(diabetes ~ ., data = lda.train)
## 
## Prior probabilities of groups:
##       neg       pos 
## 0.6586271 0.3413729 
## 
## Group means:
##     pregnant  glucose pressure  triceps   insulin     mass  pedigree
## neg 3.329577 108.9437 67.91831 19.71268  69.05915 30.25915 0.4315606
## pos 4.858696 141.6196 71.66848 22.61413 108.35326 35.18913 0.5354891
##          age
## neg 30.76056
## pos 37.30435

qda.predict = predict(qda.fit)
train$qda = qda.predict$class
table(train$qda, train$diabetes)

##      
##       neg pos
##   neg 307  72
##   pos  48 112

qda.test = predict(qda.fit, newdata=test)
test$qda = qda.test$class
table(test$qda, test$diabetes)

##      
##       neg pos
##   neg 112  34
##   pos  33  50

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

poor.fit = glm(diabetes~age, family=binomial, data=test)
test$poor.probs = predict(poor.fit, type="response")
pred.full = prediction(test$prob, test$diabetes)
perf.full = performance(pred.full, "tpr", "fpr")
plot(perf.full, main="ROC", col=1)

pred.bic = prediction(test$bic.probs, test$diabetes)
perf.bic = performance(pred.bic, "tpr", "fpr")
plot(perf.bic, col=2, add=TRUE)

pred.poor = prediction(test$poor, test$diabetes)
perf.poor = performance(pred.poor, "tpr", "fpr")
plot(perf.poor, col=3, add=TRUE)
legend(0.6, 0.6, c("FULL", "BIC", "BAD"),1:3)

auc.full = performance(pred.full, "auc")
auc.full

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.8286535
## 
## 
## Slot "alpha.values":
## list()

auc.bic = performance(pred.bic, "auc")
auc.bic

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.8144499
## 
## 
## Slot "alpha.values":
## list()

auc.poor = performance(pred.poor, "auc")
auc.poor

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.6619869
## 
## 
## Slot "alpha.values":
## list()

Machine learning with linear and quadratic discriminant analyses, binary logistic regression,cross validations, model selection, ROC curves and AUC

Kushan De Silva

July 30, 2017