library(mlbench)
data(PimaIndiansDiabetes)
dim(PimaIndiansDiabetes)
## [1] 768 9
levels(PimaIndiansDiabetes$diabetes)
## [1] "neg" "pos"
head(PimaIndiansDiabetes)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
summary(PimaIndiansDiabetes)
## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin mass pedigree age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## diabetes
## neg:500
## pos:268
##
##
##
##
library(reshape2)
library(ggplot2)
pima.m = melt(PimaIndiansDiabetes, id.var="diabetes")
ggplot(data=pima.m, aes(x=diabetes, y=value)) + geom_boxplot() +facet_wrap(~variable,ncol = 3)

library(corrplot)
pc = cor(PimaIndiansDiabetes[ ,1:8])
corrplot.mixed(pc)

set.seed(123)
ind = sample(2, nrow(PimaIndiansDiabetes), replace=TRUE, prob=c(0.7, 0.3))
train = PimaIndiansDiabetes[ind==1,]
test = PimaIndiansDiabetes[ind==2,]
str(test)
## 'data.frame': 229 obs. of 9 variables:
## $ pregnant: num 1 1 0 10 4 7 1 3 9 10 ...
## $ glucose : num 85 89 137 115 110 100 115 126 119 125 ...
## $ pressure: num 66 66 40 0 92 0 70 88 80 70 ...
## $ triceps : num 29 23 35 0 0 0 30 41 35 26 ...
## $ insulin : num 0 94 168 0 0 0 96 235 0 115 ...
## $ mass : num 26.6 28.1 43.1 35.3 37.6 30 34.6 39.3 29 31.1 ...
## $ pedigree: num 0.351 0.167 2.288 0.134 0.191 ...
## $ age : num 31 21 33 29 30 32 32 27 29 41 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 1 1 2 1 1 2 2 1 2 2 ...
table(train$diabetes)
##
## neg pos
## 355 184
table(test$diabetes)
##
## neg pos
## 145 84
full.fit = glm(diabetes~., family=binomial, data=train)
summary(full.fit)
##
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3792 -0.7097 -0.4091 0.7182 2.9452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.685620 0.875844 -9.917 < 2e-16 ***
## pregnant 0.092897 0.040677 2.284 0.0224 *
## glucose 0.035882 0.004530 7.921 2.35e-15 ***
## pressure -0.013330 0.006799 -1.960 0.0499 *
## triceps -0.001551 0.008283 -0.187 0.8515
## insulin -0.001467 0.001093 -1.343 0.1793
## mass 0.094193 0.018798 5.011 5.42e-07 ***
## pedigree 0.823112 0.357394 2.303 0.0213 *
## age 0.023416 0.011679 2.005 0.0450 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 692.01 on 538 degrees of freedom
## Residual deviance: 499.21 on 530 degrees of freedom
## AIC: 517.21
##
## Number of Fisher Scoring iterations: 5
confint(full.fit)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -10.480952137 -7.0421224282
## pregnant 0.013635114 0.1734387562
## glucose 0.027303306 0.0450950219
## pressure -0.026893788 -0.0001258701
## triceps -0.017698961 0.0147872351
## insulin -0.003619024 0.0006882973
## mass 0.058463785 0.1322401014
## pedigree 0.127911990 1.5307339152
## age 0.000498035 0.0464727915
exp(coef(full.fit))
## (Intercept) pregnant glucose pressure triceps
## 0.0001689986 1.0973486809 1.0365338256 0.9867587189 0.9984506507
## insulin mass pedigree age
## 0.9985339169 1.0987714667 2.2775766549 1.0236919341
library(car)
vif(full.fit)
## pregnant glucose pressure triceps insulin mass pedigree age
## 1.485457 1.256421 1.199338 1.512798 1.572336 1.257257 1.036254 1.526423
train$probs = predict(full.fit, type="response")
train$probs[1:5]
## [1] 0.73498784 0.78040585 0.14604768 0.06177953 0.73368873
contrasts(train$diabetes)
## pos
## neg 0
## pos 1
train$predict = rep("neg", 539)
train$predict[train$probs>0.5]="pos"
table(train$predict, train$diabetes)
##
## neg pos
## neg 320 76
## pos 35 108
mean(train$predict==train$diabetes)
## [1] 0.7940631
test$prob = predict(full.fit, newdata=test, type="response")
test$predict = rep("neg", 229)
test$predict[test$prob>0.5]="pos"
table(test$predict, test$diabetes)
##
## neg pos
## neg 124 37
## pos 21 47
mean(test$predict==test$diabetes)
## [1] 0.7467249
#Cross-validation coupled with logistic regression
library(bestglm)
## Loading required package: leaps
train$y=rep(0,539)
train$y[train$diabetes=="pos"]=1
head(train[ ,12])
## [1] 1 1 0 1 1 1
pima.cv = train[ ,-9:-11]
head(pima.cv)
## pregnant glucose pressure triceps insulin mass pedigree age y
## 1 6 148 72 35 0 33.6 0.627 50 1
## 3 8 183 64 0 0 23.3 0.672 32 1
## 6 5 116 74 0 0 25.6 0.201 30 0
## 7 3 78 50 32 88 31.0 0.248 26 1
## 9 2 197 70 45 543 30.5 0.158 53 1
## 10 8 125 96 0 0 0.0 0.232 54 1
bestglm(Xy = pima.cv, IC="CV", CVArgs=list(Method="HTF", K=10,
REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## CV(K = 10, REP = 1)
## BICq equivalent for q in (9.38531232528295e-05, 0.0116666667889647)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.63788926 0.722301655 -10.574376 3.917769e-26
## glucose 0.03610392 0.003972768 9.087850 1.010164e-19
## mass 0.07588215 0.016303809 4.654259 3.251480e-06
reduce.fit = glm(diabetes~glucose+mass, family=binomial, data=train)
train$cv.probs = predict(reduce.fit, type="response")
train$cv.predict = rep("neg", 539)
train$cv.predict[train$cv.probs>0.5]="pos"
table(train$cv.predict, train$diabetes)
##
## neg pos
## neg 320 83
## pos 35 101
test$cv.probs = predict(reduce.fit, newdata=test, type="response")
test$predict = rep("neg", 229)
test$predict[test$cv.probs>0.5]="pos"
table(test$predict, test$diabetes)
##
## neg pos
## neg 127 43
## pos 18 41
bestglm(Xy= pima.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.0116666667889647, 0.714450728909061)
## Best Model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.12683206 0.757115665 -10.733937 7.052573e-27
## pregnant 0.12694198 0.032944520 3.853205 1.165818e-04
## glucose 0.03480252 0.003975587 8.754057 2.058167e-18
## mass 0.07985700 0.016718599 4.776537 1.783402e-06
bic.fit=glm(diabetes~pregnant+glucose+mass, family=binomial, data=train)
test$bic.probs = predict(bic.fit, newdata=test, type="response")
test$bic.predict = rep("neg", 229)
test$bic.predict[test$bic.probs>0.5]="pos"
table(test$bic.predict, test$diabetes)
##
## neg pos
## neg 125 40
## pos 20 44
#Discriminant Analyses
#LDA
lda.train = train[ ,-10:-14]
lda.train[1:3,]
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
lda.test = test[ ,-10:-14]
lda.test[1:3,]
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
library(MASS)
lda.fit = lda(diabetes~., data=lda.train)
lda.fit
## Call:
## lda(diabetes ~ ., data = lda.train)
##
## Prior probabilities of groups:
## neg pos
## 0.6586271 0.3413729
##
## Group means:
## pregnant glucose pressure triceps insulin mass pedigree
## neg 3.329577 108.9437 67.91831 19.71268 69.05915 30.25915 0.4315606
## pos 4.858696 141.6196 71.66848 22.61413 108.35326 35.18913 0.5354891
## age
## neg 30.76056
## pos 37.30435
##
## Coefficients of linear discriminants:
## LD1
## pregnant 6.877065e-02
## glucose 2.749623e-02
## pressure -1.116674e-02
## triceps -7.530528e-05
## insulin -1.036254e-03
## mass 6.146317e-02
## pedigree 6.138762e-01
## age 1.867235e-02
plot(lda.fit, type="both")

lda.predict = predict(lda.fit)
train$lda = lda.predict$class
table(train$lda, train$diabetes)
##
## neg pos
## neg 319 76
## pos 36 108
lda.test = predict(lda.fit, newdata = test)
test$lda = lda.test$class
table(test$lda, test$diabetes)
##
## neg pos
## neg 126 38
## pos 19 46
mean(test$lda==test$diabetes)
## [1] 0.7510917
#Quadratic Discriminant Analysis
#QDA
qda.fit = qda(diabetes~., data=lda.train)
qda.fit
## Call:
## qda(diabetes ~ ., data = lda.train)
##
## Prior probabilities of groups:
## neg pos
## 0.6586271 0.3413729
##
## Group means:
## pregnant glucose pressure triceps insulin mass pedigree
## neg 3.329577 108.9437 67.91831 19.71268 69.05915 30.25915 0.4315606
## pos 4.858696 141.6196 71.66848 22.61413 108.35326 35.18913 0.5354891
## age
## neg 30.76056
## pos 37.30435
qda.predict = predict(qda.fit)
train$qda = qda.predict$class
table(train$qda, train$diabetes)
##
## neg pos
## neg 307 72
## pos 48 112
qda.test = predict(qda.fit, newdata=test)
test$qda = qda.test$class
table(test$qda, test$diabetes)
##
## neg pos
## neg 112 34
## pos 33 50
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
poor.fit = glm(diabetes~age, family=binomial, data=test)
test$poor.probs = predict(poor.fit, type="response")
pred.full = prediction(test$prob, test$diabetes)
perf.full = performance(pred.full, "tpr", "fpr")
plot(perf.full, main="ROC", col=1)
pred.bic = prediction(test$bic.probs, test$diabetes)
perf.bic = performance(pred.bic, "tpr", "fpr")
plot(perf.bic, col=2, add=TRUE)
pred.poor = prediction(test$poor, test$diabetes)
perf.poor = performance(pred.poor, "tpr", "fpr")
plot(perf.poor, col=3, add=TRUE)
legend(0.6, 0.6, c("FULL", "BIC", "BAD"),1:3)

auc.full = performance(pred.full, "auc")
auc.full
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.8286535
##
##
## Slot "alpha.values":
## list()
auc.bic = performance(pred.bic, "auc")
auc.bic
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.8144499
##
##
## Slot "alpha.values":
## list()
auc.poor = performance(pred.poor, "auc")
auc.poor
## An object of class "performance"
## Slot "x.name":
## [1] "None"
##
## Slot "y.name":
## [1] "Area under the ROC curve"
##
## Slot "alpha.name":
## [1] "none"
##
## Slot "x.values":
## list()
##
## Slot "y.values":
## [[1]]
## [1] 0.6619869
##
##
## Slot "alpha.values":
## list()