Diabetes Risk Prediction Model

Nir Regev
Principal Data Scientist
Sisense Ltd.

April 16, 2016

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
library(Formula)
library(e1071)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)

Starting with Sisense Diabetes Data Widget

Sisense Diabetes Widget Attribute Information:

  1. Number of times pregnant
  2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
  3. Diastolic blood pressure (mm Hg)
  4. Triceps skin fold thickness (mm)
  5. 2-Hour serum insulin (mu U/ml)
  6. Body mass index (weight in kg/(height in m)^2)
  7. Diabetes pedigree function
  8. Age (years)
  9. Class variable (0 or 1)

A Generic Model For Diabetes Classificaiton

setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)
## The following object is masked from package:datasets:
## 
##     pressure
args <- lapply(PimaIndiansDiabetes, list)

Fitting a logistic regression to assess predictors importance

PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel  
## 
## Call:  glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Coefficients:
## (Intercept)     pregnant      glucose     pressure      triceps  
##   -8.404696     0.123182     0.035164    -0.013296     0.000619  
##     insulin         mass     pedigree          age  
##   -0.001192     0.089701     0.945180     0.014869  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  759 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.4     AIC: 741.4

Filtering the most important predictors from GLM model

summary(logRegModel)
## 
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
  # features selection - n highest logistic model coefficients
coeff.list <- exp(coef(logRegModel))[2:ncol(PimaIndiansDiabetes)]
coeff.list <- coeff.list[c(order(abs(coeff.list),decreasing=TRUE)[1:(ncol(PimaIndiansDiabetes)-1)])]
predictors.names <- c(names(coeff.list),names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)])
  
# filter df with n most important predictors
PimaIndiansDiabetes.proj <- PimaIndiansDiabetes[,c(predictors.names)]

Dimension Reduction and Faeature Selstion (PCA)

PimaIndiansDiabetes.pca <- princomp(PimaIndiansDiabetes[,1:(length(PimaIndiansDiabetes)-1)],
                                    center = TRUE,
                                    scale. = TRUE)
## Warning: In princomp.default(PimaIndiansDiabetes[, 1:(length(PimaIndiansDiabetes) - 
##     1)], center = TRUE, scale. = TRUE) :
##  extra arguments 'center', 'scale.' will be disregarded
View(PimaIndiansDiabetes.pca$scores)
target.numeric <- ifelse (PimaIndiansDiabetes$diabetes == "neg", 0 ,1)
pca.sisense.result <- data.frame(PimaIndiansDiabetes.pca$scores[,1:2],target.numeric)
x <- pca.sisense.result$Comp.1
y <- pca.sisense.result$Comp.2
pca.sisense.result$class <- as.factor(pca.sisense.result$target.numeric)
ggplot(pca.sisense.result, aes(Comp.1, Comp.2)) + 
  geom_point(aes(colour = class)) 

Partitioning the data to Training/Testing sets

names(PimaIndiansDiabetes.proj)
## [1] "pedigree" "pregnant" "mass"     "glucose"  "age"      "triceps" 
## [7] "insulin"  "pressure" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]

formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))

Fitting a classificaiton tree

ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
    width=12,
    height=10,
    pointsize=12)
  plot(as.simpleparty(ct))
  dev.off()
## png 
##   2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )
##                  
## predictions.class neg pos
##               neg 122  20
##               pos  28  60
## confusion matrix stats on testing set
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 122  28
##        pos  20  60
##                                          
##                Accuracy : 0.7913         
##                  95% CI : (0.733, 0.8419)
##     No Information Rate : 0.6174         
##     P-Value [Acc > NIR] : 1.155e-08      
##                                          
##                   Kappa : 0.5505         
##  Mcnemar's Test P-Value : 0.3123         
##                                          
##             Sensitivity : 0.8592         
##             Specificity : 0.6818         
##          Pos Pred Value : 0.8133         
##          Neg Pred Value : 0.7500         
##              Prevalence : 0.6174         
##          Detection Rate : 0.5304         
##    Detection Prevalence : 0.6522         
##       Balanced Accuracy : 0.7705         
##                                          
##        'Positive' Class : neg            
## 
roc_pred <- prediction(predictions.probs[,2], testing$diabetes)
#png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))

#dev.off()
Sisense Diabetes Widget

Sisense Diabetes Widget

To optimize prediction accuracy - Ensemble Models

fit.rf <-randomForest(diabetes ~ ., data=training)
print(fit.rf)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.84%
## Confusion matrix:
##     neg pos class.error
## neg 299  51   0.1457143
## pos  88 100   0.4680851
#importance(fit.rf)
rf.predictions.class <- predict(fit.rf, testing)
rf.predictions.probs <- predict(fit.rf, testing,type = c("prob"))
cm.rf <- confusionMatrix(testing$diabetes, rf.predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm.rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 134  16
##        pos  30  50
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7424, 0.8497)
##     No Information Rate : 0.713           
##     P-Value [Acc > NIR] : 0.001687        
##                                           
##                   Kappa : 0.5404          
##  Mcnemar's Test P-Value : 0.055270        
##                                           
##             Sensitivity : 0.8171          
##             Specificity : 0.7576          
##          Pos Pred Value : 0.8933          
##          Neg Pred Value : 0.6250          
##              Prevalence : 0.7130          
##          Detection Rate : 0.5826          
##    Detection Prevalence : 0.6522          
##       Balanced Accuracy : 0.7873          
##                                           
##        'Positive' Class : neg             
## 
roc_pred.rf <- prediction(rf.predictions.probs[,2], testing$diabetes)
plot(performance(roc_pred.rf, measure="sens", x.measure="spec"), colorize=TRUE)
rf.auc.perf = performance(roc_pred.rf, measure = "auc")
title(unlist(rf.auc.perf@y.values))

- an AUC improvement of

unlist(rf.auc.perf@y.values) - unlist(auc.perf@y.values)
## [1] 0.01833333

NOT BAD !

Returning diabetes risk probabilities to sisense

predictions.probs <- predict(ct, training,type = c("prob"))
predictions.probs[,2]
##          2          4          5          6          7         11 
## 0.28125000 0.13274336 0.59036145 0.02150538 0.13274336 0.28125000 
##         12         13         14         18         19         20 
## 0.78571429 0.23076923 0.78571429 0.28125000 0.28125000 0.28125000 
##         21         22         23         24         27         28 
## 0.13274336 0.28125000 0.78571429 0.28125000 0.59036145 0.02150538 
##         30         31         32         33         34         35 
## 0.28125000 0.28125000 0.78571429 0.02150538 0.02150538 0.28125000 
##         36         37         39         40         41         42 
## 0.02150538 0.59036145 0.13274336 0.66666667 0.78571429 0.59036145 
##         44         45         46         47         48         49 
## 0.78571429 0.78571429 0.78571429 0.23076923 0.13274336 0.28125000 
##         51         52         54         55         56         60 
## 0.02150538 0.02150538 0.78571429 0.59036145 0.02150538 0.13274336 
##         61         62         64         66         68         70 
## 0.02150538 0.59036145 0.23076923 0.28125000 0.66666667 0.23076923 
##         71         72         73         74         77         78 
## 0.66666667 0.23076923 0.66666667 0.59036145 0.28125000 0.13274336 
##         79         80         81         85         87         88 
## 0.59036145 0.02150538 0.02150538 0.59036145 0.28125000 0.13274336 
##         89         91         92         93         94         95 
## 0.59036145 0.02150538 0.28125000 0.28125000 0.23076923 0.23076923 
##         96        100        102        103        104        105 
## 0.59036145 0.28125000 0.23076923 0.02150538 0.13274336 0.13274336 
##        107        108        111        113        114        115 
## 0.02150538 0.23076923 0.78571429 0.13274336 0.13274336 0.78571429 
##        116        117        118        119        121        123 
## 0.59036145 0.28125000 0.13274336 0.13274336 0.78571429 0.13274336 
##        125        126        127        129        130        131 
## 0.13274336 0.13274336 0.28125000 0.28125000 0.66666667 0.78571429 
##        134        136        138        140        141        142 
## 0.28125000 0.28125000 0.13274336 0.28125000 0.23076923 0.28125000 
##        144        145        146        147        148        149 
## 0.28125000 0.59036145 0.02150538 0.28125000 0.66666667 0.59036145 
##        150        151        152        153        154        155 
## 0.13274336 0.59036145 0.02150538 0.78571429 0.59036145 0.78571429 
##        156        157        158        159        160        163 
## 0.59036145 0.02150538 0.02150538 0.13274336 0.78571429 0.13274336 
##        166        167        168        169        171        175 
## 0.66666667 0.59036145 0.66666667 0.28125000 0.28125000 0.28125000 
##        176        177        178        180        185        187 
## 0.78571429 0.28125000 0.59036145 0.59036145 0.23076923 0.78571429 
##        188        189        190        192        194        195 
## 0.59036145 0.66666667 0.59036145 0.28125000 0.59036145 0.02150538 
##        196        197        198        200        202        204 
## 0.78571429 0.02150538 0.02150538 0.59036145 0.59036145 0.02150538 
##        205        206        207        209        210        211 
## 0.28125000 0.02150538 0.78571429 0.13274336 0.78571429 0.13274336 
##        214        215        216        217        218        219 
## 0.59036145 0.28125000 0.59036145 0.13274336 0.28125000 0.66666667 
##        222        223        224        225        227        229 
## 0.78571429 0.02150538 0.23076923 0.02150538 0.13274336 0.78571429 
##        230        231        234        235        236        237 
## 0.13274336 0.59036145 0.28125000 0.13274336 0.78571429 0.78571429 
##        239        240        242        245        246        247 
## 0.78571429 0.02150538 0.13274336 0.59036145 0.78571429 0.28125000 
##        248        249        250        251        252        253 
## 0.78571429 0.28125000 0.13274336 0.28125000 0.23076923 0.02150538 
##        256        257        258        260        261        262 
## 0.13274336 0.28125000 0.13274336 0.78571429 0.78571429 0.59036145 
##        263        264        265        266        267        268 
## 0.13274336 0.59036145 0.28125000 0.66666667 0.59036145 0.59036145 
##        270        271        272        276        277        279 
## 0.23076923 0.66666667 0.02150538 0.13274336 0.28125000 0.02150538 
##        280        281        282        284        285        286 
## 0.02150538 0.59036145 0.59036145 0.78571429 0.28125000 0.23076923 
##        287        288        289        291        292        293 
## 0.78571429 0.66666667 0.02150538 0.13274336 0.13274336 0.59036145 
##        296        297        299        300        301        302 
## 0.59036145 0.23076923 0.28125000 0.02150538 0.78571429 0.59036145 
##        303        304        306        307        308        309 
## 0.28125000 0.28125000 0.28125000 0.78571429 0.23076923 0.59036145 
##        311        312        314        318        322        324 
## 0.02150538 0.13274336 0.13274336 0.78571429 0.13274336 0.23076923 
##        326        327        328        329        331        332 
## 0.78571429 0.66666667 0.78571429 0.13274336 0.02150538 0.13274336 
##        334        335        336        338        343        346 
## 0.02150538 0.02150538 0.78571429 0.28125000 0.13274336 0.28125000 
##        347        348        349        350        351        353 
## 0.23076923 0.02150538 0.02150538 0.28125000 0.28125000 0.28125000 
##        354        355        356        357        359        361 
## 0.13274336 0.13274336 0.78571429 0.66666667 0.28125000 0.78571429 
##        362        363        364        365        366        367 
## 0.78571429 0.28125000 0.59036145 0.59036145 0.28125000 0.28125000 
##        368        370        371        373        377        381 
## 0.02150538 0.59036145 0.78571429 0.13274336 0.02150538 0.13274336 
##        382        383        384        385        386        387 
## 0.02150538 0.02150538 0.02150538 0.02150538 0.02150538 0.66666667 
##        388        390        391        392        393        394 
## 0.28125000 0.66666667 0.28125000 0.78571429 0.23076923 0.02150538 
##        395        397        398        402        407        408 
## 0.78571429 0.02150538 0.59036145 0.23076923 0.28125000 0.02150538 
##        409        410        412        413        414        415 
## 0.78571429 0.78571429 0.13274336 0.59036145 0.23076923 0.59036145 
##        416        417        418        420        423        424 
## 0.78571429 0.13274336 0.59036145 0.23076923 0.13274336 0.13274336 
##        425        426        427        428        429        430 
## 0.59036145 0.78571429 0.02150538 0.78571429 0.59036145 0.28125000 
##        431        432        433        434        435        437 
## 0.02150538 0.28125000 0.13274336 0.23076923 0.02150538 0.59036145 
##        438        439        442        443        445        446 
## 0.23076923 0.02150538 0.13274336 0.13274336 0.28125000 0.78571429 
##        447        448        449        450        451        452 
## 0.02150538 0.13274336 0.13274336 0.13274336 0.02150538 0.23076923 
##        453        454        458        459        460        461 
## 0.13274336 0.02150538 0.13274336 0.59036145 0.23076923 0.02150538 
##        463        464        466        467        468        469 
## 0.66666667 0.28125000 0.02150538 0.13274336 0.13274336 0.28125000 
##        470        471        474        475        477        479 
## 0.59036145 0.59036145 0.23076923 0.13274336 0.66666667 0.02150538 
##        480        481        482        486        488        489 
## 0.23076923 0.78571429 0.28125000 0.59036145 0.78571429 0.02150538 
##        491        492        495        496        497        498 
## 0.13274336 0.28125000 0.02150538 0.78571429 0.02150538 0.13274336 
##        499        500        501        502        503        505 
## 0.78571429 0.23076923 0.02150538 0.28125000 0.66666667 0.28125000 
##        506        507        509        510        511        513 
## 0.28125000 0.78571429 0.13274336 0.02150538 0.28125000 0.02150538 
##        514        515        516        517        519        520 
## 0.13274336 0.02150538 0.78571429 0.59036145 0.28125000 0.23076923 
##        521        522        525        526        529        530 
## 0.02150538 0.13274336 0.13274336 0.02150538 0.13274336 0.02150538 
##        531        532        533        534        535        536 
## 0.13274336 0.13274336 0.66666667 0.28125000 0.13274336 0.59036145 
##        537        538        539        540        541        542 
## 0.28125000 0.02150538 0.13274336 0.59036145 0.66666667 0.59036145 
##        544        545        546        548        549        551 
## 0.13274336 0.28125000 0.78571429 0.59036145 0.78571429 0.13274336 
##        552        553        554        556        560        562 
## 0.13274336 0.28125000 0.13274336 0.02150538 0.28125000 0.78571429 
##        563        564        565        566        568        569 
## 0.13274336 0.28125000 0.13274336 0.02150538 0.28125000 0.59036145 
##        573        575        576        578        579        581 
## 0.13274336 0.59036145 0.13274336 0.13274336 0.23076923 0.59036145 
##        582        583        584        585        586        587 
## 0.02150538 0.28125000 0.28125000 0.66666667 0.02150538 0.59036145 
##        588        590        591        594        595        596 
## 0.02150538 0.02150538 0.66666667 0.13274336 0.66666667 0.78571429 
##        598        599        600        601        602        603 
## 0.13274336 0.78571429 0.02150538 0.13274336 0.02150538 0.28125000 
##        604        605        606        608        609        613 
## 0.59036145 0.78571429 0.13274336 0.02150538 0.59036145 0.78571429 
##        614        615        616        617        619        620 
## 0.13274336 0.59036145 0.02150538 0.28125000 0.66666667 0.13274336 
##        621        622        623        625        627        628 
## 0.28125000 0.02150538 0.78571429 0.13274336 0.02150538 0.59036145 
##        629        631        632        633        634        635 
## 0.59036145 0.66666667 0.13274336 0.02150538 0.23076923 0.02150538 
##        637        638        639        640        641        642 
## 0.28125000 0.13274336 0.66666667 0.02150538 0.13274336 0.59036145 
##        645        647        649        651        652        654 
## 0.13274336 0.78571429 0.23076923 0.02150538 0.13274336 0.13274336 
##        655        656        657        659        660        661 
## 0.13274336 0.78571429 0.02150538 0.28125000 0.13274336 0.78571429 
##        662        663        664        665        666        667 
## 0.78571429 0.78571429 0.59036145 0.28125000 0.13274336 0.59036145 
##        670        671        672        673        674        677 
## 0.59036145 0.78571429 0.02150538 0.28125000 0.13274336 0.78571429 
##        678        679        681        682        686        687 
## 0.13274336 0.13274336 0.02150538 0.78571429 0.59036145 0.23076923 
##        688        689        690        692        694        696 
## 0.28125000 0.23076923 0.59036145 0.78571429 0.59036145 0.59036145 
##        698        699        700        703        704        707 
## 0.02150538 0.66666667 0.13274336 0.78571429 0.59036145 0.02150538 
##        708        709        711        712        713        714 
## 0.13274336 0.78571429 0.78571429 0.28125000 0.59036145 0.23076923 
##        715        716        717        718        719        720 
## 0.28125000 0.78571429 0.78571429 0.02150538 0.13274336 0.28125000 
##        722        723        725        726        727        729 
## 0.13274336 0.23076923 0.28125000 0.28125000 0.13274336 0.78571429 
##        731        732        733        734        735        736 
## 0.23076923 0.13274336 0.78571429 0.13274336 0.02150538 0.28125000 
##        737        742        743        744        745        746 
## 0.13274336 0.13274336 0.13274336 0.59036145 0.59036145 0.28125000 
##        747        748        749        750        751        752 
## 0.59036145 0.66666667 0.78571429 0.78571429 0.59036145 0.28125000 
##        754        756        757        758        759        761 
## 0.78571429 0.59036145 0.59036145 0.28125000 0.13274336 0.13274336 
##        762        765        766        768 
## 0.78571429 0.13274336 0.02150538 0.13274336