Diabetes Risk Prediction Model

Nir Regev
Chief Data Scientist
Sisense Ltd.

June 14, 2016

library(caret)
## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
library(party)
## Warning: package 'party' was built under R version 3.2.5
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)
## Warning: package 'partykit' was built under R version 3.2.5
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
library(Formula)
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ggplot2)

Starting with Sisense Diabetes Data Widget

Sisense Diabetes Widget Attribute Information:

  1. Number of times pregnant
  2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
  3. Diastolic blood pressure (mm Hg)
  4. Triceps skin fold thickness (mm)
  5. 2-Hour serum insulin (mu U/ml)
  6. Body mass index (weight in kg/(height in m)^2)
  7. Diabetes pedigree function
  8. Age (years)
  9. Class variable (0 or 1)

A Generic Model For Diabetes Classificaiton

setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)
## The following object is masked from package:datasets:
## 
##     pressure
args <- lapply(PimaIndiansDiabetes, list)

Fitting a logistic regression to assess predictors importance

PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel  
## 
## Call:  glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Coefficients:
## (Intercept)     pregnant      glucose     pressure      triceps  
##   -8.404696     0.123182     0.035164    -0.013296     0.000619  
##     insulin         mass     pedigree          age  
##   -0.001192     0.089701     0.945180     0.014869  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  759 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.4     AIC: 741.4

FEATURE SELECTION - Filtering the most important predictors from GLM model

summary(logRegModel)
## 
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
# FEATURES SELECTION - check which predictors have coefficients significance level grather than 0.05 (not sinificant)
# and exclude out
coeff.df <- data.frame(summary(logRegModel)$coefficients)
colnames(coeff.df)[ncol(coeff.df)] <- "significance"
coeff.df <- subset(coeff.df, significance < 0.05 )
predictors.names <- rownames(coeff.df)[2:nrow(coeff.df)]
PimaIndiansDiabetes.proj <- cbind(PimaIndiansDiabetes[,c(predictors.names)],PimaIndiansDiabetes$diabetes)
colnames(PimaIndiansDiabetes.proj)[ncol(PimaIndiansDiabetes.proj)] <- "diabetes"

Dimension Reduction and Faeature Selstion (PCA)

PimaIndiansDiabetes.pca <- princomp(PimaIndiansDiabetes[,1:(length(PimaIndiansDiabetes)-1)],
                                    center = TRUE,
                                    scale. = TRUE)
View(PimaIndiansDiabetes.pca$scores)
target.numeric <- ifelse (PimaIndiansDiabetes$diabetes == "neg", 0 ,1)
pca.sisense.result <- data.frame(PimaIndiansDiabetes.pca$scores[,1:2],target.numeric)
x <- pca.sisense.result$Comp.1
y <- pca.sisense.result$Comp.2
pca.sisense.result$class <- as.factor(pca.sisense.result$target.numeric)
ggplot(pca.sisense.result, aes(Comp.1, Comp.2)) + 
  geom_point(aes(colour = class)) 

Partitioning the data to Training/Testing sets

names(PimaIndiansDiabetes.proj)
## [1] "pregnant" "glucose"  "pressure" "mass"     "pedigree" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]

formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))

Fitting a classificaiton tree

ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
    width=12,
    height=10,
    pointsize=12)
  plot(as.simpleparty(ct))
  dev.off()
## png 
##   2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )
##                  
## predictions.class neg pos
##               neg 130  44
##               pos  20  36
## confusion matrix stats on testing set
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 130  20
##        pos  44  36
##                                          
##                Accuracy : 0.7217         
##                  95% CI : (0.659, 0.7786)
##     No Information Rate : 0.7565         
##     P-Value [Acc > NIR] : 0.90280        
##                                          
##                   Kappa : 0.3405         
##  Mcnemar's Test P-Value : 0.00404        
##                                          
##             Sensitivity : 0.7471         
##             Specificity : 0.6429         
##          Pos Pred Value : 0.8667         
##          Neg Pred Value : 0.4500         
##              Prevalence : 0.7565         
##          Detection Rate : 0.5652         
##    Detection Prevalence : 0.6522         
##       Balanced Accuracy : 0.6950         
##                                          
##        'Positive' Class : neg            
## 
roc_pred <- prediction(predictions.probs[,2], testing$diabetes)
#png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))

#dev.off()

Sisense Diabetes Widget

To optimize prediction accuracy - Ensemble Models

fit.rf <-randomForest(diabetes ~ ., data=training)
print(fit.rf)
## 
## Call:
##  randomForest(formula = diabetes ~ ., data = training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 21.19%
## Confusion matrix:
##     neg pos class.error
## neg 305  45   0.1285714
## pos  69 119   0.3670213
#importance(fit.rf)
rf.predictions.class <- predict(fit.rf, testing)
rf.predictions.probs <- predict(fit.rf, testing,type = c("prob"))
cm.rf <- confusionMatrix(testing$diabetes, rf.predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm.rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 129  21
##        pos  40  40
##                                           
##                Accuracy : 0.7348          
##                  95% CI : (0.6728, 0.7906)
##     No Information Rate : 0.7348          
##     P-Value [Acc > NIR] : 0.53439         
##                                           
##                   Kappa : 0.3811          
##  Mcnemar's Test P-Value : 0.02119         
##                                           
##             Sensitivity : 0.7633          
##             Specificity : 0.6557          
##          Pos Pred Value : 0.8600          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.7348          
##          Detection Rate : 0.5609          
##    Detection Prevalence : 0.6522          
##       Balanced Accuracy : 0.7095          
##                                           
##        'Positive' Class : neg             
## 
roc_pred.rf <- prediction(rf.predictions.probs[,2], testing$diabetes)
plot(performance(roc_pred.rf, measure="sens", x.measure="spec"), colorize=TRUE)
rf.auc.perf = performance(roc_pred.rf, measure = "auc")
title(unlist(rf.auc.perf@y.values))

- an AUC improvement of

unlist(rf.auc.perf@y.values) - unlist(auc.perf@y.values)
## [1] 0.03920833

NOT BAD !

Returning diabetes risk probabilities to sisense

predictions.probs <- predict(ct, training,type = c("prob"))
predictions.probs[,2]
##          1          3          4          5          7          9 
## 0.54347826 0.84810127 0.17518248 0.36842105 0.17518248 0.84810127 
##         10         12         13         14         15         16 
## 0.05970149 0.84810127 0.05970149 0.84810127 0.84810127 0.17518248 
##         17         19         20         21         22         23 
## 0.36842105 0.17518248 0.36842105 0.36842105 0.17518248 0.84810127 
##         24         26         27         29         30         31 
## 0.78571429 0.78571429 0.54347826 0.54347826 0.36842105 0.36842105 
##         32         33         35         36         37         38 
## 0.84810127 0.05970149 0.05970149 0.05970149 0.78571429 0.17518248 
##         39         40         42         44         46         47 
## 0.17518248 0.36842105 0.36842105 0.84810127 0.84810127 0.54347826 
##         48         50         52         53         54         55 
## 0.17518248 0.05970149 0.05970149 0.05970149 0.84810127 0.54347826 
##         56         58         60         62         63         64 
## 0.05970149 0.17518248 0.17518248 0.78571429 0.05970149 0.05970149 
##         65         66         67         68         69         70 
## 0.36842105 0.17518248 0.36842105 0.36842105 0.05970149 0.54347826 
##         72         73         74         75         77         79 
## 0.36842105 0.78571429 0.36842105 0.17518248 0.17518248 0.36842105 
##         81         82         83         84         85         87 
## 0.05970149 0.05970149 0.17518248 0.05970149 0.36842105 0.17518248 
##         89         90         91         93         94         95 
## 0.78571429 0.05970149 0.05970149 0.17518248 0.05970149 0.05970149 
##         96         98         99        101        103        104 
## 0.54347826 0.05970149 0.17518248 0.84810127 0.05970149 0.05970149 
##        106        109        110        112        113        115 
## 0.36842105 0.17518248 0.17518248 0.54347826 0.17518248 0.84810127 
##        118        119        121        123        124        126 
## 0.17518248 0.17518248 0.84810127 0.17518248 0.05970149 0.17518248 
##        127        130        132        133        135        136 
## 0.36842105 0.17518248 0.78571429 0.84810127 0.05970149 0.36842105 
##        137        138        139        140        141        142 
## 0.17518248 0.17518248 0.36842105 0.17518248 0.05970149 0.17518248 
##        143        146        147        149        151        152 
## 0.17518248 0.05970149 0.17518248 0.54347826 0.36842105 0.05970149 
##        154        155        156        157        158        159 
## 0.54347826 0.84810127 0.54347826 0.05970149 0.05970149 0.17518248 
##        161        162        164        169        170        171 
## 0.54347826 0.17518248 0.17518248 0.36842105 0.36842105 0.17518248 
##        172        173        174        175        176        177 
## 0.36842105 0.17518248 0.17518248 0.17518248 0.84810127 0.17518248 
##        178        181        184        185        186        187 
## 0.36842105 0.05970149 0.05970149 0.05970149 0.84810127 0.84810127 
##        188        189        190        191        192        194 
## 0.36842105 0.78571429 0.36842105 0.05970149 0.78571429 0.78571429 
##        195        196        197        199        201        202 
## 0.05970149 0.84810127 0.05970149 0.36842105 0.36842105 0.36842105 
##        203        204        205        206        207        208 
## 0.05970149 0.05970149 0.17518248 0.05970149 0.84810127 0.84810127 
##        209        211        213        215        216        217 
## 0.17518248 0.05970149 0.84810127 0.78571429 0.54347826 0.36842105 
##        218        219        220        221        224        226 
## 0.36842105 0.17518248 0.36842105 0.84810127 0.36842105 0.17518248 
##        228        229        230        233        234        236 
## 0.84810127 0.84810127 0.36842105 0.05970149 0.36842105 0.84810127 
##        237        238        240        241        243        244 
## 0.84810127 0.84810127 0.05970149 0.17518248 0.05970149 0.05970149 
##        245        246        249        250        251        252 
## 0.54347826 0.84810127 0.78571429 0.36842105 0.17518248 0.36842105 
##        254        255        256        261        263        265 
## 0.17518248 0.05970149 0.36842105 0.84810127 0.17518248 0.36842105 
##        266        267        269        275        276        277 
## 0.17518248 0.36842105 0.05970149 0.17518248 0.17518248 0.05970149 
##        278        279        280        281        282        286 
## 0.05970149 0.05970149 0.05970149 0.54347826 0.78571429 0.05970149 
##        287        288        290        291        293        295 
## 0.54347826 0.36842105 0.17518248 0.17518248 0.36842105 0.84810127 
##        296        297        298        299        301        302 
## 0.54347826 0.54347826 0.36842105 0.17518248 0.84810127 0.54347826 
##        305        306        307        310        311        313 
## 0.54347826 0.36842105 0.84810127 0.36842105 0.05970149 0.54347826 
##        315        319        321        322        323        324 
## 0.36842105 0.36842105 0.05970149 0.36842105 0.05970149 0.54347826 
##        325        327        328        329        332        333 
## 0.36842105 0.36842105 0.84810127 0.17518248 0.17518248 0.84810127 
##        334        339        340        342        343        344 
## 0.05970149 0.54347826 0.84810127 0.05970149 0.17518248 0.36842105 
##        347        348        349        350        352        355 
## 0.36842105 0.05970149 0.05970149 0.17518248 0.36842105 0.17518248 
##        356        358        359        360        361        362 
## 0.84810127 0.78571429 0.17518248 0.84810127 0.84810127 0.84810127 
##        363        364        366        371        372        373 
## 0.17518248 0.54347826 0.17518248 0.84810127 0.05970149 0.17518248 
##        374        375        377        378        380        381 
## 0.17518248 0.36842105 0.05970149 0.17518248 0.17518248 0.17518248 
##        382        383        385        387        388        389 
## 0.05970149 0.05970149 0.05970149 0.36842105 0.17518248 0.54347826 
##        393        394        395        396        398        399 
## 0.05970149 0.05970149 0.84810127 0.05970149 0.36842105 0.05970149 
##        400        404        405        407        409        410 
## 0.84810127 0.17518248 0.84810127 0.36842105 0.84810127 0.84810127 
##        411        412        413        416        417        418 
## 0.17518248 0.36842105 0.36842105 0.84810127 0.05970149 0.54347826 
##        422        423        425        427        428        429 
## 0.05970149 0.17518248 0.54347826 0.05970149 0.84810127 0.36842105 
##        430        431        433        436        437        438 
## 0.17518248 0.05970149 0.17518248 0.36842105 0.78571429 0.54347826 
##        439        440        441        442        443        444 
## 0.05970149 0.17518248 0.84810127 0.17518248 0.36842105 0.17518248 
##        445        446        447        448        449        450 
## 0.36842105 0.84810127 0.05970149 0.17518248 0.17518248 0.36842105 
##        451        452        453        454        455        456 
## 0.05970149 0.36842105 0.17518248 0.05970149 0.17518248 0.84810127 
##        457        458        459        460        461        462 
## 0.05970149 0.17518248 0.54347826 0.05970149 0.05970149 0.05970149 
##        463        464        465        467        469        470 
## 0.17518248 0.05970149 0.05970149 0.05970149 0.78571429 0.54347826 
##        471        474        475        476        478        479 
## 0.54347826 0.36842105 0.36842105 0.05970149 0.05970149 0.05970149 
##        481        484        485        486        487        488 
## 0.84810127 0.17518248 0.54347826 0.36842105 0.36842105 0.84810127 
##        490        493        494        495        496        497 
## 0.84810127 0.17518248 0.36842105 0.05970149 0.84810127 0.05970149 
##        499        500        501        502        504        506 
## 0.84810127 0.54347826 0.05970149 0.17518248 0.17518248 0.17518248 
##        507        508        511        512        514        516 
## 0.84810127 0.36842105 0.17518248 0.05970149 0.05970149 0.84810127 
##        517        519        520        521        522        523 
## 0.54347826 0.17518248 0.05970149 0.05970149 0.36842105 0.05970149 
##        524        526        527        528        529        530 
## 0.78571429 0.05970149 0.05970149 0.05970149 0.36842105 0.05970149 
##        531        532        533        534        535        536 
## 0.36842105 0.17518248 0.17518248 0.17518248 0.17518248 0.36842105 
##        537        538        539        541        542        544 
## 0.17518248 0.05970149 0.36842105 0.17518248 0.36842105 0.17518248 
##        546        547        548        549        550        551 
## 0.84810127 0.84810127 0.36842105 0.84810127 0.84810127 0.05970149 
##        552        553        554        556        557        558 
## 0.17518248 0.05970149 0.17518248 0.05970149 0.17518248 0.05970149 
##        559        560        564        565        568        569 
## 0.17518248 0.17518248 0.05970149 0.17518248 0.17518248 0.54347826 
##        573        574        575        576        578        579 
## 0.36842105 0.17518248 0.36842105 0.36842105 0.36842105 0.05970149 
##        581        582        583        585        587        588 
## 0.54347826 0.05970149 0.05970149 0.78571429 0.78571429 0.05970149 
##        589        590        591        592        594        597 
## 0.84810127 0.05970149 0.78571429 0.36842105 0.17518248 0.17518248 
##        598        599        602        603        604        606 
## 0.05970149 0.84810127 0.05970149 0.05970149 0.54347826 0.36842105 
##        607        608        609        611        614        615 
## 0.84810127 0.05970149 0.54347826 0.17518248 0.17518248 0.78571429 
##        617        618        619        620        621        622 
## 0.36842105 0.05970149 0.78571429 0.36842105 0.36842105 0.05970149 
##        629        630        631        632        633        635 
## 0.36842105 0.05970149 0.05970149 0.17518248 0.05970149 0.05970149 
##        636        639        640        641        642        643 
## 0.17518248 0.17518248 0.05970149 0.17518248 0.36842105 0.54347826 
##        644        645        646        648        649        650 
## 0.17518248 0.05970149 0.54347826 0.84810127 0.78571429 0.05970149 
##        651        652        654        655        657        658 
## 0.05970149 0.36842105 0.05970149 0.17518248 0.05970149 0.36842105 
##        659        660        662        664        666        669 
## 0.78571429 0.17518248 0.84810127 0.54347826 0.36842105 0.17518248 
##        670        672        673        674        675        676 
## 0.54347826 0.05970149 0.17518248 0.36842105 0.17518248 0.84810127 
##        678        679        681        682        683        685 
## 0.17518248 0.36842105 0.05970149 0.84810127 0.17518248 0.05970149 
##        686        687        690        692        694        695 
## 0.36842105 0.05970149 0.54347826 0.84810127 0.36842105 0.05970149 
##        696        697        700        701        705        706 
## 0.36842105 0.84810127 0.36842105 0.36842105 0.36842105 0.17518248 
##        708        710        712        713        714        715 
## 0.36842105 0.17518248 0.36842105 0.78571429 0.05970149 0.17518248 
##        716        717        718        719        720        721 
## 0.84810127 0.84810127 0.05970149 0.17518248 0.17518248 0.17518248 
##        722        723        724        725        726        727 
## 0.36842105 0.54347826 0.36842105 0.36842105 0.36842105 0.36842105 
##        729        730        731        732        733        734 
## 0.84810127 0.17518248 0.36842105 0.78571429 0.84810127 0.17518248 
##        735        736        738        739        740        741 
## 0.05970149 0.17518248 0.17518248 0.17518248 0.17518248 0.78571429 
##        742        743        744        745        746        747 
## 0.17518248 0.36842105 0.78571429 0.54347826 0.17518248 0.54347826 
##        748        750        752        753        754        756 
## 0.17518248 0.84810127 0.36842105 0.05970149 0.84810127 0.36842105 
##        757        758        759        760        762        763 
## 0.36842105 0.36842105 0.17518248 0.84810127 0.84810127 0.05970149 
##        764        765        766        768 
## 0.17518248 0.36842105 0.05970149 0.17518248