Diabetes Risk Prediction Model

Nir Regev
Principal Data Scientist
Sisense Ltd.

April 16, 2016

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
library("partykit")
## 
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal
library("Formula")
library("e1071")

Starting with Sisense Diabetes Data Widget

Sisense Diabetes Widget

A Generic Model For Diabetes Classificaiton

setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)
## The following object is masked from package:datasets:
## 
##     pressure
args <- lapply(PimaIndiansDiabetes, list)

Fitting a logistic regression to assess predictors importance

PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel  
## 
## Call:  glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Coefficients:
## (Intercept)     pregnant      glucose     pressure      triceps  
##   -8.404696     0.123182     0.035164    -0.013296     0.000619  
##     insulin         mass     pedigree          age  
##   -0.001192     0.089701     0.945180     0.014869  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  759 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.4     AIC: 741.4

Filtering the most important predictors from GLM model

summary(logRegModel)
## 
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
  # features selection - n highest logistic model coefficients
coeff.list <- exp(coef(logRegModel))[2:ncol(PimaIndiansDiabetes)]
coeff.list <- coeff.list[c(order(abs(coeff.list),decreasing=TRUE)[1:(ncol(PimaIndiansDiabetes)-1)])]
predictors.names <- c(names(coeff.list),names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)])
  
# filter df with n most important predictors
PimaIndiansDiabetes.proj <- PimaIndiansDiabetes[,c(predictors.names)]

Partitioning the data to Training/Testing sets

names(PimaIndiansDiabetes.proj)
## [1] "pedigree" "pregnant" "mass"     "glucose"  "age"      "triceps" 
## [7] "insulin"  "pressure" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]

formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))

Fitting a classificaiton tree

ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
    width=12,
    height=10,
    pointsize=12)
  plot(as.simpleparty(ct))
  dev.off()
## png 
##   2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )
##                  
## predictions.class neg pos
##               neg 131  42
##               pos  19  38
## confusion matrix stats
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 131  19
##        pos  42  38
##                                           
##                Accuracy : 0.7348          
##                  95% CI : (0.6728, 0.7906)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : 0.75614         
##                                           
##                   Kappa : 0.3734          
##  Mcnemar's Test P-Value : 0.00485         
##                                           
##             Sensitivity : 0.7572          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.8733          
##          Neg Pred Value : 0.4750          
##              Prevalence : 0.7522          
##          Detection Rate : 0.5696          
##    Detection Prevalence : 0.6522          
##       Balanced Accuracy : 0.7119          
##                                           
##        'Positive' Class : neg             
## 

Sisense Diabetes Widget