library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(rpart)
library(party)

## Loading required package: grid

## Loading required package: mvtnorm

## Loading required package: modeltools

## Loading required package: stats4

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

library("partykit")

## 
## Attaching package: 'partykit'

## The following objects are masked from 'package:party':
## 
##     cforest, ctree, ctree_control, edge_simple, mob, mob_control,
##     node_barplot, node_bivplot, node_boxplot, node_inner,
##     node_surv, node_terminal

library("Formula")
library("e1071")

Starting with Sisense Diabetes Data Widget

A Generic Model For Diabetes Classificaiton

This model recieves Sisense widget, a diabetes predictors/risk factors data set
Fits a logistic regression to assess predictor importance
Then train a classificaiton tree to predict high risk patients giving new patients’ data (which was not used for training the system)
Loading Diabetes data set and wrap it

setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)

## The following object is masked from package:datasets:
## 
##     pressure

args <- lapply(PimaIndiansDiabetes, list)

args is sent from Sisense widget

Fitting a logistic regression to assess predictors importance

Extracting args sent from widget, to a data frame
Fitting a GLM (General Linear Model) with link function ‘probit’
Target variable ‘diabetes’ estimated to be distributed binomial
This is a generic implementation - no assumption on data or data scheme

PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel

## 
## Call:  glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Coefficients:
## (Intercept)     pregnant      glucose     pressure      triceps  
##   -8.404696     0.123182     0.035164    -0.013296     0.000619  
##     insulin         mass     pedigree          age  
##   -0.001192     0.089701     0.945180     0.014869  
## 
## Degrees of Freedom: 767 Total (i.e. Null);  759 Residual
## Null Deviance:       993.5 
## Residual Deviance: 723.4     AIC: 741.4

Filtering the most important predictors from GLM model

Extracting the N most important GLM coefficients

summary(logRegModel)

## 
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5

  # features selection - n highest logistic model coefficients
coeff.list <- exp(coef(logRegModel))[2:ncol(PimaIndiansDiabetes)]
coeff.list <- coeff.list[c(order(abs(coeff.list),decreasing=TRUE)[1:(ncol(PimaIndiansDiabetes)-1)])]
predictors.names <- c(names(coeff.list),names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)])
  
# filter df with n most important predictors
PimaIndiansDiabetes.proj <- PimaIndiansDiabetes[,c(predictors.names)]

Partitioning the data to Training/Testing sets

Best Practice to train the model on training data and test/predict on new data which the system has not ‘seen’ before
Creating a generic training formula (target ~ predictor list)

names(PimaIndiansDiabetes.proj)

## [1] "pedigree" "pregnant" "mass"     "glucose"  "age"      "triceps" 
## [7] "insulin"  "pressure" "diabetes"

intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]

formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))

Fitting a classificaiton tree

Training a classificaiton tree to predict diabetes
Test predictions on new patients data

ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
    width=12,
    height=10,
    pointsize=12)
  plot(as.simpleparty(ct))
  dev.off()

## png 
##   2

## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )

##                  
## predictions.class neg pos
##               neg 131  42
##               pos  19  38

## confusion matrix stats
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL, 
                 dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 131  19
##        pos  42  38
##                                           
##                Accuracy : 0.7348          
##                  95% CI : (0.6728, 0.7906)
##     No Information Rate : 0.7522          
##     P-Value [Acc > NIR] : 0.75614         
##                                           
##                   Kappa : 0.3734          
##  Mcnemar's Test P-Value : 0.00485         
##                                           
##             Sensitivity : 0.7572          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.8733          
##          Neg Pred Value : 0.4750          
##              Prevalence : 0.7522          
##          Detection Rate : 0.5696          
##    Detection Prevalence : 0.6522          
##       Balanced Accuracy : 0.7119          
##                                           
##        'Positive' Class : neg             
##

Sisense Diabetes Widget

Diabetes Risk Prediction Model

Starting with Sisense Diabetes Data Widget

A Generic Model For Diabetes Classificaiton

Fitting a logistic regression to assess predictors importance

Filtering the most important predictors from GLM model

Partitioning the data to Training/Testing sets

Fitting a classificaiton tree