Nir Regev
Principal Data Scientist
Sisense Ltd.
April 16, 2016
library(caret)## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(party)## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library("partykit")##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal
library("Formula")
library("e1071")Then train a classificaiton tree to predict high risk patients giving new patients’ data (which was not used for training the system)
Loading Diabetes data set and wrap it
setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)## The following object is masked from package:datasets:
##
## pressure
args <- lapply(PimaIndiansDiabetes, list)PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel ##
## Call: glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Coefficients:
## (Intercept) pregnant glucose pressure triceps
## -8.404696 0.123182 0.035164 -0.013296 0.000619
## insulin mass pedigree age
## -0.001192 0.089701 0.945180 0.014869
##
## Degrees of Freedom: 767 Total (i.e. Null); 759 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.4 AIC: 741.4
summary(logRegModel)##
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## pregnant 0.1231823 0.0320776 3.840 0.000123 ***
## glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## pressure -0.0132955 0.0052336 -2.540 0.011072 *
## triceps 0.0006190 0.0068994 0.090 0.928515
## insulin -0.0011917 0.0009012 -1.322 0.186065
## mass 0.0897010 0.0150876 5.945 2.76e-09 ***
## pedigree 0.9451797 0.2991475 3.160 0.001580 **
## age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
# features selection - n highest logistic model coefficients
coeff.list <- exp(coef(logRegModel))[2:ncol(PimaIndiansDiabetes)]
coeff.list <- coeff.list[c(order(abs(coeff.list),decreasing=TRUE)[1:(ncol(PimaIndiansDiabetes)-1)])]
predictors.names <- c(names(coeff.list),names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)])
# filter df with n most important predictors
PimaIndiansDiabetes.proj <- PimaIndiansDiabetes[,c(predictors.names)]names(PimaIndiansDiabetes.proj)## [1] "pedigree" "pregnant" "mass" "glucose" "age" "triceps"
## [7] "insulin" "pressure" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]
formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
width=12,
height=10,
pointsize=12)
plot(as.simpleparty(ct))
dev.off()## png
## 2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )##
## predictions.class neg pos
## neg 131 42
## pos 19 38
## confusion matrix stats
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL,
dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 131 19
## pos 42 38
##
## Accuracy : 0.7348
## 95% CI : (0.6728, 0.7906)
## No Information Rate : 0.7522
## P-Value [Acc > NIR] : 0.75614
##
## Kappa : 0.3734
## Mcnemar's Test P-Value : 0.00485
##
## Sensitivity : 0.7572
## Specificity : 0.6667
## Pos Pred Value : 0.8733
## Neg Pred Value : 0.4750
## Prevalence : 0.7522
## Detection Rate : 0.5696
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.7119
##
## 'Positive' Class : neg
##