Nir Regev
Principal Data Scientist
Sisense Ltd.
April 16, 2016
library(caret)## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(party)## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal
library(Formula)
library(e1071)
library(ROCR)## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(randomForest)## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2) Attribute Information:
Then train a classificaiton tree to predict high risk patients giving new patients’ data (which was not used for training the system)
Loading Diabetes data set and wrap it
setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)## The following object is masked from package:datasets:
##
## pressure
args <- lapply(PimaIndiansDiabetes, list)PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel ##
## Call: glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Coefficients:
## (Intercept) pregnant glucose pressure triceps
## -8.404696 0.123182 0.035164 -0.013296 0.000619
## insulin mass pedigree age
## -0.001192 0.089701 0.945180 0.014869
##
## Degrees of Freedom: 767 Total (i.e. Null); 759 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.4 AIC: 741.4
summary(logRegModel)##
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## pregnant 0.1231823 0.0320776 3.840 0.000123 ***
## glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## pressure -0.0132955 0.0052336 -2.540 0.011072 *
## triceps 0.0006190 0.0068994 0.090 0.928515
## insulin -0.0011917 0.0009012 -1.322 0.186065
## mass 0.0897010 0.0150876 5.945 2.76e-09 ***
## pedigree 0.9451797 0.2991475 3.160 0.001580 **
## age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
# features selection - n highest logistic model coefficients
coeff.list <- exp(coef(logRegModel))[2:ncol(PimaIndiansDiabetes)]
coeff.list <- coeff.list[c(order(abs(coeff.list),decreasing=TRUE)[1:(ncol(PimaIndiansDiabetes)-1)])]
predictors.names <- c(names(coeff.list),names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)])
# filter df with n most important predictors
PimaIndiansDiabetes.proj <- PimaIndiansDiabetes[,c(predictors.names)]PimaIndiansDiabetes.pca <- princomp(PimaIndiansDiabetes[,1:(length(PimaIndiansDiabetes)-1)],
center = TRUE,
scale. = TRUE)## Warning: In princomp.default(PimaIndiansDiabetes[, 1:(length(PimaIndiansDiabetes) -
## 1)], center = TRUE, scale. = TRUE) :
## extra arguments 'center', 'scale.' will be disregarded
View(PimaIndiansDiabetes.pca$scores)
target.numeric <- ifelse (PimaIndiansDiabetes$diabetes == "neg", 0 ,1)
pca.sisense.result <- data.frame(PimaIndiansDiabetes.pca$scores[,1:2],target.numeric)
x <- pca.sisense.result$Comp.1
y <- pca.sisense.result$Comp.2
pca.sisense.result$class <- as.factor(pca.sisense.result$target.numeric)
ggplot(pca.sisense.result, aes(Comp.1, Comp.2)) +
geom_point(aes(colour = class)) names(PimaIndiansDiabetes.proj)## [1] "pedigree" "pregnant" "mass" "glucose" "age" "triceps"
## [7] "insulin" "pressure" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]
formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
width=12,
height=10,
pointsize=12)
plot(as.simpleparty(ct))
dev.off()## png
## 2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )##
## predictions.class neg pos
## neg 122 20
## pos 28 60
## confusion matrix stats on testing set
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL,
dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 122 28
## pos 20 60
##
## Accuracy : 0.7913
## 95% CI : (0.733, 0.8419)
## No Information Rate : 0.6174
## P-Value [Acc > NIR] : 1.155e-08
##
## Kappa : 0.5505
## Mcnemar's Test P-Value : 0.3123
##
## Sensitivity : 0.8592
## Specificity : 0.6818
## Pos Pred Value : 0.8133
## Neg Pred Value : 0.7500
## Prevalence : 0.6174
## Detection Rate : 0.5304
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.7705
##
## 'Positive' Class : neg
##
roc_pred <- prediction(predictions.probs[,2], testing$diabetes)
#png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))#dev.off()Sisense Diabetes Widget
fit.rf <-randomForest(diabetes ~ ., data=training)
print(fit.rf)##
## Call:
## randomForest(formula = diabetes ~ ., data = training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 25.84%
## Confusion matrix:
## neg pos class.error
## neg 299 51 0.1457143
## pos 88 100 0.4680851
#importance(fit.rf)
rf.predictions.class <- predict(fit.rf, testing)
rf.predictions.probs <- predict(fit.rf, testing,type = c("prob"))
cm.rf <- confusionMatrix(testing$diabetes, rf.predictions.class, positive = NULL,
dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm.rf## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 134 16
## pos 30 50
##
## Accuracy : 0.8
## 95% CI : (0.7424, 0.8497)
## No Information Rate : 0.713
## P-Value [Acc > NIR] : 0.001687
##
## Kappa : 0.5404
## Mcnemar's Test P-Value : 0.055270
##
## Sensitivity : 0.8171
## Specificity : 0.7576
## Pos Pred Value : 0.8933
## Neg Pred Value : 0.6250
## Prevalence : 0.7130
## Detection Rate : 0.5826
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.7873
##
## 'Positive' Class : neg
##
roc_pred.rf <- prediction(rf.predictions.probs[,2], testing$diabetes)
plot(performance(roc_pred.rf, measure="sens", x.measure="spec"), colorize=TRUE)
rf.auc.perf = performance(roc_pred.rf, measure = "auc")
title(unlist(rf.auc.perf@y.values)) - an AUC improvement of
unlist(rf.auc.perf@y.values) - unlist(auc.perf@y.values)## [1] 0.01833333
NOT BAD !
predictions.probs <- predict(ct, training,type = c("prob"))
predictions.probs[,2]## 2 4 5 6 7 11
## 0.28125000 0.13274336 0.59036145 0.02150538 0.13274336 0.28125000
## 12 13 14 18 19 20
## 0.78571429 0.23076923 0.78571429 0.28125000 0.28125000 0.28125000
## 21 22 23 24 27 28
## 0.13274336 0.28125000 0.78571429 0.28125000 0.59036145 0.02150538
## 30 31 32 33 34 35
## 0.28125000 0.28125000 0.78571429 0.02150538 0.02150538 0.28125000
## 36 37 39 40 41 42
## 0.02150538 0.59036145 0.13274336 0.66666667 0.78571429 0.59036145
## 44 45 46 47 48 49
## 0.78571429 0.78571429 0.78571429 0.23076923 0.13274336 0.28125000
## 51 52 54 55 56 60
## 0.02150538 0.02150538 0.78571429 0.59036145 0.02150538 0.13274336
## 61 62 64 66 68 70
## 0.02150538 0.59036145 0.23076923 0.28125000 0.66666667 0.23076923
## 71 72 73 74 77 78
## 0.66666667 0.23076923 0.66666667 0.59036145 0.28125000 0.13274336
## 79 80 81 85 87 88
## 0.59036145 0.02150538 0.02150538 0.59036145 0.28125000 0.13274336
## 89 91 92 93 94 95
## 0.59036145 0.02150538 0.28125000 0.28125000 0.23076923 0.23076923
## 96 100 102 103 104 105
## 0.59036145 0.28125000 0.23076923 0.02150538 0.13274336 0.13274336
## 107 108 111 113 114 115
## 0.02150538 0.23076923 0.78571429 0.13274336 0.13274336 0.78571429
## 116 117 118 119 121 123
## 0.59036145 0.28125000 0.13274336 0.13274336 0.78571429 0.13274336
## 125 126 127 129 130 131
## 0.13274336 0.13274336 0.28125000 0.28125000 0.66666667 0.78571429
## 134 136 138 140 141 142
## 0.28125000 0.28125000 0.13274336 0.28125000 0.23076923 0.28125000
## 144 145 146 147 148 149
## 0.28125000 0.59036145 0.02150538 0.28125000 0.66666667 0.59036145
## 150 151 152 153 154 155
## 0.13274336 0.59036145 0.02150538 0.78571429 0.59036145 0.78571429
## 156 157 158 159 160 163
## 0.59036145 0.02150538 0.02150538 0.13274336 0.78571429 0.13274336
## 166 167 168 169 171 175
## 0.66666667 0.59036145 0.66666667 0.28125000 0.28125000 0.28125000
## 176 177 178 180 185 187
## 0.78571429 0.28125000 0.59036145 0.59036145 0.23076923 0.78571429
## 188 189 190 192 194 195
## 0.59036145 0.66666667 0.59036145 0.28125000 0.59036145 0.02150538
## 196 197 198 200 202 204
## 0.78571429 0.02150538 0.02150538 0.59036145 0.59036145 0.02150538
## 205 206 207 209 210 211
## 0.28125000 0.02150538 0.78571429 0.13274336 0.78571429 0.13274336
## 214 215 216 217 218 219
## 0.59036145 0.28125000 0.59036145 0.13274336 0.28125000 0.66666667
## 222 223 224 225 227 229
## 0.78571429 0.02150538 0.23076923 0.02150538 0.13274336 0.78571429
## 230 231 234 235 236 237
## 0.13274336 0.59036145 0.28125000 0.13274336 0.78571429 0.78571429
## 239 240 242 245 246 247
## 0.78571429 0.02150538 0.13274336 0.59036145 0.78571429 0.28125000
## 248 249 250 251 252 253
## 0.78571429 0.28125000 0.13274336 0.28125000 0.23076923 0.02150538
## 256 257 258 260 261 262
## 0.13274336 0.28125000 0.13274336 0.78571429 0.78571429 0.59036145
## 263 264 265 266 267 268
## 0.13274336 0.59036145 0.28125000 0.66666667 0.59036145 0.59036145
## 270 271 272 276 277 279
## 0.23076923 0.66666667 0.02150538 0.13274336 0.28125000 0.02150538
## 280 281 282 284 285 286
## 0.02150538 0.59036145 0.59036145 0.78571429 0.28125000 0.23076923
## 287 288 289 291 292 293
## 0.78571429 0.66666667 0.02150538 0.13274336 0.13274336 0.59036145
## 296 297 299 300 301 302
## 0.59036145 0.23076923 0.28125000 0.02150538 0.78571429 0.59036145
## 303 304 306 307 308 309
## 0.28125000 0.28125000 0.28125000 0.78571429 0.23076923 0.59036145
## 311 312 314 318 322 324
## 0.02150538 0.13274336 0.13274336 0.78571429 0.13274336 0.23076923
## 326 327 328 329 331 332
## 0.78571429 0.66666667 0.78571429 0.13274336 0.02150538 0.13274336
## 334 335 336 338 343 346
## 0.02150538 0.02150538 0.78571429 0.28125000 0.13274336 0.28125000
## 347 348 349 350 351 353
## 0.23076923 0.02150538 0.02150538 0.28125000 0.28125000 0.28125000
## 354 355 356 357 359 361
## 0.13274336 0.13274336 0.78571429 0.66666667 0.28125000 0.78571429
## 362 363 364 365 366 367
## 0.78571429 0.28125000 0.59036145 0.59036145 0.28125000 0.28125000
## 368 370 371 373 377 381
## 0.02150538 0.59036145 0.78571429 0.13274336 0.02150538 0.13274336
## 382 383 384 385 386 387
## 0.02150538 0.02150538 0.02150538 0.02150538 0.02150538 0.66666667
## 388 390 391 392 393 394
## 0.28125000 0.66666667 0.28125000 0.78571429 0.23076923 0.02150538
## 395 397 398 402 407 408
## 0.78571429 0.02150538 0.59036145 0.23076923 0.28125000 0.02150538
## 409 410 412 413 414 415
## 0.78571429 0.78571429 0.13274336 0.59036145 0.23076923 0.59036145
## 416 417 418 420 423 424
## 0.78571429 0.13274336 0.59036145 0.23076923 0.13274336 0.13274336
## 425 426 427 428 429 430
## 0.59036145 0.78571429 0.02150538 0.78571429 0.59036145 0.28125000
## 431 432 433 434 435 437
## 0.02150538 0.28125000 0.13274336 0.23076923 0.02150538 0.59036145
## 438 439 442 443 445 446
## 0.23076923 0.02150538 0.13274336 0.13274336 0.28125000 0.78571429
## 447 448 449 450 451 452
## 0.02150538 0.13274336 0.13274336 0.13274336 0.02150538 0.23076923
## 453 454 458 459 460 461
## 0.13274336 0.02150538 0.13274336 0.59036145 0.23076923 0.02150538
## 463 464 466 467 468 469
## 0.66666667 0.28125000 0.02150538 0.13274336 0.13274336 0.28125000
## 470 471 474 475 477 479
## 0.59036145 0.59036145 0.23076923 0.13274336 0.66666667 0.02150538
## 480 481 482 486 488 489
## 0.23076923 0.78571429 0.28125000 0.59036145 0.78571429 0.02150538
## 491 492 495 496 497 498
## 0.13274336 0.28125000 0.02150538 0.78571429 0.02150538 0.13274336
## 499 500 501 502 503 505
## 0.78571429 0.23076923 0.02150538 0.28125000 0.66666667 0.28125000
## 506 507 509 510 511 513
## 0.28125000 0.78571429 0.13274336 0.02150538 0.28125000 0.02150538
## 514 515 516 517 519 520
## 0.13274336 0.02150538 0.78571429 0.59036145 0.28125000 0.23076923
## 521 522 525 526 529 530
## 0.02150538 0.13274336 0.13274336 0.02150538 0.13274336 0.02150538
## 531 532 533 534 535 536
## 0.13274336 0.13274336 0.66666667 0.28125000 0.13274336 0.59036145
## 537 538 539 540 541 542
## 0.28125000 0.02150538 0.13274336 0.59036145 0.66666667 0.59036145
## 544 545 546 548 549 551
## 0.13274336 0.28125000 0.78571429 0.59036145 0.78571429 0.13274336
## 552 553 554 556 560 562
## 0.13274336 0.28125000 0.13274336 0.02150538 0.28125000 0.78571429
## 563 564 565 566 568 569
## 0.13274336 0.28125000 0.13274336 0.02150538 0.28125000 0.59036145
## 573 575 576 578 579 581
## 0.13274336 0.59036145 0.13274336 0.13274336 0.23076923 0.59036145
## 582 583 584 585 586 587
## 0.02150538 0.28125000 0.28125000 0.66666667 0.02150538 0.59036145
## 588 590 591 594 595 596
## 0.02150538 0.02150538 0.66666667 0.13274336 0.66666667 0.78571429
## 598 599 600 601 602 603
## 0.13274336 0.78571429 0.02150538 0.13274336 0.02150538 0.28125000
## 604 605 606 608 609 613
## 0.59036145 0.78571429 0.13274336 0.02150538 0.59036145 0.78571429
## 614 615 616 617 619 620
## 0.13274336 0.59036145 0.02150538 0.28125000 0.66666667 0.13274336
## 621 622 623 625 627 628
## 0.28125000 0.02150538 0.78571429 0.13274336 0.02150538 0.59036145
## 629 631 632 633 634 635
## 0.59036145 0.66666667 0.13274336 0.02150538 0.23076923 0.02150538
## 637 638 639 640 641 642
## 0.28125000 0.13274336 0.66666667 0.02150538 0.13274336 0.59036145
## 645 647 649 651 652 654
## 0.13274336 0.78571429 0.23076923 0.02150538 0.13274336 0.13274336
## 655 656 657 659 660 661
## 0.13274336 0.78571429 0.02150538 0.28125000 0.13274336 0.78571429
## 662 663 664 665 666 667
## 0.78571429 0.78571429 0.59036145 0.28125000 0.13274336 0.59036145
## 670 671 672 673 674 677
## 0.59036145 0.78571429 0.02150538 0.28125000 0.13274336 0.78571429
## 678 679 681 682 686 687
## 0.13274336 0.13274336 0.02150538 0.78571429 0.59036145 0.23076923
## 688 689 690 692 694 696
## 0.28125000 0.23076923 0.59036145 0.78571429 0.59036145 0.59036145
## 698 699 700 703 704 707
## 0.02150538 0.66666667 0.13274336 0.78571429 0.59036145 0.02150538
## 708 709 711 712 713 714
## 0.13274336 0.78571429 0.78571429 0.28125000 0.59036145 0.23076923
## 715 716 717 718 719 720
## 0.28125000 0.78571429 0.78571429 0.02150538 0.13274336 0.28125000
## 722 723 725 726 727 729
## 0.13274336 0.23076923 0.28125000 0.28125000 0.13274336 0.78571429
## 731 732 733 734 735 736
## 0.23076923 0.13274336 0.78571429 0.13274336 0.02150538 0.28125000
## 737 742 743 744 745 746
## 0.13274336 0.13274336 0.13274336 0.59036145 0.59036145 0.28125000
## 747 748 749 750 751 752
## 0.59036145 0.66666667 0.78571429 0.78571429 0.59036145 0.28125000
## 754 756 757 758 759 761
## 0.78571429 0.59036145 0.59036145 0.28125000 0.13274336 0.13274336
## 762 765 766 768
## 0.78571429 0.13274336 0.02150538 0.13274336