Nir Regev
Chief Data Scientist
Sisense Ltd.
June 14, 2016
library(caret)## Warning: package 'caret' was built under R version 3.2.5
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.5
library(rpart)## Warning: package 'rpart' was built under R version 3.2.5
library(party)## Warning: package 'party' was built under R version 3.2.5
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(partykit)## Warning: package 'partykit' was built under R version 3.2.5
##
## Attaching package: 'partykit'
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner,
## node_surv, node_terminal
library(Formula)
library(e1071)## Warning: package 'e1071' was built under R version 3.2.5
library(ROCR)## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(randomForest)## Warning: package 'randomForest' was built under R version 3.2.5
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2) Attribute Information:
Then train a classificaiton tree to predict high risk patients giving new patients’ data (which was not used for training the system)
Loading Diabetes data set and wrap it
setwd("C:/Users/Nir.Regev/Documents/EXL")
data("PimaIndiansDiabetes", package = "mlbench")
attach(PimaIndiansDiabetes)## The following object is masked from package:datasets:
##
## pressure
args <- lapply(PimaIndiansDiabetes, list)PimaIndiansDiabetes <- data.frame(mapply( FUN = c,args))
formula <- paste0(paste(names(PimaIndiansDiabetes)[length(PimaIndiansDiabetes)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes)[1:(length(PimaIndiansDiabetes)-1)], collapse="+"))
logRegModel <- glm(formula = formula, family=binomial, data=PimaIndiansDiabetes)
logRegModel ##
## Call: glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Coefficients:
## (Intercept) pregnant glucose pressure triceps
## -8.404696 0.123182 0.035164 -0.013296 0.000619
## insulin mass pedigree age
## -0.001192 0.089701 0.945180 0.014869
##
## Degrees of Freedom: 767 Total (i.e. Null); 759 Residual
## Null Deviance: 993.5
## Residual Deviance: 723.4 AIC: 741.4
summary(logRegModel)##
## Call:
## glm(formula = formula, family = binomial, data = PimaIndiansDiabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## pregnant 0.1231823 0.0320776 3.840 0.000123 ***
## glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## pressure -0.0132955 0.0052336 -2.540 0.011072 *
## triceps 0.0006190 0.0068994 0.090 0.928515
## insulin -0.0011917 0.0009012 -1.322 0.186065
## mass 0.0897010 0.0150876 5.945 2.76e-09 ***
## pedigree 0.9451797 0.2991475 3.160 0.001580 **
## age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
# FEATURES SELECTION - check which predictors have coefficients significance level grather than 0.05 (not sinificant)
# and exclude out
coeff.df <- data.frame(summary(logRegModel)$coefficients)
colnames(coeff.df)[ncol(coeff.df)] <- "significance"
coeff.df <- subset(coeff.df, significance < 0.05 )
predictors.names <- rownames(coeff.df)[2:nrow(coeff.df)]
PimaIndiansDiabetes.proj <- cbind(PimaIndiansDiabetes[,c(predictors.names)],PimaIndiansDiabetes$diabetes)
colnames(PimaIndiansDiabetes.proj)[ncol(PimaIndiansDiabetes.proj)] <- "diabetes"PimaIndiansDiabetes.pca <- princomp(PimaIndiansDiabetes[,1:(length(PimaIndiansDiabetes)-1)],
center = TRUE,
scale. = TRUE)
View(PimaIndiansDiabetes.pca$scores)
target.numeric <- ifelse (PimaIndiansDiabetes$diabetes == "neg", 0 ,1)
pca.sisense.result <- data.frame(PimaIndiansDiabetes.pca$scores[,1:2],target.numeric)
x <- pca.sisense.result$Comp.1
y <- pca.sisense.result$Comp.2
pca.sisense.result$class <- as.factor(pca.sisense.result$target.numeric)
ggplot(pca.sisense.result, aes(Comp.1, Comp.2)) +
geom_point(aes(colour = class)) names(PimaIndiansDiabetes.proj)## [1] "pregnant" "glucose" "pressure" "mass" "pedigree" "diabetes"
intrain <- createDataPartition(y=PimaIndiansDiabetes.proj$diabetes,p=0.7,list=FALSE)
training <- PimaIndiansDiabetes.proj[intrain,]
testing<-PimaIndiansDiabetes.proj[-intrain,]
formula <- paste0(paste(names(PimaIndiansDiabetes.proj)[length(PimaIndiansDiabetes.proj)], collapse="+") ,"~", paste(names(PimaIndiansDiabetes.proj)[1:(length(PimaIndiansDiabetes.proj)-1)], collapse="+"))ct <- ctree(diabetes ~ ., data = training)
svg(filename="Diabites_Decision_Tree.svg",
width=12,
height=10,
pointsize=12)
plot(as.simpleparty(ct))
dev.off()## png
## 2
## Predict Diabites Risk on new patients
predictions.probs <- predict(ct, testing,type = c("prob"))
predictions.class <- predict(ct, testing,type = c("response"))
table(predictions.class, testing$diabetes )##
## predictions.class neg pos
## neg 130 44
## pos 20 36
## confusion matrix stats on testing set
cm <- confusionMatrix(testing$diabetes, predictions.class, positive = NULL,
dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 130 20
## pos 44 36
##
## Accuracy : 0.7217
## 95% CI : (0.659, 0.7786)
## No Information Rate : 0.7565
## P-Value [Acc > NIR] : 0.90280
##
## Kappa : 0.3405
## Mcnemar's Test P-Value : 0.00404
##
## Sensitivity : 0.7471
## Specificity : 0.6429
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.4500
## Prevalence : 0.7565
## Detection Rate : 0.5652
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.6950
##
## 'Positive' Class : neg
##
roc_pred <- prediction(predictions.probs[,2], testing$diabetes)
#png(filename=paste("decision_tree_roc.png"),width = 1200, height = 800)
plot(performance(roc_pred, measure="sens", x.measure="spec"), colorize=TRUE)
auc.perf = performance(roc_pred, measure = "auc")
title(unlist(auc.perf@y.values))#dev.off()fit.rf <-randomForest(diabetes ~ ., data=training)
print(fit.rf)##
## Call:
## randomForest(formula = diabetes ~ ., data = training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 21.19%
## Confusion matrix:
## neg pos class.error
## neg 305 45 0.1285714
## pos 69 119 0.3670213
#importance(fit.rf)
rf.predictions.class <- predict(fit.rf, testing)
rf.predictions.probs <- predict(fit.rf, testing,type = c("prob"))
cm.rf <- confusionMatrix(testing$diabetes, rf.predictions.class, positive = NULL,
dnn = c("Prediction", "Reference"))
# printing confusion matrix
cm.rf## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 129 21
## pos 40 40
##
## Accuracy : 0.7348
## 95% CI : (0.6728, 0.7906)
## No Information Rate : 0.7348
## P-Value [Acc > NIR] : 0.53439
##
## Kappa : 0.3811
## Mcnemar's Test P-Value : 0.02119
##
## Sensitivity : 0.7633
## Specificity : 0.6557
## Pos Pred Value : 0.8600
## Neg Pred Value : 0.5000
## Prevalence : 0.7348
## Detection Rate : 0.5609
## Detection Prevalence : 0.6522
## Balanced Accuracy : 0.7095
##
## 'Positive' Class : neg
##
roc_pred.rf <- prediction(rf.predictions.probs[,2], testing$diabetes)
plot(performance(roc_pred.rf, measure="sens", x.measure="spec"), colorize=TRUE)
rf.auc.perf = performance(roc_pred.rf, measure = "auc")
title(unlist(rf.auc.perf@y.values)) - an AUC improvement of
unlist(rf.auc.perf@y.values) - unlist(auc.perf@y.values)## [1] 0.03920833
NOT BAD !
predictions.probs <- predict(ct, training,type = c("prob"))
predictions.probs[,2]## 1 3 4 5 7 9
## 0.54347826 0.84810127 0.17518248 0.36842105 0.17518248 0.84810127
## 10 12 13 14 15 16
## 0.05970149 0.84810127 0.05970149 0.84810127 0.84810127 0.17518248
## 17 19 20 21 22 23
## 0.36842105 0.17518248 0.36842105 0.36842105 0.17518248 0.84810127
## 24 26 27 29 30 31
## 0.78571429 0.78571429 0.54347826 0.54347826 0.36842105 0.36842105
## 32 33 35 36 37 38
## 0.84810127 0.05970149 0.05970149 0.05970149 0.78571429 0.17518248
## 39 40 42 44 46 47
## 0.17518248 0.36842105 0.36842105 0.84810127 0.84810127 0.54347826
## 48 50 52 53 54 55
## 0.17518248 0.05970149 0.05970149 0.05970149 0.84810127 0.54347826
## 56 58 60 62 63 64
## 0.05970149 0.17518248 0.17518248 0.78571429 0.05970149 0.05970149
## 65 66 67 68 69 70
## 0.36842105 0.17518248 0.36842105 0.36842105 0.05970149 0.54347826
## 72 73 74 75 77 79
## 0.36842105 0.78571429 0.36842105 0.17518248 0.17518248 0.36842105
## 81 82 83 84 85 87
## 0.05970149 0.05970149 0.17518248 0.05970149 0.36842105 0.17518248
## 89 90 91 93 94 95
## 0.78571429 0.05970149 0.05970149 0.17518248 0.05970149 0.05970149
## 96 98 99 101 103 104
## 0.54347826 0.05970149 0.17518248 0.84810127 0.05970149 0.05970149
## 106 109 110 112 113 115
## 0.36842105 0.17518248 0.17518248 0.54347826 0.17518248 0.84810127
## 118 119 121 123 124 126
## 0.17518248 0.17518248 0.84810127 0.17518248 0.05970149 0.17518248
## 127 130 132 133 135 136
## 0.36842105 0.17518248 0.78571429 0.84810127 0.05970149 0.36842105
## 137 138 139 140 141 142
## 0.17518248 0.17518248 0.36842105 0.17518248 0.05970149 0.17518248
## 143 146 147 149 151 152
## 0.17518248 0.05970149 0.17518248 0.54347826 0.36842105 0.05970149
## 154 155 156 157 158 159
## 0.54347826 0.84810127 0.54347826 0.05970149 0.05970149 0.17518248
## 161 162 164 169 170 171
## 0.54347826 0.17518248 0.17518248 0.36842105 0.36842105 0.17518248
## 172 173 174 175 176 177
## 0.36842105 0.17518248 0.17518248 0.17518248 0.84810127 0.17518248
## 178 181 184 185 186 187
## 0.36842105 0.05970149 0.05970149 0.05970149 0.84810127 0.84810127
## 188 189 190 191 192 194
## 0.36842105 0.78571429 0.36842105 0.05970149 0.78571429 0.78571429
## 195 196 197 199 201 202
## 0.05970149 0.84810127 0.05970149 0.36842105 0.36842105 0.36842105
## 203 204 205 206 207 208
## 0.05970149 0.05970149 0.17518248 0.05970149 0.84810127 0.84810127
## 209 211 213 215 216 217
## 0.17518248 0.05970149 0.84810127 0.78571429 0.54347826 0.36842105
## 218 219 220 221 224 226
## 0.36842105 0.17518248 0.36842105 0.84810127 0.36842105 0.17518248
## 228 229 230 233 234 236
## 0.84810127 0.84810127 0.36842105 0.05970149 0.36842105 0.84810127
## 237 238 240 241 243 244
## 0.84810127 0.84810127 0.05970149 0.17518248 0.05970149 0.05970149
## 245 246 249 250 251 252
## 0.54347826 0.84810127 0.78571429 0.36842105 0.17518248 0.36842105
## 254 255 256 261 263 265
## 0.17518248 0.05970149 0.36842105 0.84810127 0.17518248 0.36842105
## 266 267 269 275 276 277
## 0.17518248 0.36842105 0.05970149 0.17518248 0.17518248 0.05970149
## 278 279 280 281 282 286
## 0.05970149 0.05970149 0.05970149 0.54347826 0.78571429 0.05970149
## 287 288 290 291 293 295
## 0.54347826 0.36842105 0.17518248 0.17518248 0.36842105 0.84810127
## 296 297 298 299 301 302
## 0.54347826 0.54347826 0.36842105 0.17518248 0.84810127 0.54347826
## 305 306 307 310 311 313
## 0.54347826 0.36842105 0.84810127 0.36842105 0.05970149 0.54347826
## 315 319 321 322 323 324
## 0.36842105 0.36842105 0.05970149 0.36842105 0.05970149 0.54347826
## 325 327 328 329 332 333
## 0.36842105 0.36842105 0.84810127 0.17518248 0.17518248 0.84810127
## 334 339 340 342 343 344
## 0.05970149 0.54347826 0.84810127 0.05970149 0.17518248 0.36842105
## 347 348 349 350 352 355
## 0.36842105 0.05970149 0.05970149 0.17518248 0.36842105 0.17518248
## 356 358 359 360 361 362
## 0.84810127 0.78571429 0.17518248 0.84810127 0.84810127 0.84810127
## 363 364 366 371 372 373
## 0.17518248 0.54347826 0.17518248 0.84810127 0.05970149 0.17518248
## 374 375 377 378 380 381
## 0.17518248 0.36842105 0.05970149 0.17518248 0.17518248 0.17518248
## 382 383 385 387 388 389
## 0.05970149 0.05970149 0.05970149 0.36842105 0.17518248 0.54347826
## 393 394 395 396 398 399
## 0.05970149 0.05970149 0.84810127 0.05970149 0.36842105 0.05970149
## 400 404 405 407 409 410
## 0.84810127 0.17518248 0.84810127 0.36842105 0.84810127 0.84810127
## 411 412 413 416 417 418
## 0.17518248 0.36842105 0.36842105 0.84810127 0.05970149 0.54347826
## 422 423 425 427 428 429
## 0.05970149 0.17518248 0.54347826 0.05970149 0.84810127 0.36842105
## 430 431 433 436 437 438
## 0.17518248 0.05970149 0.17518248 0.36842105 0.78571429 0.54347826
## 439 440 441 442 443 444
## 0.05970149 0.17518248 0.84810127 0.17518248 0.36842105 0.17518248
## 445 446 447 448 449 450
## 0.36842105 0.84810127 0.05970149 0.17518248 0.17518248 0.36842105
## 451 452 453 454 455 456
## 0.05970149 0.36842105 0.17518248 0.05970149 0.17518248 0.84810127
## 457 458 459 460 461 462
## 0.05970149 0.17518248 0.54347826 0.05970149 0.05970149 0.05970149
## 463 464 465 467 469 470
## 0.17518248 0.05970149 0.05970149 0.05970149 0.78571429 0.54347826
## 471 474 475 476 478 479
## 0.54347826 0.36842105 0.36842105 0.05970149 0.05970149 0.05970149
## 481 484 485 486 487 488
## 0.84810127 0.17518248 0.54347826 0.36842105 0.36842105 0.84810127
## 490 493 494 495 496 497
## 0.84810127 0.17518248 0.36842105 0.05970149 0.84810127 0.05970149
## 499 500 501 502 504 506
## 0.84810127 0.54347826 0.05970149 0.17518248 0.17518248 0.17518248
## 507 508 511 512 514 516
## 0.84810127 0.36842105 0.17518248 0.05970149 0.05970149 0.84810127
## 517 519 520 521 522 523
## 0.54347826 0.17518248 0.05970149 0.05970149 0.36842105 0.05970149
## 524 526 527 528 529 530
## 0.78571429 0.05970149 0.05970149 0.05970149 0.36842105 0.05970149
## 531 532 533 534 535 536
## 0.36842105 0.17518248 0.17518248 0.17518248 0.17518248 0.36842105
## 537 538 539 541 542 544
## 0.17518248 0.05970149 0.36842105 0.17518248 0.36842105 0.17518248
## 546 547 548 549 550 551
## 0.84810127 0.84810127 0.36842105 0.84810127 0.84810127 0.05970149
## 552 553 554 556 557 558
## 0.17518248 0.05970149 0.17518248 0.05970149 0.17518248 0.05970149
## 559 560 564 565 568 569
## 0.17518248 0.17518248 0.05970149 0.17518248 0.17518248 0.54347826
## 573 574 575 576 578 579
## 0.36842105 0.17518248 0.36842105 0.36842105 0.36842105 0.05970149
## 581 582 583 585 587 588
## 0.54347826 0.05970149 0.05970149 0.78571429 0.78571429 0.05970149
## 589 590 591 592 594 597
## 0.84810127 0.05970149 0.78571429 0.36842105 0.17518248 0.17518248
## 598 599 602 603 604 606
## 0.05970149 0.84810127 0.05970149 0.05970149 0.54347826 0.36842105
## 607 608 609 611 614 615
## 0.84810127 0.05970149 0.54347826 0.17518248 0.17518248 0.78571429
## 617 618 619 620 621 622
## 0.36842105 0.05970149 0.78571429 0.36842105 0.36842105 0.05970149
## 629 630 631 632 633 635
## 0.36842105 0.05970149 0.05970149 0.17518248 0.05970149 0.05970149
## 636 639 640 641 642 643
## 0.17518248 0.17518248 0.05970149 0.17518248 0.36842105 0.54347826
## 644 645 646 648 649 650
## 0.17518248 0.05970149 0.54347826 0.84810127 0.78571429 0.05970149
## 651 652 654 655 657 658
## 0.05970149 0.36842105 0.05970149 0.17518248 0.05970149 0.36842105
## 659 660 662 664 666 669
## 0.78571429 0.17518248 0.84810127 0.54347826 0.36842105 0.17518248
## 670 672 673 674 675 676
## 0.54347826 0.05970149 0.17518248 0.36842105 0.17518248 0.84810127
## 678 679 681 682 683 685
## 0.17518248 0.36842105 0.05970149 0.84810127 0.17518248 0.05970149
## 686 687 690 692 694 695
## 0.36842105 0.05970149 0.54347826 0.84810127 0.36842105 0.05970149
## 696 697 700 701 705 706
## 0.36842105 0.84810127 0.36842105 0.36842105 0.36842105 0.17518248
## 708 710 712 713 714 715
## 0.36842105 0.17518248 0.36842105 0.78571429 0.05970149 0.17518248
## 716 717 718 719 720 721
## 0.84810127 0.84810127 0.05970149 0.17518248 0.17518248 0.17518248
## 722 723 724 725 726 727
## 0.36842105 0.54347826 0.36842105 0.36842105 0.36842105 0.36842105
## 729 730 731 732 733 734
## 0.84810127 0.17518248 0.36842105 0.78571429 0.84810127 0.17518248
## 735 736 738 739 740 741
## 0.05970149 0.17518248 0.17518248 0.17518248 0.17518248 0.78571429
## 742 743 744 745 746 747
## 0.17518248 0.36842105 0.78571429 0.54347826 0.17518248 0.54347826
## 748 750 752 753 754 756
## 0.17518248 0.84810127 0.36842105 0.05970149 0.84810127 0.36842105
## 757 758 759 760 762 763
## 0.36842105 0.36842105 0.17518248 0.84810127 0.84810127 0.05970149
## 764 765 766 768
## 0.17518248 0.36842105 0.05970149 0.17518248