library(DT) # For Data Tables
library(lattice) # The lattice add-on of Trellis graphics for R
library(knitr) # For Dynamic Report Generation in R
library(gplots) # Various R Programming Tools for Plotting Data
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(ggplot2) # An Implementation of the Grammar of Graphics
library(ClustOfVar) # Clustering of variables
library(ape) # Analyses of Phylogenetics and Evolution (as.phylo)
library(Information) # Data Exploration with Information Theory (Weight-of-Evidence and Information Value)
##
## Attaching package: 'Information'
## The following object is masked from 'package:ape':
##
## is.binary
library(ROCR) # Model Performance and ROC curve
library(caret) # Classification and Regression Training - for any machine learning algorithms
library(rpart) # Recursive partitioning for classification, regression and survival trees
library(rpart.utils) # Tools for parsing and manipulating rpart objects, including generating machine readable rules
library(rpart.plot) # Plot 'rpart' Models: An Enhanced Version of 'plot.rpart'
library(randomForest)# Leo Breiman and Cutler's Random Forests for Classification and Regression
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(party) # A computational toolbox for recursive partitioning - Conditional inference Trees
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:ape':
##
## where
library(bnlearn) # Bayesian Network Structure Learning, Parameter Learning and Inference
##
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
##
## sigma
library(DAAG) # Data Analysis and Graphics Data and Functions
library(vcd) # Visualizing Categorical Data
library(kernlab) # Support Vector Machine
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:modeltools':
##
## prior
## The following object is masked from 'package:ggplot2':
##
## alpha
# The following libraries are loaded only for models 8 and 9:
#library(neuralnet) # Neural Network
#library(lars) # For Least Angle Regression, Lasso and Forward Stagewise
#library(glmnet) # Lasso and Elastic-Net Regularized Generalized Linear Models
library(nnet) # Feed-Forward Neural Networks and Multinomial Log-Linear Models
library(NeuralNetTools) # Visualization and Analysis Tools for Neural Networks
library(mlbench) # Machine Learning Benchmark Problems (source of the BreastCancer data)
data(BreastCancer)
str(BreastCancer)
## 'data.frame': 699 obs. of 11 variables:
## $ Id : chr "1000025" "1002945" "1015425" "1016277" ...
## $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
BreastCancer$Id <- NULL
BreastCancer$Cl.thickness <- as.numeric(BreastCancer$Cl.thickness)
BreastCancer$Cell.size <- as.numeric(BreastCancer$Cell.size)
BreastCancer$Cell.shape <- as.numeric(BreastCancer$Cell.shape)
BreastCancer$Marg.adhesion <- as.numeric(BreastCancer$Marg.adhesion)
BreastCancer$Epith.c.size <- as.numeric(BreastCancer$Epith.c.size)
BreastCancer$Bare.nuclei <- as.numeric(BreastCancer$Bare.nuclei)
BreastCancer$Bl.cromatin <- as.numeric(BreastCancer$Bl.cromatin)
BreastCancer$Normal.nucleoli <- as.numeric(BreastCancer$Normal.nucleoli)
BreastCancer$Mitoses <- as.numeric(BreastCancer$Mitoses)
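# Note: as.numeric() on a factor returns the level index, not the label value;
# the two coincide here only because each factor's levels are already in
# natural order ("1", "2", ..., "10"). A sketch of the more defensive idiom:
# BreastCancer$Bare.nuclei <- as.numeric(as.character(BreastCancer$Bare.nuclei))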
BreastCancer <- na.omit(BreastCancer) # drop the 16 rows with missing values
str(BreastCancer)
## 'data.frame': 683 obs. of 10 variables:
## $ Cl.thickness : num 5 5 3 6 4 8 1 2 2 4 ...
## $ Cell.size : num 1 4 1 8 1 10 1 1 1 2 ...
## $ Cell.shape : num 1 4 1 8 1 10 1 2 1 1 ...
## $ Marg.adhesion : num 1 5 1 1 3 8 1 1 1 1 ...
## $ Epith.c.size : num 2 7 2 3 2 7 2 2 2 2 ...
## $ Bare.nuclei : num 1 10 2 4 1 10 10 1 1 1 ...
## $ Bl.cromatin : num 3 3 3 3 3 9 3 3 1 2 ...
## $ Normal.nucleoli: num 1 2 1 7 1 7 1 1 1 1 ...
## $ Mitoses : num 1 1 1 1 1 1 1 1 5 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
set.seed(123) # fix the random seed for reproducibility
ind <- sample(2, nrow(BreastCancer), replace=TRUE, prob=c(0.6, 0.4)) # 60/40 split
train <- BreastCancer[ind==1,]
test <- BreastCancer[ind==2,]
str(train)
## 'data.frame': 411 obs. of 10 variables:
## $ Cl.thickness : num 5 3 8 1 2 4 2 1 8 4 ...
## $ Cell.size : num 1 1 10 1 1 2 1 1 7 1 ...
## $ Cell.shape : num 1 1 10 1 1 1 1 1 5 1 ...
## $ Marg.adhesion : num 1 1 8 1 1 1 1 1 10 1 ...
## $ Epith.c.size : num 2 2 7 2 2 2 2 2 7 2 ...
## $ Bare.nuclei : num 1 2 10 10 1 1 1 3 9 1 ...
## $ Bl.cromatin : num 3 3 9 3 1 2 2 3 5 2 ...
## $ Normal.nucleoli: num 1 1 7 1 1 1 1 1 5 1 ...
## $ Mitoses : num 1 1 1 1 5 1 1 1 4 1 ...
## $ Class : Factor w/ 2 levels "benign","malignant": 1 1 2 1 1 1 1 1 2 1 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:16] 24 41 140 146 159 165 236 250 276 293 ...
## .. ..- attr(*, "names")= chr [1:16] "24" "41" "140" "146" ...
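# Optional sanity check (a sketch, not run above): confirm the random split
# roughly preserves the benign/malignant mix in both partitions.
prop.table(table(train$Class))
prop.table(table(test$Class))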
library(stats)
# Model m1: Stepwise Logistic Regression
m1 <- glm(Class~.,data=train,family=binomial())
m1 <- step(m1)
## Start: AIC=87.55
## Class ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion +
## Epith.c.size + Bare.nuclei + Bl.cromatin + Normal.nucleoli +
## Mitoses
##
## Df Deviance AIC
## - Cell.size 1 68.005 86.005
## - Bl.cromatin 1 68.378 86.378
## <none> 67.545 87.545
## - Normal.nucleoli 1 69.587 87.587
## - Cell.shape 1 69.845 87.845
## - Mitoses 1 70.495 88.495
## - Marg.adhesion 1 70.548 88.548
## - Epith.c.size 1 71.519 89.519
## - Cl.thickness 1 77.448 95.448
## - Bare.nuclei 1 77.965 95.965
##
## Step: AIC=86.01
## Class ~ Cl.thickness + Cell.shape + Marg.adhesion + Epith.c.size +
## Bare.nuclei + Bl.cromatin + Normal.nucleoli + Mitoses
##
## Df Deviance AIC
## - Bl.cromatin 1 68.800 84.800
## - Normal.nucleoli 1 69.615 85.615
## - Cell.shape 1 69.973 85.973
## <none> 68.005 86.005
## - Marg.adhesion 1 70.634 86.634
## - Mitoses 1 70.635 86.635
## - Epith.c.size 1 71.529 87.529
## - Cl.thickness 1 77.481 93.481
## - Bare.nuclei 1 78.064 94.064
##
## Step: AIC=84.8
## Class ~ Cl.thickness + Cell.shape + Marg.adhesion + Epith.c.size +
## Bare.nuclei + Normal.nucleoli + Mitoses
##
## Df Deviance AIC
## <none> 68.800 84.800
## - Normal.nucleoli 1 71.185 85.185
## - Cell.shape 1 71.193 85.193
## - Marg.adhesion 1 71.483 85.483
## - Mitoses 1 71.688 85.688
## - Epith.c.size 1 74.073 88.073
## - Cl.thickness 1 79.044 93.044
## - Bare.nuclei 1 82.885 96.885
summary(m1)
##
## Call:
## glm(formula = Class ~ Cl.thickness + Cell.shape + Marg.adhesion +
## Epith.c.size + Bare.nuclei + Normal.nucleoli + Mitoses, family = binomial(),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.96097 -0.13932 -0.07928 0.03354 2.51104
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.2316 1.2422 -7.432 1.07e-13 ***
## Cl.thickness 0.4426 0.1500 2.951 0.003170 **
## Cell.shape 0.3250 0.2243 1.449 0.147273
## Marg.adhesion 0.2649 0.1679 1.577 0.114786
## Epith.c.size 0.4530 0.2046 2.214 0.026852 *
## Bare.nuclei 0.4159 0.1147 3.625 0.000288 ***
## Normal.nucleoli 0.2190 0.1469 1.491 0.135953
## Mitoses 0.4813 0.3025 1.591 0.111582
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 538.3 on 410 degrees of freedom
## Residual deviance: 68.8 on 403 degrees of freedom
## AIC: 84.8
##
## Number of Fisher Scoring iterations: 8
# List the variables significant at p-value < 0.01
significant.variables <- summary(m1)$coeff[-1,4] < 0.01
names(significant.variables)[significant.variables == TRUE]
## [1] "Cl.thickness" "Bare.nuclei"
prob <- predict(m1, type = "response")
res <- residuals(m1, type = "deviance")
#Plot Residuals
plot(predict(m1), res,
xlab="Fitted values", ylab = "Residuals",
ylim = max(abs(res)) * c(-1,1))

# Score the test data set
test$m1_score <- predict(m1, newdata = test, type = 'response')
m1_pred <- prediction(test$m1_score, test$Class)
m1_perf <- performance(m1_pred,"tpr","fpr")
#ROC
plot(m1_perf, lwd=2, colorize=TRUE, main="ROC m1: Logistic Regression Performance")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Print, plot variable importance
print(varImp(m1, scale = FALSE))
## Overall
## Cl.thickness 2.950789
## Cell.shape 1.449232
## Marg.adhesion 1.577043
## Epith.c.size 2.213662
## Bare.nuclei 3.625448
## Normal.nucleoli 1.491033
## Mitoses 1.591125
# Plot precision/recall curve
m1_perf_precision <- performance(m1_pred, measure = "prec", x.measure = "rec")
plot(m1_perf_precision, main="m1 Logistic:Precision/recall curve")

# Plot accuracy as function of threshold
m1_perf_acc <- performance(m1_pred, measure = "acc")
plot(m1_perf_acc, main="m1 Logistic:Accuracy as function of threshold")

#KS, Gini & AUC m1
m1_KS <- round(max(attr(m1_perf,'y.values')[[1]]-attr(m1_perf,'x.values')[[1]])*100, 2)
m1_AUROC <- round(performance(m1_pred, measure = "auc")@y.values[[1]]*100, 2)
m1_Gini <- (2*m1_AUROC - 100)
cat("AUROC: ",m1_AUROC,"\tKS: ", m1_KS, "\tGini:", m1_Gini, "\n")
## AUROC: 99.53 KS: 95.05 Gini: 99.06
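# A minimal sketch (assuming the conventional 0.5 probability cutoff): turn
# the m1 scores into class labels and tabulate them against the truth with
# caret's confusionMatrix().
m1_class <- factor(ifelse(test$m1_score > 0.5, "malignant", "benign"),
                   levels = levels(test$Class))
confusionMatrix(m1_class, test$Class)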
library(rpart)
m2 <- rpart(Class~.,data=train)
# Print tree detail
printcp(m2)
##
## Classification tree:
## rpart(formula = Class ~ ., data = train)
##
## Variables actually used in tree construction:
## [1] Bare.nuclei Cell.size
##
## Root node error: 149/411 = 0.36253
##
## n= 411
##
## CP nsplit rel error xerror xstd
## 1 0.805369 0 1.00000 1.00000 0.065409
## 2 0.073826 1 0.19463 0.22819 0.037480
## 3 0.020134 2 0.12081 0.16107 0.031905
## 4 0.010000 3 0.10067 0.16107 0.031905
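# Optional sketch: prune m2 back to the complexity parameter with the lowest
# cross-validated error (xerror) in the cp table above; here that selects the
# two-split tree, which matches the minimum xerror at lower complexity.
m2_cp <- m2$cptable[which.min(m2$cptable[, "xerror"]), "CP"]
m2_pruned <- prune(m2, cp = m2_cp)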
# Tree plot
plot(m2, main="Tree:Recursive Partitioning")
text(m2)

# A clearer version of the tree plot
prp(m2,type=2,extra=1, main="Tree:Recursive Partitioning")

# Score the test data
test$m2_score <- predict(m2, newdata = test, type = 'prob')
m2_pred <- prediction(test$m2_score[,2],test$Class)
m2_perf <- performance(m2_pred,"tpr","fpr")
# Model performance plot
plot(m2_perf, lwd=2, colorize=TRUE, main="ROC m2: Traditional Recursive Partitioning")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m2_perf_precision <- performance(m2_pred, measure = "prec", x.measure = "rec")
plot(m2_perf_precision, main="m2 Recursive Partitioning:Precision/recall curve")

# Plot accuracy as function of threshold
m2_perf_acc <- performance(m2_pred, measure = "acc")
plot(m2_perf_acc, main="m2 Recursive Partitioning:Accuracy as function of threshold")

# KS, Gini & AUC m2
m2_AUROC <- round(performance(m2_pred, measure = "auc")@y.values[[1]]*100, 2)
m2_KS <- round(max(attr(m2_perf,'y.values')[[1]]-attr(m2_perf,'x.values')[[1]])*100, 2)
m2_Gini <- (2*m2_AUROC - 100)
cat("AUROC: ",m2_AUROC,"\tKS: ", m2_KS, "\tGini:", m2_Gini, "\n")
## AUROC: 95.65 KS: 90.05 Gini: 91.3
library(randomForest)
m3 <- randomForest(Class ~ ., data = train)
m3_fitForest <- predict(m3, newdata = test, type="prob")[,2]
m3_pred <- prediction( m3_fitForest, test$Class)
m3_perf <- performance(m3_pred, "tpr", "fpr")
#plot variable importance
varImpPlot(m3, main="Random Forest: Variable Importance")

# Model Performance plot
plot(m3_perf,colorize=TRUE, lwd=2, main = "m3 ROC: Random Forest", col = "blue")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m3_perf_precision <- performance(m3_pred, measure = "prec", x.measure = "rec")
plot(m3_perf_precision, main="m3 Random Forests:Precision/recall curve")

# Plot accuracy as function of threshold
m3_perf_acc <- performance(m3_pred, measure = "acc")
plot(m3_perf_acc, main="m3 Random Forests:Accuracy as function of threshold")

# KS, Gini & AUC m3
m3_AUROC <- round(performance(m3_pred, measure = "auc")@y.values[[1]]*100, 2)
m3_KS <- round(max(attr(m3_perf,'y.values')[[1]] - attr(m3_perf,'x.values')[[1]])*100, 2)
m3_Gini <- (2*m3_AUROC - 100)
cat("AUROC: ",m3_AUROC,"\tKS: ", m3_KS, "\tGini:", m3_Gini, "\n")
## AUROC: 99.62 KS: 94.51 Gini: 99.24
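# Optional sketch: the fitted forest also carries an internal out-of-bag (OOB)
# error estimate that needs no test set; plot(m3) shows it stabilizing as
# trees are added.
print(m3)
plot(m3, main = "m3 Random Forest: error vs. number of trees")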
library(party)
set.seed(123456742)
m3_1 <- cforest(Class~., control = cforest_unbiased(mtry = 2, ntree = 50), data = train)
# Variable Importance
kable(as.data.frame(varimp(m3_1)))
|                | varimp(m3_1)|
|:---------------|------------:|
|Cl.thickness    |    0.0140397|
|Cell.size       |    0.0585430|
|Cell.shape      |    0.0688742|
|Marg.adhesion   |    0.0083444|
|Epith.c.size    |    0.0072848|
|Bare.nuclei     |    0.0349669|
|Bl.cromatin     |    0.0263576|
|Normal.nucleoli |    0.0275497|
|Mitoses         |   -0.0007947|
# Model Summary
summary(m3_1)
## Length Class Mode
## 1 RandomForest S4
# Model Performance
m3_1_fitForest <- do.call("rbind", treeresponse(m3_1, newdata = test))[,2]
m3_1_pred <- prediction(m3_1_fitForest, test$Class)
m3_1_perf <- performance(m3_1_pred, "tpr", "fpr")
# Model Performance Plot
plot(m3_1_perf, colorize=TRUE, lwd=2, main = " m3_1 ROC: Conditional Random Forests")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m3_1_perf_precision <- performance(m3_1_pred, measure = "prec", x.measure = "rec")
plot(m3_1_perf_precision, main="m3_1 Conditional Random Forests:Precision/recall curve")

# Plot accuracy as function of threshold
m3_1_perf_acc <- performance(m3_1_pred, measure = "acc")
plot(m3_1_perf_acc, main="m3_1 Conditional Random Forests:Accuracy as function of threshold")

# KS, Gini & AUC m3_1
m3_1_AUROC <- round(performance(m3_1_pred, measure = "auc")@y.values[[1]]*100, 2)
m3_1_KS <- round(max(attr(m3_1_perf,'y.values')[[1]] - attr(m3_1_perf,'x.values')[[1]])*100, 2)
m3_1_Gini <- (2*m3_1_AUROC - 100)
cat("AUROC: ",m3_1_AUROC,"\tKS: ", m3_1_KS, "\tGini:", m3_1_Gini, "\n")
## AUROC: 99.62 KS: 94.51 Gini: 99.24
#library(party)
m4 <- ctree(Class~.,data=train)
plot(m4, main="m4: Conditional inference Tree",col="blue")

resultdfr <- as.data.frame(do.call("rbind", treeresponse(m4, newdata = test)))
test$m4_score <- resultdfr[,2]
m4_pred <- prediction(test$m4_score,test$Class)
m4_perf <- performance(m4_pred,"tpr","fpr")
# Model Performance
plot(m4_perf, colorize=TRUE, lwd=2, main="ROC m4: Conditional inference Tree")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m4_perf_precision <- performance(m4_pred, measure = "prec", x.measure = "rec")
plot(m4_perf_precision, main="m4 CIT:Plot precision/recall curve")

# Plot accuracy as function of threshold
m4_perf_acc <- performance(m4_pred, measure = "acc")
plot(m4_perf_acc, main="m4 CIT:Plot accuracy as function of threshold")

# KS, Gini & AUC m4
m4_AUROC <- round(performance(m4_pred, measure = "auc")@y.values[[1]]*100, 2)
m4_KS <- round(max(attr(m4_perf,'y.values')[[1]]-attr(m4_perf,'x.values')[[1]])*100, 2)
m4_Gini <- (2*m4_AUROC - 100)
cat("AUROC: ",m4_AUROC,"\tKS: ", m4_KS, "\tGini:", m4_Gini, "\n")
## AUROC: 97.66 KS: 91.71 Gini: 95.32
library(kernlab) #for SVM
# Basic Model
m7_1 <- ksvm(Class ~ ., data = train, kernel = "vanilladot")
## Setting default kernel parameters
m7_1_pred <- predict(m7_1, test[,1:9], type="response")
head(m7_1_pred)
## [1] malignant malignant benign benign benign benign
## Levels: benign malignant
#Model accuracy:
table(m7_1_pred, test$Class)
##
## m7_1_pred benign malignant
## benign 176 4
## malignant 6 86
# Agreement: count of correctly classified test cases
m7_1_accuracy <- (m7_1_pred == test$Class)
sum(m7_1_accuracy)
## [1] 262
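# The same agreement expressed as a rate:
mean(m7_1_accuracy) # 262 of 272 test cases, i.e. about 0.963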
# Compute the prediction scores
m7_1_score <- predict(m7_1,test, type="decision")
m7_1_pred <- prediction(m7_1_score, test$Class)
# Plot ROC curve
m7_1_perf <- performance(m7_1_pred, measure = "tpr", x.measure = "fpr")
plot(m7_1_perf, colorize=TRUE, lwd=2, main="m7_1 SVM:Plot ROC curve - Vanilladot")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m7_1_perf_precision <- performance(m7_1_pred, measure = "prec", x.measure = "rec")
plot(m7_1_perf_precision, main="m7_1 SVM:Plot precision/recall curve")

# Plot accuracy as function of threshold
m7_1_perf_acc <- performance(m7_1_pred, measure = "acc")
plot(m7_1_perf_acc, main="m7_1 SVM:Plot accuracy as function of threshold")

# Model Performance
# KS, Gini & AUC m7_1
m7_1_AUROC <- round(performance(m7_1_pred, measure = "auc")@y.values[[1]]*100, 2)
m7_1_KS <- round(max(attr(m7_1_perf,'y.values')[[1]]-attr(m7_1_perf,'x.values')[[1]])*100, 2)
m7_1_Gini <- (2*m7_1_AUROC - 100)
cat("AUROC: ",m7_1_AUROC,"\tKS: ", m7_1_KS, "\tGini:", m7_1_Gini, "\n")
## AUROC: 99.68 KS: 96.15 Gini: 99.36
library(kernlab)
# Model Improvement with Gaussian RBF kernel
m7_2 <- ksvm(Class ~ ., data = train, kernel = "rbfdot")
m7_2_pred <- predict(m7_2, test[,1:9], type="response")
head(m7_2_pred)
## [1] malignant malignant benign benign benign benign
## Levels: benign malignant
# Model accuracy:
table(m7_2_pred, test$Class)
##
## m7_2_pred benign malignant
## benign 169 2
## malignant 13 88
# Compute the prediction scores
m7_2_score <- predict(m7_2,test, type="decision")
m7_2_pred <- prediction(m7_2_score, test$Class)
# Plot ROC curve
m7_2_perf <- performance(m7_2_pred, measure = "tpr", x.measure = "fpr")
plot(m7_2_perf, colorize=TRUE, lwd=2, main="SVM:Plot ROC curve - RBF", col="blue")
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);
lines(x=c(1, 0), y=c(0, 1), col="green", lwd=1, lty=4)

# Plot precision/recall curve
m7_2_perf_precision <- performance(m7_2_pred, measure = "prec", x.measure = "rec")
plot(m7_2_perf_precision, main="m7_2 SVM:Plot precision/recall curve")

# Model Performance
# KS, Gini & AUC m7_2
m7_2_AUROC <- round(performance(m7_2_pred, measure = "auc")@y.values[[1]]*100, 2)
m7_2_KS <- round(max(attr(m7_2_perf,'y.values')[[1]]-attr(m7_2_perf,'x.values')[[1]])*100, 2)
m7_2_Gini <- (2*m7_2_AUROC - 100)
cat("AUROC: ",m7_2_AUROC,"\tKS: ", m7_2_KS, "\tGini:", m7_2_Gini, "\n")
## AUROC: 99.09 KS: 92.31 Gini: 98.18
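# Optional sketch: ksvm() can also report a k-fold cross-validation error on
# the training data via its 'cross' argument, which helps compare kernels
# without repeatedly scoring the test set.
m7_2_cv <- ksvm(Class ~ ., data = train, kernel = "rbfdot", cross = 5)
cross(m7_2_cv)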
# ROC Comparison
plot(m7_1_perf, col='blue', lty=1, main='SVM: Model Performance Comparison (m7 ROC)')
plot(m7_2_perf, col='green', lty=2, add=TRUE); # RBF kernel SVM
legend(0.5,0.4,
c("m7_1: SVM vanilladot", "m7_2: SVM RBF kernel"),
col=c('blue', 'green'),
lwd=3);
lines(x=c(0, 1), y=c(0, 1), col="red", lwd=1, lty=3);# random line

#Compare ROC Performance of Models
plot(m1_perf, col='blue', lty=1, main='ROCs: Model Performance Comparison') # logistic regression
plot(m2_perf, col='gold',lty=2, add=TRUE); # simple tree
plot(m3_perf, col='green',add=TRUE,lty=4); # random forest
plot(m4_perf, col='dark gray',add=TRUE,lty=5); # Conditional Inference Tree
plot(m7_2_perf, col='black', add=TRUE, lty=6); # Support Vector Machine (SVM)
legend(0.6,0.5,
c('m1: logistic regression','m2: recursive partitioning',
'm3: random forest', 'm4: conditional inference tree', 'm7_2: SVM'),
col=c('blue','gold','green','dark gray','black'),
lty=c(1,2,4,5,6),
lwd=3);
lines(c(0,1),c(0,1),col = "gray", lty = 4 ) # random line
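# Convenience sketch: gather the headline metrics computed above into a single
# comparison table.
models_compare <- data.frame(
  Model = c("m1: Logistic Regression", "m2: Recursive Partitioning",
            "m3: Random Forest", "m4: Conditional Inference Tree",
            "m7_1: SVM (linear)", "m7_2: SVM (RBF)"),
  AUROC = c(m1_AUROC, m2_AUROC, m3_AUROC, m4_AUROC, m7_1_AUROC, m7_2_AUROC),
  KS    = c(m1_KS, m2_KS, m3_KS, m4_KS, m7_1_KS, m7_2_KS),
  Gini  = c(m1_Gini, m2_Gini, m3_Gini, m4_Gini, m7_1_Gini, m7_2_Gini))
kable(models_compare)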
