library(dlookr)
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(UpSetR)
## 
## Attaching package: 'UpSetR'
## 
## The following object is masked from 'package:lattice':
## 
##     histogram
library(naniar)

library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following object is masked from 'package:dlookr':
## 
##     describe
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## 
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
library(RWeka)
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(tree)
library(partykit)
## Loading required package: libcoin
## Loading required package: mvtnorm
library(maptree)
## Loading required package: cluster
## Loading required package: rpart
library(blorr)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following object is masked from 'package:colorspace':
## 
##     coords
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart.plot)
library(sjPlot)
#read the dataset
df <- read.csv("E:/MScDataScience/predictive/logistic.csv")

head(df)
summary(df) #summarise dataset
##        ID            Cohort            admission        enrolled        
##  Min.   :298447   Length:350         Min.   :0.0000   Length:350        
##  1st Qu.:299703   Class :character   1st Qu.:1.0000   Class :character  
##  Median :300013   Mode  :character   Median :1.0000   Mode  :character  
##  Mean   :300014                      Mean   :0.7743                     
##  3rd Qu.:300335                      3rd Qu.:1.0000                     
##  Max.   :301592                      Max.   :1.0000                     
##                                                                         
##      HSGPA             ACT              Graduated     
##  Min.   :  0.000   Length:350         Min.   :0.0000  
##  1st Qu.:  3.120   Class :character   1st Qu.:0.0000  
##  Median :  3.380   Mode  :character   Median :0.0000  
##  Mean   :  7.107                      Mean   :0.3992  
##  3rd Qu.:  3.690                      3rd Qu.:1.0000  
##  Max.   :999.000                      Max.   :1.0000  
##                                       NA's   :112
library(flextable)
## 
## Attaching package: 'flextable'
## The following object is masked from 'package:partykit':
## 
##     width
## The following object is masked from 'package:purrr':
## 
##     compose
diagnose(df) %>% flextable() #diagnose df

variables

types

missing_count

missing_percent

unique_count

unique_rate

ID

integer

0

0

319

0.911428571

Cohort

character

0

0

5

0.014285714

admission

integer

0

0

2

0.005714286

enrolled

character

0

0

3

0.008571429

HSGPA

numeric

0

0

127

0.362857143

ACT

character

0

0

26

0.074285714

Graduated

integer

112

32

3

0.008571429

Hmisc::describe(df) 
## df 
## 
##  7  Variables      350  Observations
## --------------------------------------------------------------------------------
## ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      350        0      319        1    3e+05    535.3   299202   299349 
##      .25      .50      .75      .90      .95 
##   299703   300013   300335   300580   300797 
## 
## lowest : 298447 298720 298955 299005 299021, highest: 301142 301194 301303 301313 301592
## --------------------------------------------------------------------------------
## Cohort 
##        n  missing distinct 
##      350        0        5 
## 
## lowest : Fall2015   Fall2016   Fall2018   Spring2019 Summer2017
## highest: Fall2015   Fall2016   Fall2018   Spring2019 Summer2017
##                                                                  
## Value        Fall2015   Fall2016   Fall2018 Spring2019 Summer2017
## Frequency          17         61        105        166          1
## Proportion      0.049      0.174      0.300      0.474      0.003
## --------------------------------------------------------------------------------
## admission 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      350        0        2    0.524      271   0.7743   0.3505 
## 
## --------------------------------------------------------------------------------
## enrolled 
##        n  missing distinct 
##      350        0        3 
##                             
## Value          0     1   n/a
## Frequency    189   160     1
## Proportion 0.540 0.457 0.003
## --------------------------------------------------------------------------------
## HSGPA 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      350        0      127        1    7.107    8.129     2.67     2.85 
##      .25      .50      .75      .90      .95 
##     3.12     3.38     3.69     3.95     4.00 
## 
## lowest :   0.00   2.26   2.42   2.55   2.56, highest:   3.98   3.99   4.00 333.00 999.00
##                             
## Value          0   330  1000
## Frequency    348     1     1
## Proportion 0.994 0.003 0.003
## 
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## ACT 
##        n  missing distinct 
##      350        0       26 
## 
## lowest : 10   11   12   13   14  , highest: 31   32   33   34   NULL
## --------------------------------------------------------------------------------
## Graduated 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##      238      112        2     0.72       95   0.3992   0.4817 
## 
## --------------------------------------------------------------------------------
view_df(df, show.frq = T, show.prc = T, show.na = T)
Data frame: df
ID Name Label missings Values Value Labels Freq. %
1 ID 0 (0.00%) range: 298447-301592
2 Cohort 0 (0.00%) <output omitted> <output omitted> <output omitted>
3 admission 0 (0.00%) range: 0-1
4 enrolled 0 (0.00%) <output omitted> <output omitted> <output omitted>
5 HSGPA 0 (0.00%) range: 0.0-999.0
6 ACT 0 (0.00%) <output omitted> <output omitted> <output omitted>
7 Graduated 112 (32.00%) range: 0-1

From the data summary, admission and graduated are integers. However from the data description, these variables should be represented as factors. Again, ACT is represented as character, however these are scores and should be represented as numbers. Again, enrolled is represented at character which should be factor. Thus, re-formatting the data types

#before changing data types copy the original data df to df1
df1 <- df
df1$admission <- as.factor(df1$admission)
df1$Graduated <- as.factor(df1$Graduated)
df1$enrolled <- as.factor(df1$enrolled)
df1$ACT <- as.numeric(df1$ACT)
## Warning: NAs introduced by coercion
df1$Cohort <- as.character(df1$Cohort)
diagnose(df1[, -1]) %>% flextable()  #remove id

variables

types

missing_count

missing_percent

unique_count

unique_rate

Cohort

character

0

0.000000

5

0.014285714

admission

factor

0

0.000000

2

0.005714286

enrolled

factor

0

0.000000

3

0.008571429

HSGPA

numeric

0

0.000000

127

0.362857143

ACT

numeric

8

2.285714

26

0.074285714

Graduated

factor

112

32.000000

3

0.008571429

print(view_df(df1[, -1], show.frq = T, show.prc = T, show.na = T))
diagnose_category(df1[, -1]) %>% flextable()

variables

levels

N

freq

ratio

rank

Cohort

Spring2019

350

166

47.4285714

1

Cohort

Fall2018

350

105

30.0000000

2

Cohort

Fall2016

350

61

17.4285714

3

Cohort

Fall2015

350

17

4.8571429

4

Cohort

Summer2017

350

1

0.2857143

5

admission

1

350

271

77.4285714

1

admission

0

350

79

22.5714286

2

enrolled

0

350

189

54.0000000

1

enrolled

1

350

160

45.7142857

2

enrolled

n/a

350

1

0.2857143

3

Graduated

0

350

143

40.8571429

1

Graduated

350

112

32.0000000

2

Graduated

1

350

95

27.1428571

3

From the categorical summary, we see that enrolled has n/a level which occurs only 1 and thus, it can be removed and again, Graduated had 112 NAs

diagnose_numeric(df1[, -1])
md.pattern(df1[,-1])

##     Cohort admission enrolled HSGPA ACT Graduated    
## 232      1         1        1     1   1         1   0
## 110      1         1        1     1   1         0   1
## 6        1         1        1     1   0         1   1
## 2        1         1        1     1   0         0   2
##          0         0        0     0   8       112 120
funModeling::status(df1[, -1]) %>% flextable()

variable

q_zeros

p_zeros

q_na

p_na

q_inf

p_inf

type

unique

Cohort

0

0.00000000

0

0.00000000

0

0

character

5

admission

79

0.22571429

0

0.00000000

0

0

factor

2

enrolled

189

0.54000000

0

0.00000000

0

0

factor

3

HSGPA

8

0.02285714

0

0.00000000

0

0

numeric

127

ACT

0

0.00000000

8

0.02285714

0

0

numeric

25

Graduated

143

0.40857143

112

0.32000000

0

0

factor

2

plot_outlier(df1[, -1])

There are significant outliers in the HSGPA scores which affects the distribution. Removal of these outliers leads to improved distribution

Detect outliers using boxplot methods. Boxplots are a popular and an easy method for identifying outliers. There are two categories of outlier: (1) outliers and (2) extreme points.

Values above Q3 + 1.5xIQR or below Q1 - 1.5xIQR are considered as outliers. Values above Q3 + 3xIQR or below Q1 - 3xIQR are considered as extreme points (or extreme outliers).

df1[, -1] %>% identify_outliers(HSGPA)

#outlier removal

df2 <- df1 %>% rownames_to_column(var = "index") #create an index to identify and remove extreme outliers
df2$index <- as.numeric(df2$index)
head(df2)
# Remove outliers
df2_outlier <- df2 %>% identify_outliers(HSGPA) %>% filter(is.extreme == "TRUE") #get extreme outliers

df3 <- df2 %>% 
  anti_join(df2_outlier, by = "index")

df3
plot_outlier(df3[, -c(1,2)])

df4 <- df3[, -c(1,2)] #The dataset without outliers
#lets select only admitted students and no enrollment
df5 <- df4 %>% filter(admission == 1) 
df5 <- df5 %>% filter(enrolled == 1)
md.pattern(df5)

##     Cohort admission enrolled HSGPA ACT Graduated   
## 135      1         1        1     1   1         1  0
## 18       1         1        1     1   1         0  1
## 3        1         1        1     1   0         1  1
##          0         0        0     0   3        18 21
view_df(df5, show.frq = T, show.prc = T, show.na = T)
Data frame: df5
ID Name Label missings Values Value Labels Freq. %
1 Cohort 0 (0.00%) <output omitted> <output omitted> <output omitted>
2 admission 0 (0.00%) 0
1
0
156
0.00
100.00
3 enrolled 0 (0.00%) 0
1
n/a
0
156
0
0.00
100.00
0.00
4 HSGPA 0 (0.00%) range: 2.4-4.0
5 ACT 3 (1.92%) range: 14-34
6 Graduated 18 (11.54%) 0
1
48
90
34.78
65.22
df5 <- df5[, -c(2,3)]
df5 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
        upper = list(combo = 'box'),
        diag = list(discrete = wrap('barDiag', position = 'fill')),
        lower = list(combo = 'dot_no_facet')) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values (`geom_point()`).
## Warning: Removed 21 rows containing missing values (`geom_point()`).

# Remove nas
naomit_df6 <- na.omit(df5)

md.pattern(naomit_df6)
##  /\     /\
## {  `---'  }
## {  O   O  }
## ==>  V <==  No need for mice. This data set is completely observed.
##  \  \|/  /
##   `-----'

##     Cohort HSGPA ACT Graduated  
## 135      1     1   1         1 0
##          0     0   0         0 0
naomit_df6 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
        upper = list(combo = 'box'),
        diag = list(discrete = wrap('barDiag', position = 'fill')),
        lower = list(combo = 'dot_no_facet')) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

view_df(naomit_df6, show.frq = T, show.prc = T, show.na = T)
Data frame: naomit_df6
ID Name Label missings Values Value Labels Freq. %
1 Cohort 0 (0.00%) <output omitted> <output omitted> <output omitted>
2 HSGPA 0 (0.00%) range: 2.4-4.0
3 ACT 0 (0.00%) range: 14-34
4 Graduated 0 (0.00%) 0
1
48
87
35.56
64.44

model building

naomit_df6 <- naomit_df6 %>% mutate(grad = case_when(Graduated == 0 ~ "No",
                        Graduated == 1 ~ "Yes")) #recode graduated as yes and no
new_df <- naomit_df6[, -c(1,4)] #remove admission and graduated
oj_trControl = trainControl(
   method = "cv",
   number = 10,
   savePredictions = "final",       # save preds for the optimal tuning parameter
   classProbs = TRUE,  # class probs in addition to preds
   summaryFunction = twoClassSummary
   )
# build a tree model
set.seed(1234)
tree_mod1 <- train(
   grad ~ ., 
   data = new_df, 
   method = "rpart",
   tuneLength = 5,
   metric = "ROC",
   trControl = oj_trControl
   )
print(tree_mod1)
## CART 
## 
## 135 samples
##   2 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 122, 121, 122, 122, 123, ... 
## Resampling results across tuning parameters:
## 
##   cp        ROC        Sens   Spec     
##   0.000000  0.5398958  0.320  0.7458333
##   0.015625  0.5310069  0.255  0.7819444
##   0.031250  0.5323958  0.295  0.7680556
##   0.046875  0.4993403  0.235  0.7583333
##   0.062500  0.4722569  0.085  0.8625000
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
tree_mod1$finalModel
## n= 135 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 135 48 Yes (0.35555556 0.64444444)  
##    2) ACT< 25.5 67 32 Yes (0.47761194 0.52238806)  
##      4) HSGPA>=3.14 55 26 No (0.52727273 0.47272727)  
##        8) HSGPA< 3.375 25  8 No (0.68000000 0.32000000) *
##        9) HSGPA>=3.375 30 12 Yes (0.40000000 0.60000000)  
##         18) ACT< 21.5 16  7 No (0.56250000 0.43750000) *
##         19) ACT>=21.5 14  3 Yes (0.21428571 0.78571429) *
##      5) HSGPA< 3.14 12  3 Yes (0.25000000 0.75000000) *
##    3) ACT>=25.5 68 16 Yes (0.23529412 0.76470588)  
##      6) HSGPA< 3.345 21  8 Yes (0.38095238 0.61904762)  
##       12) HSGPA>=3.275 9  4 No (0.55555556 0.44444444) *
##       13) HSGPA< 3.275 12  3 Yes (0.25000000 0.75000000) *
##      7) HSGPA>=3.345 47  8 Yes (0.17021277 0.82978723)  
##       14) HSGPA>=3.755 25  7 Yes (0.28000000 0.72000000)  
##         28) ACT< 28.5 9  4 No (0.55555556 0.44444444) *
##         29) ACT>=28.5 16  2 Yes (0.12500000 0.87500000) *
##       15) HSGPA< 3.755 22  1 Yes (0.04545455 0.95454545) *
rpart.plot(tree_mod1$finalModel)

treemod1_cart1 <- bind_cols(
   predict(tree_mod1, newdata = new_df, type = "prob"),
   Predicted = predict(tree_mod1, newdata = new_df, type = "raw"),
   Actual = new_df$grad
)

treemod1_cart1$Actual <- as.factor(treemod1_cart1$Actual)
treemod1_cart2 <- confusionMatrix(treemod1_cart1$Predicted, treemod1_cart1$Actual)
treemod1_cart2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  36  23
##        Yes 12  64
##                                           
##                Accuracy : 0.7407          
##                  95% CI : (0.6583, 0.8123)
##     No Information Rate : 0.6444          
##     P-Value [Acc > NIR] : 0.01098         
##                                           
##                   Kappa : 0.4619          
##                                           
##  Mcnemar's Test P-Value : 0.09097         
##                                           
##             Sensitivity : 0.7500          
##             Specificity : 0.7356          
##          Pos Pred Value : 0.6102          
##          Neg Pred Value : 0.8421          
##              Prevalence : 0.3556          
##          Detection Rate : 0.2667          
##    Detection Prevalence : 0.4370          
##       Balanced Accuracy : 0.7428          
##                                           
##        'Positive' Class : No              
## 
mdl_auc <- Metrics::auc(actual = treemod1_cart1$Actual == "No", treemod1_cart1$No)
yardstick::roc_curve(treemod1_cart1, Actual, No) %>%
  autoplot() +
  labs(
    title = "tree1 ROC Curve ",
    subtitle = paste0("AUC = ", round(mdl_auc, 4))
  )
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the yardstick package.
##   Please report the issue at <]8;;https://github.com/tidymodels/yardstick/issueshttps://github.com/tidymodels/yardstick/issues]8;;>.

plot(varImp(tree_mod1), main="Variable Importance with tree1")

# build a tree model
set.seed(1234)
logit_mod1 <- train(
   grad ~ ., 
   data = new_df, 
   method = "glm",
   tuneLength = 5,
   metric = "ROC",
   trControl = oj_trControl
   )

summary(logit_mod1)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9823  -1.2127   0.7090   0.9473   1.5178  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -3.48769    2.03670  -1.712  0.08682 . 
## HSGPA        0.33657    0.57120   0.589  0.55570   
## ACT          0.11633    0.04248   2.738  0.00618 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 175.72  on 134  degrees of freedom
## Residual deviance: 165.90  on 132  degrees of freedom
## AIC: 171.9
## 
## Number of Fisher Scoring iterations: 4
logitmod1_cart1 <- bind_cols(
   predict(logit_mod1, newdata = new_df, type = "prob"),
   Predicted = predict(logit_mod1, newdata = new_df, type = "raw"),
   Actual = new_df$grad
)

logitmod1_cart1$Actual <- as.factor(logitmod1_cart1$Actual)
logitmod1_cart2 <- confusionMatrix(logitmod1_cart1$Predicted, logitmod1_cart1$Actual)
logitmod1_cart2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  11  10
##        Yes 37  77
##                                           
##                Accuracy : 0.6519          
##                  95% CI : (0.5651, 0.7317)
##     No Information Rate : 0.6444          
##     P-Value [Acc > NIR] : 0.4676342       
##                                           
##                   Kappa : 0.1307          
##                                           
##  Mcnemar's Test P-Value : 0.0001491       
##                                           
##             Sensitivity : 0.22917         
##             Specificity : 0.88506         
##          Pos Pred Value : 0.52381         
##          Neg Pred Value : 0.67544         
##              Prevalence : 0.35556         
##          Detection Rate : 0.08148         
##    Detection Prevalence : 0.15556         
##       Balanced Accuracy : 0.55711         
##                                           
##        'Positive' Class : No              
## 
log_auc <- Metrics::auc(actual = logitmod1_cart1$Actual == "No", logitmod1_cart1$No)
yardstick::roc_curve(logitmod1_cart1, Actual, No) %>%
  autoplot() +
  labs(
    title = "logistic ROC Curve ",
    subtitle = paste0("AUC = ", round(log_auc, 4))
  )

library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
scores_and_ntiles <- prepare_scores_and_ntiles(datasets=list("new_df","new_df"),
                      dataset_labels = list("train data","test data"),
                      models = list("tree_mod1","logit_mod1"),
                      model_labels = list("decision tree","logistic"),
                      target_column="grad",
                      ntiles=100)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the modelplotr package.
##   Please report the issue at <]8;;https://github.com/jurrr/modelplotr/issueshttps://github.com/jurrr/modelplotr/issues]8;;>.
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## Data preparation step 1 succeeded! Dataframe created.
# transform data generated with prepare_scores_and_ntiles into aggregated data for chosen plotting scope 
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
                             select_model_label = "decision tree",
                             select_dataset_label = "train data")
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the modelplotr package.
##   Please report the issue at <]8;;https://github.com/jurrr/modelplotr/issueshttps://github.com/jurrr/modelplotr/issues]8;;>.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## No comparison specified, default values are used. 
## 
## Single evaluation line will be plotted: Target value "No" plotted for dataset "train data" and model "decision tree.
## "
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
# plot the cumulative gains plot
plot_cumgains(data = plot_input)

# plot the cumulative gains plot and annotate the plot at percentile = 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##  
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data. 
##  
## 

# plot the cumulative lift plot and annotate the plot at percentile = 20
plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##  
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model. 
##  
## 

# set plotting scope to model comparison
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope = "compare_models")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
## 
## Models "decision tree", "logistic" compared for dataset "train data" and target value "No".
# plot the cumulative response plot and annotate the plot at ntile 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##  
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data. 
## - When we select 20% with the highest probability according to model logistic, this selection holds 29% of all No cases in train data. 
##  
## 

plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##  
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model. 
## - When we select 20% with the highest probability according to model logistic in train data, this selection for No cases is 1.5 times better than selecting without a model. 
##  
## 

#Converting Yes to 1 and No to 0 in df1 

logrec_df <- logitmod1_cart1
treerec_df <- treemod1_cart1

logrec_df$Predicted<-ifelse(logrec_df$Predicted=="Yes",1,0)
logrec_df$Actual<-ifelse(logrec_df$Actual=="Yes",1,0)


treerec_df$Predicted<-ifelse(treerec_df$Predicted=="Yes",1,0)
treerec_df$Actual<-ifelse(treerec_df$Actual=="Yes",1,0)
library(InformationValue)
## 
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
## 
##     confusionMatrix, precision, recall, sensitivity, specificity
ks_stat(logrec_df$Actual, logrec_df$Predicted)
## [1] 0.1588
ks_stat(logrec_df$Actual, logrec_df$Predicted, returnKSTable = T)
ks_plot(logrec_df$Actual, logrec_df$Predicted)

ks_stat(treerec_df$Actual, treerec_df$Predicted)
## [1] 0.4813
ks_plot(treerec_df$Actual, treerec_df$Predicted)