library(dlookr)
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library(UpSetR)
##
## Attaching package: 'UpSetR'
##
## The following object is masked from 'package:lattice':
##
## histogram
library(naniar)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following object is masked from 'package:dlookr':
##
## describe
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(RWeka)
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tree)
library(partykit)
## Loading required package: libcoin
## Loading required package: mvtnorm
library(maptree)
## Loading required package: cluster
## Loading required package: rpart
library(blorr)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:colorspace':
##
## coords
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart.plot)
library(sjPlot)
#read the dataset
df <- read.csv("E:/MScDataScience/predictive/logistic.csv")
head(df)
summary(df) #summarise dataset
## ID Cohort admission enrolled
## Min. :298447 Length:350 Min. :0.0000 Length:350
## 1st Qu.:299703 Class :character 1st Qu.:1.0000 Class :character
## Median :300013 Mode :character Median :1.0000 Mode :character
## Mean :300014 Mean :0.7743
## 3rd Qu.:300335 3rd Qu.:1.0000
## Max. :301592 Max. :1.0000
##
## HSGPA ACT Graduated
## Min. : 0.000 Length:350 Min. :0.0000
## 1st Qu.: 3.120 Class :character 1st Qu.:0.0000
## Median : 3.380 Mode :character Median :0.0000
## Mean : 7.107 Mean :0.3992
## 3rd Qu.: 3.690 3rd Qu.:1.0000
## Max. :999.000 Max. :1.0000
## NA's :112
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:partykit':
##
## width
## The following object is masked from 'package:purrr':
##
## compose
diagnose(df) %>% flextable() #diagnose df
| variables | types     | missing_count | missing_percent | unique_count | unique_rate |
|-----------|-----------|---------------|-----------------|--------------|-------------|
| ID        | integer   | 0             | 0               | 319          | 0.911428571 |
| Cohort    | character | 0             | 0               | 5            | 0.014285714 |
| admission | integer   | 0             | 0               | 2            | 0.005714286 |
| enrolled  | character | 0             | 0               | 3            | 0.008571429 |
| HSGPA     | numeric   | 0             | 0               | 127          | 0.362857143 |
| ACT       | character | 0             | 0               | 26           | 0.074285714 |
| Graduated | integer   | 112           | 32              | 3            | 0.008571429 |
Hmisc::describe(df)
## df
##
## 7 Variables 350 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 350 0 319 1 3e+05 535.3 299202 299349
## .25 .50 .75 .90 .95
## 299703 300013 300335 300580 300797
##
## lowest : 298447 298720 298955 299005 299021, highest: 301142 301194 301303 301313 301592
## --------------------------------------------------------------------------------
## Cohort
## n missing distinct
## 350 0 5
##
## lowest : Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
## highest: Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
##
## Value Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
## Frequency 17 61 105 166 1
## Proportion 0.049 0.174 0.300 0.474 0.003
## --------------------------------------------------------------------------------
## admission
## n missing distinct Info Sum Mean Gmd
## 350 0 2 0.524 271 0.7743 0.3505
##
## --------------------------------------------------------------------------------
## enrolled
## n missing distinct
## 350 0 3
##
## Value 0 1 n/a
## Frequency 189 160 1
## Proportion 0.540 0.457 0.003
## --------------------------------------------------------------------------------
## HSGPA
## n missing distinct Info Mean Gmd .05 .10
## 350 0 127 1 7.107 8.129 2.67 2.85
## .25 .50 .75 .90 .95
## 3.12 3.38 3.69 3.95 4.00
##
## lowest : 0.00 2.26 2.42 2.55 2.56, highest: 3.98 3.99 4.00 333.00 999.00
##
## Value 0 330 1000
## Frequency 348 1 1
## Proportion 0.994 0.003 0.003
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## ACT
## n missing distinct
## 350 0 26
##
## lowest : 10 11 12 13 14 , highest: 31 32 33 34 NULL
## --------------------------------------------------------------------------------
## Graduated
## n missing distinct Info Sum Mean Gmd
## 238 112 2 0.72 95 0.3992 0.4817
##
## --------------------------------------------------------------------------------
view_df(df, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings     | Values               | Freq.            | %                |
|----|-----------|--------------|----------------------|------------------|------------------|
| 1  | ID        | 0 (0.00%)    | range: 298447-301592 |                  |                  |
| 2  | Cohort    | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 3  | admission | 0 (0.00%)    | range: 0-1           |                  |                  |
| 4  | enrolled  | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 5  | HSGPA     | 0 (0.00%)    | range: 0.0-999.0     |                  |                  |
| 6  | ACT       | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 7  | Graduated | 112 (32.00%) | range: 0-1           |                  |                  |
From the data summary, admission and Graduated are stored as integers, but the data description indicates they should be factors. ACT is stored as character even though it contains numeric scores, and enrolled is likewise a character variable that should be a factor. We therefore re-format the data types.
#before changing data types copy the original data df to df1
df1 <- df
df1$admission <- as.factor(df1$admission)
df1$Graduated <- as.factor(df1$Graduated)
df1$enrolled <- as.factor(df1$enrolled)
df1$ACT <- as.numeric(df1$ACT)
## Warning: NAs introduced by coercion
df1$Cohort <- as.character(df1$Cohort)
diagnose(df1[, -1]) %>% flextable() #remove id
| variables | types     | missing_count | missing_percent | unique_count | unique_rate |
|-----------|-----------|---------------|-----------------|--------------|-------------|
| Cohort    | character | 0             | 0.000000        | 5            | 0.014285714 |
| admission | factor    | 0             | 0.000000        | 2            | 0.005714286 |
| enrolled  | factor    | 0             | 0.000000        | 3            | 0.008571429 |
| HSGPA     | numeric   | 0             | 0.000000        | 127          | 0.362857143 |
| ACT       | numeric   | 8             | 2.285714        | 26           | 0.074285714 |
| Graduated | factor    | 112           | 32.000000       | 3            | 0.008571429 |
print(view_df(df1[, -1], show.frq = T, show.prc = T, show.na = T))
diagnose_category(df1[, -1]) %>% flextable()
| variables | levels     | N   | freq | ratio      | rank |
|-----------|------------|-----|------|------------|------|
| Cohort    | Spring2019 | 350 | 166  | 47.4285714 | 1    |
| Cohort    | Fall2018   | 350 | 105  | 30.0000000 | 2    |
| Cohort    | Fall2016   | 350 | 61   | 17.4285714 | 3    |
| Cohort    | Fall2015   | 350 | 17   | 4.8571429  | 4    |
| Cohort    | Summer2017 | 350 | 1    | 0.2857143  | 5    |
| admission | 1          | 350 | 271  | 77.4285714 | 1    |
| admission | 0          | 350 | 79   | 22.5714286 | 2    |
| enrolled  | 0          | 350 | 189  | 54.0000000 | 1    |
| enrolled  | 1          | 350 | 160  | 45.7142857 | 2    |
| enrolled  | n/a        | 350 | 1    | 0.2857143  | 3    |
| Graduated | 0          | 350 | 143  | 40.8571429 | 1    |
| Graduated | <NA>       | 350 | 112  | 32.0000000 | 2    |
| Graduated | 1          | 350 | 95   | 27.1428571 | 3    |
From the categorical summary, enrolled has an n/a level that occurs only once, so it can be removed; Graduated also has 112 NAs.
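As a minimal sketch (the object name df1_nona is illustrative), the lone n/a row and its now-empty level could be dropped directly; the pipeline below reaches the same point through outlier removal and filtering on enrolled == 1:
df1_nona <- df1 %>% filter(enrolled != "n/a") %>% droplevels() #drop the single n/a row and the unused factor level
table(df1_nona$enrolled) #sanity check: only levels 0 and 1 remain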
diagnose_numeric(df1[, -1])
md.pattern(df1[,-1])
## Cohort admission enrolled HSGPA ACT Graduated
## 232 1 1 1 1 1 1 0
## 110 1 1 1 1 1 0 1
## 6 1 1 1 1 0 1 1
## 2 1 1 1 1 0 0 2
## 0 0 0 0 8 112 120
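Each row of the md.pattern() matrix is a missingness pattern (1 = observed, 0 = missing): 232 rows are complete, 110 are missing Graduated only, 6 are missing ACT only, and 2 are missing both; the right-hand column counts missing variables per pattern and the bottom row counts missing values per variable (8 for ACT, 112 for Graduated, 120 missing cells in total).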
funModeling::status(df1[, -1]) %>% flextable()
| variable  | q_zeros | p_zeros    | q_na | p_na       | q_inf | p_inf | type      | unique |
|-----------|---------|------------|------|------------|-------|-------|-----------|--------|
| Cohort    | 0       | 0.00000000 | 0    | 0.00000000 | 0     | 0     | character | 5      |
| admission | 79      | 0.22571429 | 0    | 0.00000000 | 0     | 0     | factor    | 2      |
| enrolled  | 189     | 0.54000000 | 0    | 0.00000000 | 0     | 0     | factor    | 3      |
| HSGPA     | 8       | 0.02285714 | 0    | 0.00000000 | 0     | 0     | numeric   | 127    |
| ACT       | 0       | 0.00000000 | 8    | 0.02285714 | 0     | 0     | numeric   | 25     |
| Graduated | 143     | 0.40857143 | 112  | 0.32000000 | 0     | 0     | factor    | 2      |
plot_outlier(df1[, -1])
There are significant outliers in the HSGPA scores (values of 333 and 999 on what should be a 0-4 GPA scale) which distort the distribution; removing them improves the distribution considerably.
We detect outliers with the boxplot method, a popular and easy approach that distinguishes two categories: (1) ordinary outliers and (2) extreme points.
Values above Q3 + 1.5×IQR or below Q1 - 1.5×IQR are considered outliers; values above Q3 + 3×IQR or below Q1 - 3×IQR are considered extreme points (extreme outliers).
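For reference, these fences can be computed by hand; a minimal sketch for HSGPA (identify_outliers() below applies the same rule):
q1 <- quantile(df1$HSGPA, 0.25, names = FALSE) #first quartile
q3 <- quantile(df1$HSGPA, 0.75, names = FALSE) #third quartile
iqr <- q3 - q1
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr) #ordinary outlier fences
c(lower = q1 - 3 * iqr, upper = q3 + 3 * iqr) #extreme-point fences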
df1[, -1] %>% identify_outliers(HSGPA)
#outlier removal
df2 <- df1 %>% rownames_to_column(var = "index") #create an index to identify and remove extreme outliers
df2$index <- as.numeric(df2$index)
head(df2)
# Remove outliers
df2_outlier <- df2 %>% identify_outliers(HSGPA) %>% filter(is.extreme) #keep only the extreme outliers (is.extreme is already logical)
df3 <- df2 %>%
anti_join(df2_outlier, by = "index")
df3
plot_outlier(df3[, -c(1,2)])
df4 <- df3[, -c(1,2)] #the dataset without extreme outliers (index and ID columns dropped)
#select only students who were both admitted and enrolled
df5 <- df4 %>% filter(admission == 1)
df5 <- df5 %>% filter(enrolled == 1)
md.pattern(df5)
## Cohort admission enrolled HSGPA ACT Graduated
## 135 1 1 1 1 1 1 0
## 18 1 1 1 1 1 0 1
## 3 1 1 1 1 0 1 1
## 0 0 0 0 3 18 21
view_df(df5, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings    | Values           | Freq.            | %                  |
|----|-----------|-------------|------------------|------------------|--------------------|
| 1  | Cohort    | 0 (0.00%)   | <output omitted> | <output omitted> | <output omitted>   |
| 2  | admission | 0 (0.00%)   | 0; 1             | 0; 156           | 0.00; 100.00       |
| 3  | enrolled  | 0 (0.00%)   | 0; 1; n/a        | 0; 156; 0        | 0.00; 100.00; 0.00 |
| 4  | HSGPA     | 0 (0.00%)   | range: 2.4-4.0   |                  |                    |
| 5  | ACT       | 3 (1.92%)   | range: 14-34     |                  |                    |
| 6  | Graduated | 18 (11.54%) | 0; 1             | 48; 90           | 34.78; 65.22       |
df5 <- df5[, -c(2,3)] #drop admission and enrolled, which are now constant
df5 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
upper = list(combo = 'box'),
diag = list(discrete = wrap('barDiag', position = 'fill')),
lower = list(combo = 'dot_no_facet')) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values (`geom_point()`).
## Warning: Removed 21 rows containing missing values (`geom_point()`).
# Remove remaining NAs (complete-case analysis)
naomit_df6 <- na.omit(df5)
md.pattern(naomit_df6)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Cohort HSGPA ACT Graduated
## 135 1 1 1 1 0
## 0 0 0 0 0
naomit_df6 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
upper = list(combo = 'box'),
diag = list(discrete = wrap('barDiag', position = 'fill')),
lower = list(combo = 'dot_no_facet')) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
view_df(naomit_df6, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings  | Values           | Freq.            | %                |
|----|-----------|-----------|------------------|------------------|------------------|
| 1  | Cohort    | 0 (0.00%) | <output omitted> | <output omitted> | <output omitted> |
| 2  | HSGPA     | 0 (0.00%) | range: 2.4-4.0   |                  |                  |
| 3  | ACT       | 0 (0.00%) | range: 14-34     |                  |                  |
| 4  | Graduated | 0 (0.00%) | 0; 1             | 48; 87           | 35.56; 64.44     |
naomit_df6 <- naomit_df6 %>% mutate(grad = case_when(Graduated == 0 ~ "No",
                                                     Graduated == 1 ~ "Yes")) #recode Graduated as No/Yes: caret's classProbs option needs factor levels that are valid R names
new_df <- naomit_df6[, -c(1,4)] #drop Cohort and the original Graduated, keeping HSGPA, ACT and grad
oj_trControl = trainControl(
method = "cv",
number = 10,
savePredictions = "final", # save preds for the optimal tuning parameter
classProbs = TRUE, # class probs in addition to preds
summaryFunction = twoClassSummary
)
# build a tree model
set.seed(1234)
tree_mod1 <- train(
grad ~ .,
data = new_df,
method = "rpart",
tuneLength = 5,
metric = "ROC",
trControl = oj_trControl
)
print(tree_mod1)
## CART
##
## 135 samples
## 2 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 122, 121, 122, 122, 123, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.000000 0.5398958 0.320 0.7458333
## 0.015625 0.5310069 0.255 0.7819444
## 0.031250 0.5323958 0.295 0.7680556
## 0.046875 0.4993403 0.235 0.7583333
## 0.062500 0.4722569 0.085 0.8625000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
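caret also provides a ggplot method for train objects, so the ROC-versus-cp profile behind this table can be visualised in one line:
ggplot(tree_mod1) #cross-validated ROC (y-axis) across the candidate cp values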
tree_mod1$finalModel
## n= 135
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 135 48 Yes (0.35555556 0.64444444)
## 2) ACT< 25.5 67 32 Yes (0.47761194 0.52238806)
## 4) HSGPA>=3.14 55 26 No (0.52727273 0.47272727)
## 8) HSGPA< 3.375 25 8 No (0.68000000 0.32000000) *
## 9) HSGPA>=3.375 30 12 Yes (0.40000000 0.60000000)
## 18) ACT< 21.5 16 7 No (0.56250000 0.43750000) *
## 19) ACT>=21.5 14 3 Yes (0.21428571 0.78571429) *
## 5) HSGPA< 3.14 12 3 Yes (0.25000000 0.75000000) *
## 3) ACT>=25.5 68 16 Yes (0.23529412 0.76470588)
## 6) HSGPA< 3.345 21 8 Yes (0.38095238 0.61904762)
## 12) HSGPA>=3.275 9 4 No (0.55555556 0.44444444) *
## 13) HSGPA< 3.275 12 3 Yes (0.25000000 0.75000000) *
## 7) HSGPA>=3.345 47 8 Yes (0.17021277 0.82978723)
## 14) HSGPA>=3.755 25 7 Yes (0.28000000 0.72000000)
## 28) ACT< 28.5 9 4 No (0.55555556 0.44444444) *
## 29) ACT>=28.5 16 2 Yes (0.12500000 0.87500000) *
## 15) HSGPA< 3.755 22 1 Yes (0.04545455 0.95454545) *
rpart.plot(tree_mod1$finalModel)
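To read the tree, consider a hypothetical applicant with ACT = 27 and HSGPA = 3.1 (values chosen purely for illustration): the case flows root → node 3 (ACT >= 25.5) → node 6 (HSGPA < 3.345) → node 13 (HSGPA < 3.275), which predicts Yes with probability 0.75. A quick check:
predict(tree_mod1, newdata = data.frame(HSGPA = 3.1, ACT = 27), type = "prob") #hypothetical student; lands in node 13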
treemod1_cart1 <- bind_cols(
predict(tree_mod1, newdata = new_df, type = "prob"),
Predicted = predict(tree_mod1, newdata = new_df, type = "raw"),
Actual = new_df$grad
)
treemod1_cart1$Actual <- as.factor(treemod1_cart1$Actual)
treemod1_cart2 <- confusionMatrix(treemod1_cart1$Predicted, treemod1_cart1$Actual)
treemod1_cart2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 36 23
## Yes 12 64
##
## Accuracy : 0.7407
## 95% CI : (0.6583, 0.8123)
## No Information Rate : 0.6444
## P-Value [Acc > NIR] : 0.01098
##
## Kappa : 0.4619
##
## Mcnemar's Test P-Value : 0.09097
##
## Sensitivity : 0.7500
## Specificity : 0.7356
## Pos Pred Value : 0.6102
## Neg Pred Value : 0.8421
## Prevalence : 0.3556
## Detection Rate : 0.2667
## Detection Prevalence : 0.4370
## Balanced Accuracy : 0.7428
##
## 'Positive' Class : No
##
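A quick hand check against the matrix (with 'No' as the positive class): accuracy = (36 + 64) / 135 ≈ 0.7407, sensitivity = 36 / (36 + 12) = 0.75, and specificity = 64 / (64 + 23) ≈ 0.7356, matching the reported values. Note these are resubstitution estimates, since the model is scored on the same data it was trained on.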
mdl_auc <- Metrics::auc(actual = treemod1_cart1$Actual == "No", treemod1_cart1$No)
yardstick::roc_curve(treemod1_cart1, Actual, No) %>%
autoplot() +
labs(
title = "tree1 ROC Curve ",
subtitle = paste0("AUC = ", round(mdl_auc, 4))
)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the yardstick package.
## Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
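Since pROC is already attached, the AUC can be cross-checked independently of Metrics; levels and direction are set explicitly so that 'No' is treated as the event, matching the calculation above:
roc_tree <- pROC::roc(response = treemod1_cart1$Actual, predictor = treemod1_cart1$No,
                      levels = c("Yes", "No"), direction = "<")
pROC::auc(roc_tree) #should agree with the AUC in the plot subtitle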
plot(varImp(tree_mod1), main="Variable Importance with tree1")
# build a logistic regression model
set.seed(1234)
logit_mod1 <- train(
grad ~ .,
data = new_df,
method = "glm",
tuneLength = 5,
metric = "ROC",
trControl = oj_trControl
)
summary(logit_mod1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9823 -1.2127 0.7090 0.9473 1.5178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.48769 2.03670 -1.712 0.08682 .
## HSGPA 0.33657 0.57120 0.589 0.55570
## ACT 0.11633 0.04248 2.738 0.00618 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 175.72 on 134 degrees of freedom
## Residual deviance: 165.90 on 132 degrees of freedom
## AIC: 171.9
##
## Number of Fisher Scoring iterations: 4
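The ACT coefficient is easier to read on the odds scale: exp(0.11633) ≈ 1.12, so each additional ACT point multiplies the odds of graduating by about 1.12, holding HSGPA fixed. A minimal sketch:
exp(coef(logit_mod1$finalModel)) #coefficients as odds ratios
exp(confint.default(logit_mod1$finalModel)) #Wald intervals on the odds-ratio scale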
logitmod1_cart1 <- bind_cols(
predict(logit_mod1, newdata = new_df, type = "prob"),
Predicted = predict(logit_mod1, newdata = new_df, type = "raw"),
Actual = new_df$grad
)
logitmod1_cart1$Actual <- as.factor(logitmod1_cart1$Actual)
logitmod1_cart2 <- confusionMatrix(logitmod1_cart1$Predicted, logitmod1_cart1$Actual)
logitmod1_cart2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 11 10
## Yes 37 77
##
## Accuracy : 0.6519
## 95% CI : (0.5651, 0.7317)
## No Information Rate : 0.6444
## P-Value [Acc > NIR] : 0.4676342
##
## Kappa : 0.1307
##
## Mcnemar's Test P-Value : 0.0001491
##
## Sensitivity : 0.22917
## Specificity : 0.88506
## Pos Pred Value : 0.52381
## Neg Pred Value : 0.67544
## Prevalence : 0.35556
## Detection Rate : 0.08148
## Detection Prevalence : 0.15556
## Balanced Accuracy : 0.55711
##
## 'Positive' Class : No
##
log_auc <- Metrics::auc(actual = logitmod1_cart1$Actual == "No", logitmod1_cart1$No)
yardstick::roc_curve(logitmod1_cart1, Actual, No) %>%
autoplot() +
labs(
title = "logistic ROC Curve ",
subtitle = paste0("AUC = ", round(log_auc, 4))
)
library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
#note: both dataset slots point to new_df, so "test data" here is the training data relabelled
scores_and_ntiles <- prepare_scores_and_ntiles(datasets=list("new_df","new_df"),
dataset_labels = list("train data","test data"),
models = list("tree_mod1","logit_mod1"),
model_labels = list("decision tree","logistic"),
target_column="grad",
ntiles=100)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## Data preparation step 1 succeeded! Dataframe created.
# transform data generated with prepare_scores_and_ntiles into aggregated data for chosen plotting scope
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
select_model_label = "decision tree",
select_dataset_label = "train data")
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "No" plotted for dataset "train data" and model "decision tree".
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
# plot the cumulative gains plot
plot_cumgains(data = plot_input)
# plot the cumulative gains plot and annotate the plot at percentile = 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data.
##
##
# plot the cumulative lift plot and annotate the plot at percentile = 20
plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model.
##
##
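The lift value follows directly from the gains figure: the top 20% captured 40% of the No cases, and 0.40 / 0.20 = 2.0, the cumulative lift reported above.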
# set plotting scope to model comparison
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope = "compare_models")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## Models "decision tree", "logistic" compared for dataset "train data" and target value "No".
# plot the cumulative gains plot, now comparing the two models, and annotate at ntile 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data.
## - When we select 20% with the highest probability according to model logistic, this selection holds 29% of all No cases in train data.
##
##
plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model.
## - When we select 20% with the highest probability according to model logistic in train data, this selection for No cases is 1.5 times better than selecting without a model.
##
##
#convert Yes/No back to 1/0 in copies of the prediction data frames (for InformationValue)
logrec_df <- logitmod1_cart1
treerec_df <- treemod1_cart1
logrec_df$Predicted<-ifelse(logrec_df$Predicted=="Yes",1,0)
logrec_df$Actual<-ifelse(logrec_df$Actual=="Yes",1,0)
treerec_df$Predicted<-ifelse(treerec_df$Predicted=="Yes",1,0)
treerec_df$Actual<-ifelse(treerec_df$Actual=="Yes",1,0)
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
ks_stat(logrec_df$Actual, logrec_df$Predicted)
## [1] 0.1588
ks_stat(logrec_df$Actual, logrec_df$Predicted, returnKSTable = T)
ks_plot(logrec_df$Actual, logrec_df$Predicted)
ks_stat(treerec_df$Actual, treerec_df$Predicted)
## [1] 0.4813
ks_plot(treerec_df$Actual, treerec_df$Predicted)
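Note that ks_stat() is given the hard 0/1 predictions here, so the KS statistic is evaluated at a single cut-point. A sketch of the same statistic computed on the predicted probabilities instead, using the Yes columns built earlier (the helper name ks_from_scores is illustrative):
ks_from_scores <- function(actual, score) { #max gap between the two classes' score ECDFs
  grid <- sort(unique(score))
  max(abs(ecdf(score[actual == 1])(grid) - ecdf(score[actual == 0])(grid)))
}
ks_from_scores(logrec_df$Actual, logitmod1_cart1$Yes)
ks_from_scores(treerec_df$Actual, treemod1_cart1$Yes)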