library(dlookr)
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
##
## transform
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks dlookr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library(UpSetR)
##
## Attaching package: 'UpSetR'
##
## The following object is masked from 'package:lattice':
##
## histogram
library(naniar)
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following object is masked from 'package:dlookr':
##
## describe
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(RWeka)
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tree)
library(partykit)
## Loading required package: libcoin
## Loading required package: mvtnorm
library(maptree)
## Loading required package: cluster
## Loading required package: rpart
library(blorr)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:colorspace':
##
## coords
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart.plot)
library(sjPlot)
#read the dataset
df <- read.csv("E:/MScDataScience/predictive/logistic.csv")
head(df)
summary(df) #summarise dataset
## ID Cohort admission enrolled
## Min. :298447 Length:350 Min. :0.0000 Length:350
## 1st Qu.:299703 Class :character 1st Qu.:1.0000 Class :character
## Median :300013 Mode :character Median :1.0000 Mode :character
## Mean :300014 Mean :0.7743
## 3rd Qu.:300335 3rd Qu.:1.0000
## Max. :301592 Max. :1.0000
##
## HSGPA ACT Graduated
## Min. : 0.000 Length:350 Min. :0.0000
## 1st Qu.: 3.120 Class :character 1st Qu.:0.0000
## Median : 3.380 Mode :character Median :0.0000
## Mean : 7.107 Mean :0.3992
## 3rd Qu.: 3.690 3rd Qu.:1.0000
## Max. :999.000 Max. :1.0000
## NA's :112
library(flextable)
##
## Attaching package: 'flextable'
## The following object is masked from 'package:partykit':
##
## width
## The following object is masked from 'package:purrr':
##
## compose
diagnose(df) %>% flextable() #diagnose df
| variables | types     | missing_count | missing_percent | unique_count | unique_rate |
|-----------|-----------|---------------|-----------------|--------------|-------------|
| ID        | integer   | 0             | 0               | 319          | 0.911428571 |
| Cohort    | character | 0             | 0               | 5            | 0.014285714 |
| admission | integer   | 0             | 0               | 2            | 0.005714286 |
| enrolled  | character | 0             | 0               | 3            | 0.008571429 |
| HSGPA     | numeric   | 0             | 0               | 127          | 0.362857143 |
| ACT       | character | 0             | 0               | 26           | 0.074285714 |
| Graduated | integer   | 112           | 32              | 3            | 0.008571429 |
Hmisc::describe(df)
## df
##
## 7 Variables 350 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 350 0 319 1 3e+05 535.3 299202 299349
## .25 .50 .75 .90 .95
## 299703 300013 300335 300580 300797
##
## lowest : 298447 298720 298955 299005 299021, highest: 301142 301194 301303 301313 301592
## --------------------------------------------------------------------------------
## Cohort
## n missing distinct
## 350 0 5
##
## lowest : Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
## highest: Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
##
## Value Fall2015 Fall2016 Fall2018 Spring2019 Summer2017
## Frequency 17 61 105 166 1
## Proportion 0.049 0.174 0.300 0.474 0.003
## --------------------------------------------------------------------------------
## admission
## n missing distinct Info Sum Mean Gmd
## 350 0 2 0.524 271 0.7743 0.3505
##
## --------------------------------------------------------------------------------
## enrolled
## n missing distinct
## 350 0 3
##
## Value 0 1 n/a
## Frequency 189 160 1
## Proportion 0.540 0.457 0.003
## --------------------------------------------------------------------------------
## HSGPA
## n missing distinct Info Mean Gmd .05 .10
## 350 0 127 1 7.107 8.129 2.67 2.85
## .25 .50 .75 .90 .95
## 3.12 3.38 3.69 3.95 4.00
##
## lowest : 0.00 2.26 2.42 2.55 2.56, highest: 3.98 3.99 4.00 333.00 999.00
##
## Value 0 330 1000
## Frequency 348 1 1
## Proportion 0.994 0.003 0.003
##
## For the frequency table, variable is rounded to the nearest 10
## --------------------------------------------------------------------------------
## ACT
## n missing distinct
## 350 0 26
##
## lowest : 10 11 12 13 14 , highest: 31 32 33 34 NULL
## --------------------------------------------------------------------------------
## Graduated
## n missing distinct Info Sum Mean Gmd
## 238 112 2 0.72 95 0.3992 0.4817
##
## --------------------------------------------------------------------------------
view_df(df, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings     | Values               | Freq.            | %                |
|----|-----------|--------------|----------------------|------------------|------------------|
| 1  | ID        | 0 (0.00%)    | range: 298447-301592 |                  |                  |
| 2  | Cohort    | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 3  | admission | 0 (0.00%)    | range: 0-1           |                  |                  |
| 4  | enrolled  | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 5  | HSGPA     | 0 (0.00%)    | range: 0.0-999.0     |                  |                  |
| 6  | ACT       | 0 (0.00%)    | <output omitted>     | <output omitted> | <output omitted> |
| 7  | Graduated | 112 (32.00%) | range: 0-1           |                  |                  |
From the data summary, admission and Graduated are stored as integers, but the data description indicates they should be factors. ACT is stored as character even though it contains numeric scores, and enrolled is likewise a character variable that should be a factor. We therefore re-format the data types.
#before changing data types copy the original data df to df1
df1 <- df
df1$admission <- as.factor(df1$admission)
df1$Graduated <- as.factor(df1$Graduated)
df1$enrolled <- as.factor(df1$enrolled)
df1$ACT <- as.numeric(df1$ACT)
## Warning: NAs introduced by coercion
df1$Cohort <- as.character(df1$Cohort)
diagnose(df1[, -1]) %>% flextable() #remove id
| variables | types     | missing_count | missing_percent | unique_count | unique_rate |
|-----------|-----------|---------------|-----------------|--------------|-------------|
| Cohort    | character | 0             | 0.000000        | 5            | 0.014285714 |
| admission | factor    | 0             | 0.000000        | 2            | 0.005714286 |
| enrolled  | factor    | 0             | 0.000000        | 3            | 0.008571429 |
| HSGPA     | numeric   | 0             | 0.000000        | 127          | 0.362857143 |
| ACT       | numeric   | 8             | 2.285714        | 26           | 0.074285714 |
| Graduated | factor    | 112           | 32.000000       | 3            | 0.008571429 |
print(view_df(df1[, -1], show.frq = T, show.prc = T, show.na = T))
diagnose_category(df1[, -1]) %>% flextable()
| variables | levels     | N   | freq | ratio      | rank |
|-----------|------------|-----|------|------------|------|
| Cohort    | Spring2019 | 350 | 166  | 47.4285714 | 1    |
| Cohort    | Fall2018   | 350 | 105  | 30.0000000 | 2    |
| Cohort    | Fall2016   | 350 | 61   | 17.4285714 | 3    |
| Cohort    | Fall2015   | 350 | 17   | 4.8571429  | 4    |
| Cohort    | Summer2017 | 350 | 1    | 0.2857143  | 5    |
| admission | 1          | 350 | 271  | 77.4285714 | 1    |
| admission | 0          | 350 | 79   | 22.5714286 | 2    |
| enrolled  | 0          | 350 | 189  | 54.0000000 | 1    |
| enrolled  | 1          | 350 | 160  | 45.7142857 | 2    |
| enrolled  | n/a        | 350 | 1    | 0.2857143  | 3    |
| Graduated | 0          | 350 | 143  | 40.8571429 | 1    |
| Graduated | <NA>       | 350 | 112  | 32.0000000 | 2    |
| Graduated | 1          | 350 | 95   | 27.1428571 | 3    |
From the categorical summary, enrolled has an n/a level that occurs only once, so it can be removed; Graduated also has 112 NAs.
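As a minimal sketch (the object name df1_nona is illustrative), the lone n/a row and its now-empty level could be dropped directly; the pipeline below reaches the same point through outlier removal and filtering on enrolled == 1:
df1_nona <- df1 %>% filter(enrolled != "n/a") %>% droplevels() #drop the single n/a row and the unused factor level
table(df1_nona$enrolled) #sanity check: only levels 0 and 1 remain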
diagnose_numeric(df1[, -1])
md.pattern(df1[,-1])
## Cohort admission enrolled HSGPA ACT Graduated
## 232 1 1 1 1 1 1 0
## 110 1 1 1 1 1 0 1
## 6 1 1 1 1 0 1 1
## 2 1 1 1 1 0 0 2
## 0 0 0 0 8 112 120
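Each row of the md.pattern() matrix is a missingness pattern (1 = observed, 0 = missing): 232 rows are complete, 110 are missing Graduated only, 6 are missing ACT only, and 2 are missing both; the right-hand column counts missing variables per pattern and the bottom row counts missing values per variable (8 for ACT, 112 for Graduated, 120 missing cells in total).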
funModeling::status(df1[, -1]) %>% flextable()
| variable  | q_zeros | p_zeros    | q_na | p_na       | q_inf | p_inf | type      | unique |
|-----------|---------|------------|------|------------|-------|-------|-----------|--------|
| Cohort    | 0       | 0.00000000 | 0    | 0.00000000 | 0     | 0     | character | 5      |
| admission | 79      | 0.22571429 | 0    | 0.00000000 | 0     | 0     | factor    | 2      |
| enrolled  | 189     | 0.54000000 | 0    | 0.00000000 | 0     | 0     | factor    | 3      |
| HSGPA     | 8       | 0.02285714 | 0    | 0.00000000 | 0     | 0     | numeric   | 127    |
| ACT       | 0       | 0.00000000 | 8    | 0.02285714 | 0     | 0     | numeric   | 25     |
| Graduated | 143     | 0.40857143 | 112  | 0.32000000 | 0     | 0     | factor    | 2      |
plot_outlier(df1[, -1])
There are significant outliers in the HSGPA scores (values of 333 and 999 on what should be a 0-4 GPA scale) which distort the distribution; removing them improves the distribution considerably.
We detect outliers with the boxplot method, a popular and easy approach that distinguishes two categories: (1) ordinary outliers and (2) extreme points.
Values above Q3 + 1.5×IQR or below Q1 - 1.5×IQR are considered outliers; values above Q3 + 3×IQR or below Q1 - 3×IQR are considered extreme points (extreme outliers).
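For reference, these fences can be computed by hand; a minimal sketch for HSGPA (identify_outliers() below applies the same rule):
q1 <- quantile(df1$HSGPA, 0.25, names = FALSE) #first quartile
q3 <- quantile(df1$HSGPA, 0.75, names = FALSE) #third quartile
iqr <- q3 - q1
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr) #ordinary outlier fences
c(lower = q1 - 3 * iqr, upper = q3 + 3 * iqr) #extreme-point fences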
df1[, -1] %>% identify_outliers(HSGPA)
#outlier removal
df2 <- df1 %>% rownames_to_column(var = "index") #create an index to identify and remove extreme outliers
df2$index <- as.numeric(df2$index)
head(df2)
# Remove outliers
df2_outlier <- df2 %>% identify_outliers(HSGPA) %>% filter(is.extreme) #keep only the extreme outliers (is.extreme is already logical)
df3 <- df2 %>%
anti_join(df2_outlier, by = "index")
df3
plot_outlier(df3[, -c(1,2)])
df4 <- df3[, -c(1,2)] #the dataset without extreme outliers (index and ID columns dropped)
#select only students who were both admitted and enrolled
df5 <- df4 %>% filter(admission == 1)
df5 <- df5 %>% filter(enrolled == 1)
md.pattern(df5)
## Cohort admission enrolled HSGPA ACT Graduated
## 135 1 1 1 1 1 1 0
## 18 1 1 1 1 1 0 1
## 3 1 1 1 1 0 1 1
## 0 0 0 0 3 18 21
view_df(df5, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings    | Values           | Freq.            | %                  |
|----|-----------|-------------|------------------|------------------|--------------------|
| 1  | Cohort    | 0 (0.00%)   | <output omitted> | <output omitted> | <output omitted>   |
| 2  | admission | 0 (0.00%)   | 0; 1             | 0; 156           | 0.00; 100.00       |
| 3  | enrolled  | 0 (0.00%)   | 0; 1; n/a        | 0; 156; 0        | 0.00; 100.00; 0.00 |
| 4  | HSGPA     | 0 (0.00%)   | range: 2.4-4.0   |                  |                    |
| 5  | ACT       | 3 (1.92%)   | range: 14-34     |                  |                    |
| 6  | Graduated | 18 (11.54%) | 0; 1             | 48; 90           | 34.78; 65.22       |
df5 <- df5[, -c(2,3)] #drop admission and enrolled, which are now constant
df5 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
upper = list(combo = 'box'),
diag = list(discrete = wrap('barDiag', position = 'fill')),
lower = list(combo = 'dot_no_facet')) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 21 rows containing missing values
## Warning: Removed 3 rows containing missing values (`geom_point()`).
## Removed 3 rows containing missing values (`geom_point()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 18 rows containing missing values (`geom_point()`).
## Warning: Removed 21 rows containing missing values (`geom_point()`).
# Remove remaining NAs (complete-case analysis)
naomit_df6 <- na.omit(df5)
md.pattern(naomit_df6)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Cohort HSGPA ACT Graduated
## 135 1 1 1 1 0
## 0 0 0 0 0
naomit_df6 %>%
ggpairs(aes(col = Graduated, fill = Graduated, alpha = 0.6),
upper = list(combo = 'box'),
diag = list(discrete = wrap('barDiag', position = 'fill')),
lower = list(combo = 'dot_no_facet')) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
view_df(naomit_df6, show.frq = T, show.prc = T, show.na = T)
| ID | Name      | missings  | Values           | Freq.            | %                |
|----|-----------|-----------|------------------|------------------|------------------|
| 1  | Cohort    | 0 (0.00%) | <output omitted> | <output omitted> | <output omitted> |
| 2  | HSGPA     | 0 (0.00%) | range: 2.4-4.0   |                  |                  |
| 3  | ACT       | 0 (0.00%) | range: 14-34     |                  |                  |
| 4  | Graduated | 0 (0.00%) | 0; 1             | 48; 87           | 35.56; 64.44     |
naomit_df6 <- naomit_df6 %>% mutate(grad = case_when(Graduated == 0 ~ "No",
                                                     Graduated == 1 ~ "Yes")) #recode Graduated as No/Yes: caret's classProbs option needs factor levels that are valid R names
new_df <- naomit_df6[, -c(1,4)] #drop Cohort and the original Graduated, keeping HSGPA, ACT and grad
oj_trControl = trainControl(
method = "cv",
number = 10,
savePredictions = "final", # save preds for the optimal tuning parameter
classProbs = TRUE, # class probs in addition to preds
summaryFunction = twoClassSummary
)
# build a tree model
set.seed(1234)
tree_mod1 <- train(
grad ~ .,
data = new_df,
method = "rpart",
tuneLength = 5,
metric = "ROC",
trControl = oj_trControl
)
print(tree_mod1)
## CART
##
## 135 samples
## 2 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 122, 121, 122, 122, 123, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.000000 0.5398958 0.320 0.7458333
## 0.015625 0.5310069 0.255 0.7819444
## 0.031250 0.5323958 0.295 0.7680556
## 0.046875 0.4993403 0.235 0.7583333
## 0.062500 0.4722569 0.085 0.8625000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
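caret also provides a ggplot method for train objects, so the ROC-versus-cp profile behind this table can be visualised in one line:
ggplot(tree_mod1) #cross-validated ROC (y-axis) across the candidate cp values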
tree_mod1$finalModel
## n= 135
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 135 48 Yes (0.35555556 0.64444444)
## 2) ACT< 25.5 67 32 Yes (0.47761194 0.52238806)
## 4) HSGPA>=3.14 55 26 No (0.52727273 0.47272727)
## 8) HSGPA< 3.375 25 8 No (0.68000000 0.32000000) *
## 9) HSGPA>=3.375 30 12 Yes (0.40000000 0.60000000)
## 18) ACT< 21.5 16 7 No (0.56250000 0.43750000) *
## 19) ACT>=21.5 14 3 Yes (0.21428571 0.78571429) *
## 5) HSGPA< 3.14 12 3 Yes (0.25000000 0.75000000) *
## 3) ACT>=25.5 68 16 Yes (0.23529412 0.76470588)
## 6) HSGPA< 3.345 21 8 Yes (0.38095238 0.61904762)
## 12) HSGPA>=3.275 9 4 No (0.55555556 0.44444444) *
## 13) HSGPA< 3.275 12 3 Yes (0.25000000 0.75000000) *
## 7) HSGPA>=3.345 47 8 Yes (0.17021277 0.82978723)
## 14) HSGPA>=3.755 25 7 Yes (0.28000000 0.72000000)
## 28) ACT< 28.5 9 4 No (0.55555556 0.44444444) *
## 29) ACT>=28.5 16 2 Yes (0.12500000 0.87500000) *
## 15) HSGPA< 3.755 22 1 Yes (0.04545455 0.95454545) *
rpart.plot(tree_mod1$finalModel)
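To read the tree, consider a hypothetical applicant with ACT = 27 and HSGPA = 3.1 (values chosen purely for illustration): the case flows root → node 3 (ACT >= 25.5) → node 6 (HSGPA < 3.345) → node 13 (HSGPA < 3.275), which predicts Yes with probability 0.75. A quick check:
predict(tree_mod1, newdata = data.frame(HSGPA = 3.1, ACT = 27), type = "prob") #hypothetical student; lands in node 13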
treemod1_cart1 <- bind_cols(
predict(tree_mod1, newdata = new_df, type = "prob"),
Predicted = predict(tree_mod1, newdata = new_df, type = "raw"),
Actual = new_df$grad
)
treemod1_cart1$Actual <- as.factor(treemod1_cart1$Actual)
treemod1_cart2 <- confusionMatrix(treemod1_cart1$Predicted, treemod1_cart1$Actual)
treemod1_cart2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 36 23
## Yes 12 64
##
## Accuracy : 0.7407
## 95% CI : (0.6583, 0.8123)
## No Information Rate : 0.6444
## P-Value [Acc > NIR] : 0.01098
##
## Kappa : 0.4619
##
## Mcnemar's Test P-Value : 0.09097
##
## Sensitivity : 0.7500
## Specificity : 0.7356
## Pos Pred Value : 0.6102
## Neg Pred Value : 0.8421
## Prevalence : 0.3556
## Detection Rate : 0.2667
## Detection Prevalence : 0.4370
## Balanced Accuracy : 0.7428
##
## 'Positive' Class : No
##
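A quick hand check against the matrix (with 'No' as the positive class): accuracy = (36 + 64) / 135 ≈ 0.7407, sensitivity = 36 / (36 + 12) = 0.75, and specificity = 64 / (64 + 23) ≈ 0.7356, matching the reported values. Note these are resubstitution estimates, since the model is scored on the same data it was trained on.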
mdl_auc <- Metrics::auc(actual = treemod1_cart1$Actual == "No", treemod1_cart1$No)
yardstick::roc_curve(treemod1_cart1, Actual, No) %>%
autoplot() +
labs(
title = "tree1 ROC Curve ",
subtitle = paste0("AUC = ", round(mdl_auc, 4))
)
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## ℹ The deprecated feature was likely used in the yardstick package.
## Please report the issue at <https://github.com/tidymodels/yardstick/issues>.
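Since pROC is already attached, the AUC can be cross-checked independently of Metrics; levels and direction are set explicitly so that 'No' is treated as the event, matching the calculation above:
roc_tree <- pROC::roc(response = treemod1_cart1$Actual, predictor = treemod1_cart1$No,
                      levels = c("Yes", "No"), direction = "<")
pROC::auc(roc_tree) #should agree with the AUC in the plot subtitle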
plot(varImp(tree_mod1), main="Variable Importance with tree1")
# build a logistic regression model
set.seed(1234)
logit_mod1 <- train(
grad ~ .,
data = new_df,
method = "glm",
tuneLength = 5,
metric = "ROC",
trControl = oj_trControl
)
summary(logit_mod1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9823 -1.2127 0.7090 0.9473 1.5178
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.48769 2.03670 -1.712 0.08682 .
## HSGPA 0.33657 0.57120 0.589 0.55570
## ACT 0.11633 0.04248 2.738 0.00618 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 175.72 on 134 degrees of freedom
## Residual deviance: 165.90 on 132 degrees of freedom
## AIC: 171.9
##
## Number of Fisher Scoring iterations: 4
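The ACT coefficient is easier to read on the odds scale: exp(0.11633) ≈ 1.12, so each additional ACT point multiplies the odds of graduating by about 1.12, holding HSGPA fixed. A minimal sketch:
exp(coef(logit_mod1$finalModel)) #coefficients as odds ratios
exp(confint.default(logit_mod1$finalModel)) #Wald intervals on the odds-ratio scale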
logitmod1_cart1 <- bind_cols(
predict(logit_mod1, newdata = new_df, type = "prob"),
Predicted = predict(logit_mod1, newdata = new_df, type = "raw"),
Actual = new_df$grad
)
logitmod1_cart1$Actual <- as.factor(logitmod1_cart1$Actual)
logitmod1_cart2 <- confusionMatrix(logitmod1_cart1$Predicted, logitmod1_cart1$Actual)
logitmod1_cart2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 11 10
## Yes 37 77
##
## Accuracy : 0.6519
## 95% CI : (0.5651, 0.7317)
## No Information Rate : 0.6444
## P-Value [Acc > NIR] : 0.4676342
##
## Kappa : 0.1307
##
## Mcnemar's Test P-Value : 0.0001491
##
## Sensitivity : 0.22917
## Specificity : 0.88506
## Pos Pred Value : 0.52381
## Neg Pred Value : 0.67544
## Prevalence : 0.35556
## Detection Rate : 0.08148
## Detection Prevalence : 0.15556
## Balanced Accuracy : 0.55711
##
## 'Positive' Class : No
##
log_auc <- Metrics::auc(actual = logitmod1_cart1$Actual == "No", logitmod1_cart1$No)
yardstick::roc_curve(logitmod1_cart1, Actual, No) %>%
autoplot() +
labs(
title = "logistic ROC Curve ",
subtitle = paste0("AUC = ", round(log_auc, 4))
)
library(modelplotr)
## Package modelplotr loaded! Happy model plotting!
#note: both dataset slots point to new_df, so "test data" here is the training data relabelled
scores_and_ntiles <- prepare_scores_and_ntiles(datasets=list("new_df","new_df"),
dataset_labels = list("train data","test data"),
models = list("tree_mod1","logit_mod1"),
model_labels = list("decision tree","logistic"),
target_column="grad",
ntiles=100)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `select()` instead.
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## ... scoring caret model "tree_mod1" on dataset "new_df".
## ... scoring caret model "logit_mod1" on dataset "new_df".
## Data preparation step 1 succeeded! Dataframe created.
# transform data generated with prepare_scores_and_ntiles into aggregated data for chosen plotting scope
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,
select_model_label = "decision tree",
select_dataset_label = "train data")
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## ℹ Please use `group_by()` instead.
## ℹ See vignette('programming') for more help
## ℹ The deprecated feature was likely used in the modelplotr package.
## Please report the issue at <https://github.com/jurrr/modelplotr/issues>.
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## No comparison specified, default values are used.
##
## Single evaluation line will be plotted: Target value "No" plotted for dataset "train data" and model "decision tree".
## -> To compare models, specify: scope = "compare_models"
## -> To compare datasets, specify: scope = "compare_datasets"
## -> To compare target classes, specify: scope = "compare_targetclasses"
## -> To plot one line, do not specify scope or specify scope = "no_comparison".
# plot the cumulative gains plot
plot_cumgains(data = plot_input)
# plot the cumulative gains plot and annotate the plot at percentile = 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data.
##
##
# plot the cumulative lift plot and annotate the plot at percentile = 20
plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model.
##
##
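The lift value follows directly from the gains figure: the top 20% captured 40% of the No cases, and 0.40 / 0.20 = 2.0, the cumulative lift reported above.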
# set plotting scope to model comparison
plot_input <- plotting_scope(prepared_input = scores_and_ntiles,scope = "compare_models")
## Data preparation step 2 succeeded! Dataframe created.
## "prepared_input" aggregated...
## Data preparation step 3 succeeded! Dataframe created.
##
## Models "decision tree", "logistic" compared for dataset "train data" and target value "No".
# plot the cumulative gains plot, now comparing the two models, and annotate at ntile 20
plot_cumgains(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative gains
## - When we select 20% with the highest probability according to model decision tree, this selection holds 40% of all No cases in train data.
## - When we select 20% with the highest probability according to model logistic, this selection holds 29% of all No cases in train data.
##
##
plot_cumlift(data = plot_input,highlight_ntile = 20)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
##
## Plot annotation for plot: Cumulative lift
## - When we select 20% with the highest probability according to model decision tree in train data, this selection for No cases is 2.0 times better than selecting without a model.
## - When we select 20% with the highest probability according to model logistic in train data, this selection for No cases is 1.5 times better than selecting without a model.
##
##
#convert Yes/No back to 1/0 in copies of the prediction data frames (for InformationValue)
logrec_df <- logitmod1_cart1
treerec_df <- treemod1_cart1
logrec_df$Predicted<-ifelse(logrec_df$Predicted=="Yes",1,0)
logrec_df$Actual<-ifelse(logrec_df$Actual=="Yes",1,0)
treerec_df$Predicted<-ifelse(treerec_df$Predicted=="Yes",1,0)
treerec_df$Actual<-ifelse(treerec_df$Actual=="Yes",1,0)
library(InformationValue)
##
## Attaching package: 'InformationValue'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
ks_stat(logrec_df$Actual, logrec_df$Predicted)
## [1] 0.1588
ks_stat(logrec_df$Actual, logrec_df$Predicted, returnKSTable = T)
ks_plot(logrec_df$Actual, logrec_df$Predicted)
ks_stat(treerec_df$Actual, treerec_df$Predicted)
## [1] 0.4813
ks_plot(treerec_df$Actual, treerec_df$Predicted)
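Note that ks_stat() is given the hard 0/1 predictions here, so the KS statistic is evaluated at a single cut-point. A sketch of the same statistic computed on the predicted probabilities instead, using the Yes columns built earlier (the helper name ks_from_scores is illustrative):
ks_from_scores <- function(actual, score) { #max gap between the two classes' score ECDFs
  grid <- sort(unique(score))
  max(abs(ecdf(score[actual == 1])(grid) - ecdf(score[actual == 0])(grid)))
}
ks_from_scores(logrec_df$Actual, logitmod1_cart1$Yes)
ks_from_scores(treerec_df$Actual, treemod1_cart1$Yes)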