Introduction
This notebook uses the Water Quality: Drinking Water Potablity data from Kaggle. The file contains ten features describing water quality metrics for 3276 water bodies, see here for further description of the metrics. The task of this exercise is to predict if the water is safe for human consumption, and the variable of interest in potability.
Data dictionary
| No. | Variable | Description |
|---|---|---|
| 1 | ph | pH of water |
| 2 | Hardness | Capacity of water to precipitate soap in mg/L |
| 3 | Solids | Total dissolved solids in ppm |
| 4 | Chloramines | Amount of Chloramines in ppm |
| 5 | Sulfate | Amount of Sulfates dissolved in mg/L |
| 6 | Conductivity | Electrical conductivity of water in μS/cm |
| 7 | Organic_carbon | Amount of organic carbon in ppm |
| 8 | Trihalomethanes | Amount of Trihalomethanes in μg/L |
| 9 | Turbidity | Measure of light emiting property of water in NTU (Nephelometric Turbidity Units) |
| 10 | Potability | Indicates if water is safe for human consumption |
# load libraries
library(tidyverse)
library(scales)
library(Hmisc)
library(skimr)
library(janitor)
library(psych)
library(ggsci)
library(ggstatsplot)
library(gt)
# import data
data = read_csv("water_potability.csv")
── Column specification ───────────────────────────────────────────────────────────────────────
cols(
ph = col_double(),
Hardness = col_double(),
Solids = col_double(),
Chloramines = col_double(),
Sulfate = col_double(),
Conductivity = col_double(),
Organic_carbon = col_double(),
Trihalomethanes = col_double(),
Turbidity = col_double(),
Potability = col_double()
)
# clean names
data = data %>% clean_names() %>% mutate_at(vars(potability),list(factor))
# summary
glimpse(data)
Rows: 3,276
Columns: 10
$ ph <dbl> NA, 3.716080, 8.099124, 8.316766, 9.092223, 5.584087, 10.223862, 8.63…
$ hardness <dbl> 204.8905, 129.4229, 224.2363, 214.3734, 181.1015, 188.3133, 248.0717,…
$ solids <dbl> 20791.32, 18630.06, 19909.54, 22018.42, 17978.99, 28748.69, 28749.72,…
$ chloramines <dbl> 7.300212, 6.635246, 9.275884, 8.059332, 6.546600, 7.544869, 7.513408,…
$ sulfate <dbl> 368.5164, NA, NA, 356.8861, 310.1357, 326.6784, 393.6634, 303.3098, 2…
$ conductivity <dbl> 564.3087, 592.8854, 418.6062, 363.2665, 398.4108, 280.4679, 283.6516,…
$ organic_carbon <dbl> 10.379783, 15.180013, 16.868637, 18.436524, 11.558279, 8.399735, 13.7…
$ trihalomethanes <dbl> 86.99097, 56.32908, 66.42009, 100.34167, 31.99799, 54.91786, 84.60356…
$ turbidity <dbl> 2.963135, 4.500656, 3.055934, 4.628771, 4.075075, 2.559708, 2.672989,…
$ potability <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# missing data
skim(data)
── Data Summary ────────────────────────
Values
Name data
Number of rows 3276
Number of columns 10
_______________________
Column type frequency:
factor 1
numeric 9
________________________
Group variables None
── Variable type: factor ──────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique top_counts
1 potability 0 1 FALSE 2 0: 1998, 1: 1278
── Variable type: numeric ─────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75
1 ph 491 0.850 7.08 1.59 0 6.09 7.04 8.06
2 hardness 0 1 196. 32.9 47.4 177. 197. 217.
3 solids 0 1 22014. 8769. 321. 15667. 20928. 27333.
4 chloramines 0 1 7.12 1.58 0.352 6.13 7.13 8.11
5 sulfate 781 0.762 334. 41.4 129. 308. 333. 360.
6 conductivity 0 1 426. 80.8 181. 366. 422. 482.
7 organic_carbon 0 1 14.3 3.31 2.20 12.1 14.2 16.6
8 trihalomethanes 162 0.951 66.4 16.2 0.738 55.8 66.6 77.3
9 turbidity 0 1 3.97 0.780 1.45 3.44 3.96 4.50
p100 hist
1 14.0 ▁▂▇▂▁
2 323. ▁▂▇▃▁
3 61227. ▂▇▅▁▁
4 13.1 ▁▂▇▃▁
5 481. ▁▁▇▆▁
6 753. ▁▇▇▂▁
7 28.3 ▁▅▇▂▁
8 124 ▁▂▇▅▁
9 6.74 ▁▅▇▃▁
# target distribution
Hmisc::describe(data$potability)
data$potability
n missing distinct
3276 0 2
Value 0 1
Frequency 1998 1278
Proportion 0.61 0.39
# summary by target class
psych::describeBy(data[,-10],data$potability,mat=T)
# df without na
data2 = data %>% drop_na()
dim(data2)
[1] 2011 10
# df with missing values imputation using mean of target class
data_cln = data %>%
group_by(potability) %>%
mutate(ph=ifelse(is.na(ph),mean(ph,na.rm=TRUE),ph),
sulfate=ifelse(is.na(sulfate),mean(sulfate,na.rm=TRUE),sulfate),
trihalomethanes=ifelse(is.na(trihalomethanes),mean(trihalomethanes,na.rm=TRUE),
trihalomethanes))
colSums(is.na(data_cln))
ph hardness solids chloramines sulfate
0 0 0 0 0
conductivity organic_carbon trihalomethanes turbidity potability
0 0 0 0 0
theme_set(theme_bw(base_size=10))
theme_update(plot.title.position="plot")
# target variable
data %>% group_by(potability) %>% tally() %>% mutate(prop=round(n/sum(n),3)) %>%
ggplot(aes(x=potability, y=n, fill=potability)) + geom_col() +
geom_text(aes(label=paste0(n," ","(",scales::percent(prop),")")),vjust=-0.5,size=3) +
scale_fill_npg() +
labs(y="Count", title="Target variable distribution")
# Features
data2 %>% pivot_longer(cols=!potability) %>%
ggplot(aes(y=potability, x= value, color=potability)) +
geom_violin(show.legend=F) +
facet_wrap(~name,ncol=3, scales="free") +
scale_color_npg() +
theme_minimal(base_size = 10) +
theme(strip.text=element_text(face="bold")) +
labs(title="Distribution of features by potability")
# differences in mean
data2 %>% pivot_longer(cols=!potability) %>%
group_by(potability, name) %>%
summarise(mean = round(mean(value),1)) %>%
pivot_wider(names_from = potability, values_from=mean) %>%
rename("potability_0"="0", "potability_1"="1", "variable"="name") %>%
mutate(difference = potability_1-potability_0)
data2 %>% pivot_longer(cols=!potability) %>%
group_by(potability, name) %>%
summarise(mean = round(mean(value),1), median= round(median(value),1)) %>%
pivot_wider(names_from = potability, values_from=c(mean,median)) %>%
mutate(mean_diff= mean_1-mean_0, med_diff= median_1-median_0) %>%
select(name, mean_1, mean_0, mean_diff, median_1, median_0, med_diff)
# function to not fill density plot
# reference: https://stackoverflow.com/questions/34727408/edit-individual-ggplots-in-ggallyggpairs-how-do-i-have-the-density-plot-not-f
ggally_func <- function(data, mapping, ...){
ggplot(data = data2, mapping=mapping) +
geom_density(mapping = aes_string(color="potability"), fill=NA)
}
# plot
ggpairs(data2, columns = 1:9, aes(color=potability,alpha=0.2),
diag = list(continuous = ggally_func)) +
scale_color_npg() +
theme_light() +
theme(panel.grid=element_blank(),
axis.text=element_text(size=7),
strip.background = element_rect(fill=NA, color="black"),
strip.text=element_text(color="black"))
# correlation (df without NA)
cdata1 = data2 %>% mutate(potability= parse_number(as.character(potability)))
ggstatsplot::ggcorrmat(
data=cdata1,
cor.vars=c(ph:potability),
type="pearson",
ggcorrplot.args = list(lab_size=3, tl.srt=90, tl.cex=9)
)
library(caret)
library(rattle)
library(pROC)
library(MLmetrics)
library(rpart)
library(randomForest)
library(vip)
library(xgboost)
library(fastAdaboost)
library(adabag)
# make.names
colnames(data_cln) <- make.names(colnames(data_cln))
data_cln$potability = make.names(data_cln$potability)
# partition data based on potability
set.seed(123)
train.index <- createDataPartition(data_cln$potability, p = .7, list = FALSE)
xtrain <- data_cln[ train.index,]
xtest <- data_cln[-train.index,]
# check distribution
Hmisc::describe(xtrain$potability)
xtrain$potability
n missing distinct
2294 0 2
Value X0 X1
Frequency 1399 895
Proportion 0.61 0.39
Hmisc::describe(xtest$potability)
xtest$potability
n missing distinct
982 0 2
Value X0 X1
Frequency 599 383
Proportion 0.61 0.39
# glm
fit.control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
summaryFunction = twoClassSummary, classProbs = TRUE,
savePredictions = TRUE)
set.seed(123)
glm_mod <- train(potability ~., data = xtrain, method = "glm",
family = "binomial", trControl = fit.control, metric="ROC")
glm_mod
Generalized Linear Model
2294 samples
9 predictor
2 classes: 'X0', 'X1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2065, 2065, 2064, 2064, 2065, ...
Resampling results:
ROC Sens Spec
0.5044387 1 0.006708281
summary(glm_mod)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-1.1428 -1.0040 -0.9476 1.3471 1.6204
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.978e-01 7.436e-01 -0.804 0.4214
ph 2.899e-02 2.954e-02 0.982 0.3263
hardness -7.118e-04 1.318e-03 -0.540 0.5891
solids 6.903e-06 4.930e-06 1.400 0.1615
chloramines 5.506e-02 2.751e-02 2.001 0.0453 *
sulfate -6.705e-04 1.217e-03 -0.551 0.5817
conductivity -4.649e-04 5.329e-04 -0.872 0.3830
organic_carbon -1.332e-02 1.279e-02 -1.042 0.2976
trihalomethanes -4.382e-04 2.719e-03 -0.161 0.8720
turbidity 4.526e-02 5.500e-02 0.823 0.4105
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 3068.5 on 2293 degrees of freedom
Residual deviance: 3058.7 on 2284 degrees of freedom
AIC: 3078.7
Number of Fisher Scoring iterations: 4
probsTest <- predict(glm_mod, xtest, type = "prob") # predict
threshold <- 0.5
glm.pred <- factor( ifelse(probsTest[, "X1"] > threshold, "X1", "X0") )
confusionMatrix(glm.pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 599 381
X1 0 2
Accuracy : 0.612
95% CI : (0.5807, 0.6426)
No Information Rate : 0.61
P-Value [Acc > NIR] : 0.4619
Kappa : 0.0064
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.005222
Specificity : 1.000000
Pos Pred Value : 1.000000
Neg Pred Value : 0.611224
Prevalence : 0.390020
Detection Rate : 0.002037
Detection Prevalence : 0.002037
Balanced Accuracy : 0.502611
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(glm.pred,ordered=T),plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(glm.pred, ordered = T), plot = T, print.auc = T)
Data: factor(glm.pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.5026
paste("F1-score: ",(F1_Score(y_pred = glm.pred, y_true = factor(xtest$potability), positive = "X1"))) # F1-score
[1] "F1-score: 0.0103896103896104"
varImp(glm_mod) # variable importance
glm variable importance
set.seed(123)
knn_mod = train(potability ~., data = xtrain, method = "knn",
trControl = fit.control, metric = "ROC", preProcess = c("center", "scale"),
tuneLength=10)
knn_mod
k-Nearest Neighbors
2294 samples
9 predictor
2 classes: 'X0', 'X1'
Pre-processing: centered (9), scaled (9)
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2065, 2065, 2064, 2064, 2065, ...
Resampling results across tuning parameters:
k ROC Sens Spec
5 0.6158743 0.7853306 0.3732251
7 0.6205355 0.7970110 0.3407241
9 0.6252038 0.8189311 0.3239908
11 0.6305220 0.8396591 0.3035164
13 0.6356760 0.8622902 0.2800874
15 0.6414193 0.8722987 0.2666792
17 0.6385615 0.8794399 0.2558344
19 0.6392682 0.8899263 0.2413317
21 0.6427417 0.9023124 0.2338660
23 0.6411638 0.9077955 0.2216063
ROC was used to select the optimal model using the largest value.
The final value used for the model was k = 21.
set.seed(123)
knn_pred = predict(knn_mod,xtest) # predict
confusionMatrix(knn_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 542 296
X1 57 87
Accuracy : 0.6405
95% CI : (0.6096, 0.6706)
No Information Rate : 0.61
P-Value [Acc > NIR] : 0.02637
Kappa : 0.1487
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.22715
Specificity : 0.90484
Pos Pred Value : 0.60417
Neg Pred Value : 0.64678
Prevalence : 0.39002
Detection Rate : 0.08859
Detection Prevalence : 0.14664
Balanced Accuracy : 0.56600
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(knn_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(knn_pred, ordered = T), plot = T, print.auc = T)
Data: factor(knn_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.5652
paste("F1-score: ",(F1_Score(y_pred = knn_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.329545454545454"
set.seed(123)
svm_mod = train(potability ~., data = xtrain, method = "svmRadialWeights",
trControl = fit.control, metric = "ROC",preProcess = c("center", "scale"))
svm_mod
Support Vector Machines with Class Weights
2294 samples
9 predictor
2 classes: 'X0', 'X1'
Pre-processing: centered (9), scaled (9)
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2065, 2065, 2064, 2064, 2065, ...
Resampling results across tuning parameters:
C Weight ROC Sens Spec
0.25 1 0.6809521 0.9790373 0.146762380
0.25 2 0.6915361 0.9995238 0.002234707
0.25 3 0.6832695 1.0000000 0.000000000
0.50 1 0.6867964 0.9599794 0.220528506
0.50 2 0.6920178 0.9969030 0.044319600
0.50 3 0.6832030 0.9992840 0.007823554
1.00 1 0.6904796 0.9380524 0.266325427
1.00 2 0.6919483 0.9866598 0.107270079
1.00 3 0.6832414 0.9961871 0.052888057
Tuning parameter 'sigma' was held constant at a value of 0.0811876
ROC was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.0811876, C = 0.5 and Weight = 2.
svm_pred = predict(svm_mod,xtest) # predict
confusionMatrix(svm_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 596 369
X1 3 14
Accuracy : 0.6212
95% CI : (0.59, 0.6516)
No Information Rate : 0.61
P-Value [Acc > NIR] : 0.2465
Kappa : 0.0381
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.03655
Specificity : 0.99499
Pos Pred Value : 0.82353
Neg Pred Value : 0.61762
Prevalence : 0.39002
Detection Rate : 0.01426
Detection Prevalence : 0.01731
Balanced Accuracy : 0.51577
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(svm_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(svm_pred, ordered = T), plot = T, print.auc = T)
Data: factor(svm_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.5158
paste("F1-score: ",(F1_Score(y_pred = svm_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.07"
# rpart model
set.seed(123)
rpart_mod <- train(
potability ~., data = xtrain, method = "rpart",
trControl = fit.control,
tuneLength = 30
)
The metric "Accuracy" was not in the result set. ROC will be used instead.
rpart_mod
CART
2294 samples
9 predictor
2 classes: 'X0', 'X1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2065, 2065, 2064, 2064, 2065, ...
Resampling results across tuning parameters:
cp ROC Sens Spec
0.000000000 0.8196866 0.7965296 0.65327507
0.003852822 0.8316346 0.8618123 0.61826883
0.007705644 0.8252204 0.8810963 0.58662921
0.011558467 0.8174704 0.8897020 0.55494798
0.015411289 0.8152053 0.8897037 0.54449854
0.019264111 0.8118076 0.8877852 0.53439451
0.023116933 0.7981692 0.8417951 0.57116937
0.026969755 0.7693982 0.8339209 0.54077403
0.030822578 0.7050128 0.9137307 0.36428631
0.034675400 0.6604668 0.9821429 0.24172285
0.038528222 0.6564087 0.9940476 0.22720350
0.042381044 0.6564087 0.9940476 0.22720350
0.046233866 0.6564087 0.9940476 0.22720350
0.050086688 0.6564087 0.9940476 0.22720350
0.053939511 0.6564087 0.9940476 0.22720350
0.057792333 0.6564087 0.9940476 0.22720350
0.061645155 0.6564087 0.9940476 0.22720350
0.065497977 0.6564087 0.9940476 0.22720350
0.069350799 0.6564087 0.9940476 0.22720350
0.073203622 0.6564087 0.9940476 0.22720350
0.077056444 0.6564087 0.9940476 0.22720350
0.080909266 0.6564087 0.9940476 0.22720350
0.084762088 0.6564087 0.9940476 0.22720350
0.088614910 0.6564087 0.9940476 0.22720350
0.092467733 0.6564087 0.9940476 0.22720350
0.096320555 0.6564087 0.9940476 0.22720350
0.100173377 0.6564087 0.9940476 0.22720350
0.104026199 0.6564087 0.9940476 0.22720350
0.107879021 0.6437738 0.9945238 0.20646275
0.111731844 0.5693906 0.9964286 0.09895963
ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.003852822.
plot(rpart_mod) # plot complexity parameter
unlist(rpart_mod$bestTune) # print best complexity parameter
cp
0.003852822
fancyRpartPlot(rpart_mod$finalModel) # plot tree
rpart_pred = predict(rpart_mod,xtest) # predict
confusionMatrix(rpart_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 524 150
X1 75 233
Accuracy : 0.7709
95% CI : (0.7433, 0.7968)
No Information Rate : 0.61
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5008
Mcnemar's Test P-Value : 8.084e-07
Sensitivity : 0.6084
Specificity : 0.8748
Pos Pred Value : 0.7565
Neg Pred Value : 0.7774
Prevalence : 0.3900
Detection Rate : 0.2373
Detection Prevalence : 0.3136
Balanced Accuracy : 0.7416
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(rpart_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(rpart_pred, ordered = T), plot = T, print.auc = T)
Data: factor(rpart_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.7416
paste("F1-score: ",(F1_Score(y_pred = rpart_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.67438494934877"
vip(rpart_mod, geom="point") + ggplot2::theme_bw() # plot variable importance
set.seed(203)
rf_mod = train(potability ~ ., data = xtrain, method = "rf",
tuneLength = 5, metric = "ROC",
trControl = fit.control)
rf_mod
Random Forest
2294 samples
9 predictor
2 classes: 'X0', 'X1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2064, 2065, 2065, 2064, 2065, ...
Resampling results across tuning parameters:
mtry ROC Sens Spec
2 0.8557016 0.9004094 0.6044653
3 0.8620255 0.8927835 0.6208406
5 0.8627854 0.8758736 0.6286600
7 0.8575789 0.8677715 0.6335456
9 0.8528523 0.8539431 0.6331835
ROC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 5.
rf_pred = predict(rf_mod,xtest) # predict
confusionMatrix(rf_pred, factor(xtest$potability), positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 522 115
X1 77 268
Accuracy : 0.8045
95% CI : (0.7783, 0.8289)
No Information Rate : 0.61
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.5816
Mcnemar's Test P-Value : 0.00758
Sensitivity : 0.6997
Specificity : 0.8715
Pos Pred Value : 0.7768
Neg Pred Value : 0.8195
Prevalence : 0.3900
Detection Rate : 0.2729
Detection Prevalence : 0.3513
Balanced Accuracy : 0.7856
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(rf_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(rf_pred, ordered = T), plot = T, print.auc = T)
Data: factor(rf_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.7864
paste("F1-score: ",(F1_Score(y_pred = rf_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.737276478679505"
vip(rf_mod, geom="point") + ggplot2::theme_bw() # plot variable importance
set.seed(123)
xgb <- train(potability ~., data = xtrain, method = "xgbTree",
trControl = trainControl("cv", number = 10))
xgb$bestTune
# predict
xgb_pred = predict(xgb,xtest) # predict
confusionMatrix(xgb_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 506 132
X1 93 251
Accuracy : 0.7709
95% CI : (0.7433, 0.7968)
No Information Rate : 0.61
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5094
Mcnemar's Test P-Value : 0.0113
Sensitivity : 0.6554
Specificity : 0.8447
Pos Pred Value : 0.7297
Neg Pred Value : 0.7931
Prevalence : 0.3900
Detection Rate : 0.2556
Detection Prevalence : 0.3503
Balanced Accuracy : 0.7500
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(xgb_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(xgb_pred, ordered = T), plot = T, print.auc = T)
Data: factor(xgb_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.75
paste("F1-score: ",(F1_Score(y_pred = xgb_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.69050894085282"
vip(xgb, geom="point") + ggplot2::theme_bw() # plot variable importance
set.seed(123)
ada <- train(potability ~., data = xtrain, method = "adaboost",
trControl = trainControl("cv", number = 10))
ada$bestTune
# predict
ada_pred = predict(ada,xtest) # predict
confusionMatrix(ada_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 492 116
X1 107 267
Accuracy : 0.7729
95% CI : (0.7454, 0.7988)
No Information Rate : 0.61
P-Value [Acc > NIR] : <2e-16
Kappa : 0.5207
Mcnemar's Test P-Value : 0.5922
Sensitivity : 0.6971
Specificity : 0.8214
Pos Pred Value : 0.7139
Neg Pred Value : 0.8092
Prevalence : 0.3900
Detection Rate : 0.2719
Detection Prevalence : 0.3809
Balanced Accuracy : 0.7592
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(ada_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(ada_pred, ordered = T), plot = T, print.auc = T)
Data: factor(ada_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.7592
paste("F1-score: ",(F1_Score(y_pred = ada_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.705416116248349"
varImp(ada)
AdaBag variable importance
set.seed(123)
adabag <- train(potability ~., data = xtrain, method = "AdaBag",
trControl = trainControl("cv", number = 3))
# predict
ada_pred = predict(ada,xtest) # predict
confusionMatrix(ada_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 521 200
X1 78 183
Accuracy : 0.7169
95% CI : (0.6876, 0.7449)
No Information Rate : 0.61
P-Value [Acc > NIR] : 1.460e-12
Kappa : 0.3688
Mcnemar's Test P-Value : 3.955e-13
Sensitivity : 0.4778
Specificity : 0.8698
Pos Pred Value : 0.7011
Neg Pred Value : 0.7226
Prevalence : 0.3900
Detection Rate : 0.1864
Detection Prevalence : 0.2658
Balanced Accuracy : 0.6738
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(ada_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(ada_pred, ordered = T), plot = T, print.auc = T)
Data: factor(ada_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.6738
paste("F1-score: ",(F1_Score(y_pred = ada_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.56832298136646"
varImp(ada)
AdaBag variable importance
set.seed(123)
bag <- train(potability ~., data = xtrain, method = "treebag",
trControl = fit.control, metric = "ROC")
bag
Bagged CART
2294 samples
9 predictor
2 classes: 'X0', 'X1'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 2065, 2065, 2065, 2064, 2064, 2065, ...
Resampling results:
ROC Sens Spec
0.8441907 0.8372628 0.64134
# predict
bag_pred = predict(bag,xtest) # predict
confusionMatrix(bag_pred, factor(xtest$potability),positive = "X1") # confusion matrix
Confusion Matrix and Statistics
Reference
Prediction X0 X1
X0 494 127
X1 105 256
Accuracy : 0.7637
95% CI : (0.7359, 0.79)
No Information Rate : 0.61
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4983
Mcnemar's Test P-Value : 0.168
Sensitivity : 0.6684
Specificity : 0.8247
Pos Pred Value : 0.7091
Neg Pred Value : 0.7955
Prevalence : 0.3900
Detection Rate : 0.2607
Detection Prevalence : 0.3676
Balanced Accuracy : 0.7466
'Positive' Class : X1
roc(response=xtest$potability, predictor=factor(bag_pred,ordered=T) ,plot=T,print.auc=T) # plot ROC
Call:
roc.default(response = xtest$potability, predictor = factor(bag_pred, ordered = T), plot = T, print.auc = T)
Data: factor(bag_pred, ordered = T) in 599 controls (xtest$potability X0) < 383 cases (xtest$potability X1).
Area under the curve: 0.7466
paste("F1-score: ",(F1_Score(y_pred = bag_pred , y_true = xtest$potability, positive = "X1"))) # F1-score
[1] "F1-score: 0.688172043010753"
vip(bag, geom="point") + ggplot2::theme_bw() # plot variable importance
results = tribble(
~"Method",~"Accuracy",~"AUC-ROC",~"F1",~"Sensitivity",~"Specificity",
"glm",0.612,0.5026,0.0104,0.0052,1,
"knn",0.6405,0.5652,0.3295,0.2272,0.9048,
"svmRadialWeights",0.6212,0.5158,0.07,0.0366,0.995,
"CART (rpart)",0.7709,0.7416,0.6744,0.6084,0.8748,
"random forest",0.8045,0.7864,0.7373,0.6997,0.8715,
"xgbTree",0.7709,0.75,0.6905,0.6554,0.8447,
"adaboost",0.7729,0.7592,0.7054,0.6971,0.8214,
"adaBag",0.7169,0.6738,0.5683,0.4778,0.8698,
"treebag",0.7637, 0.7466,0.6882,0.6684,0.8247
)
# format table
results %>% gt() %>%
tab_header(
title = md("**Results Summary**"),
subtitle = "Differences in performance measures"
) %>%
tab_style(
style = list(
cell_borders(
sides = "bottom",
color = "black",
weight = px(2)
)
),
locations = list(
cells_column_labels(
columns = gt::everything()
)
)
) %>%
tab_options(table.font.size = 12,
table.width = 600) %>%
data_color(
columns=vars("Accuracy", "AUC-ROC","F1","Sensitivity","Specificity"),
colors= scales::col_numeric(
palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
domain=NULL
)
)
| Results Summary | |||||
|---|---|---|---|---|---|
| Differences in performance measures | |||||
| Method | Accuracy | AUC-ROC | F1 | Sensitivity | Specificity |
| glm | 0.6120 | 0.5026 | 0.0104 | 0.0052 | 1.0000 |
| knn | 0.6405 | 0.5652 | 0.3295 | 0.2272 | 0.9048 |
| svmRadialWeights | 0.6212 | 0.5158 | 0.0700 | 0.0366 | 0.9950 |
| CART (rpart) | 0.7709 | 0.7416 | 0.6744 | 0.6084 | 0.8748 |
| random forest | 0.8045 | 0.7864 | 0.7373 | 0.6997 | 0.8715 |
| xgbTree | 0.7709 | 0.7500 | 0.6905 | 0.6554 | 0.8447 |
| adaboost | 0.7729 | 0.7592 | 0.7054 | 0.6971 | 0.8214 |
| adaBag | 0.7169 | 0.6738 | 0.5683 | 0.4778 | 0.8698 |
| treebag | 0.7637 | 0.7466 | 0.6882 | 0.6684 | 0.8247 |