# loading the essential packages.
library(olsrr)
library(readr)
library(ggplot2)
library(caret)
library(tibble)
library(rcompanion)
library(dplyr)
library(purrr)
library(corrplot)
library(DescTools)
library(lmtest)
library(caret)
library(AER)
library(nnet)
library(Tplyr)
library(verification)
library(janitor)
library(class)
library(kernlab)
library(glmnet)
library(tidyverse)
library(pROC)
library(DMwR)
library(ROSE)
# importing the training and test sample.
drugs <- read_csv("drugs_train.csv")
drugs_test_data <- read_csv("drugs_test.csv")
# checking the structure of the datasets.
glimpse(drugs)
glimpse(drugs_test_data)
# FOR THE TRAIN SAMPLE DATA
# create a vector of names.
drugs_categorical_vars <-
sapply(drugs[, -1], is.character) %>%
which() %>%
names()
drugs_categorical_vars
# and apply a conversion in a loop
for (variable in drugs_categorical_vars) {
drugs[[variable]] <- as.factor(drugs[[variable]])
}
# FOR THE TEST SAMPLE DATA
# create a vector of names.
drugs_test_data_categorical_vars <-
sapply(drugs_test_data[, -1], is.character) %>%
which() %>%
names()
drugs_test_data_categorical_vars
# and apply a conversion in a loop.
for (variable in drugs_test_data_categorical_vars) {
drugs_test_data[[variable]] <- as.factor(drugs_test_data[[variable]])
}
# to check the distribution of the dependent variable (cocaine_consumption_last_month).
count<- table(drugs$consumption_cocaine_last_month)
ggplot(drugs,
aes(x = consumption_cocaine_last_month)) +
geom_bar(fill = "sky blue")+geom_text(stat='count', aes(label=..count..), vjust=1) + theme_bw()+
ggtitle("COCAINE CONSUMPTION LAST MONTH DATA")
# splitting the training dataset.
set.seed(987654321)
drugs_which_train <- createDataPartition(drugs$consumption_cocaine_last_month,
p = 0.75,
list = FALSE)
drugs_train <- drugs[drugs_which_train,]
drugs_test <- drugs[-drugs_which_train,]
table(drugs_train$consumption_cocaine_last_month)
##
## No Yes
## 1030 96
table(drugs_test$consumption_cocaine_last_month)
##
## No Yes
## 343 31
source("F_summary_binary.R")
source("F_summary_binary_class.R")
options(contrasts = c("contr.treatment",
"contr.treatment"))
# training with no cross validation.
set.seed(987654321)
ctrl_nocv_pp <- trainControl(method = "none")
# run the train() function with pre-processing.
set.seed(987654321)
drugs_train_knn5_pp <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "knn",
trControl = ctrl_nocv_pp, preProcess = c("range"))
drugs_train_knn5_pp
## k-Nearest Neighbors
##
## 1126 samples
## 16 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (22)
## Resampling: None
# calculate fitted values - prediction on the training sample using the trained model.
set.seed(987654321)
drugs_train_knn5_pp_fitted <- predict(drugs_train_knn5_pp,
drugs_train)
table(drugs_train_knn5_pp_fitted)
## drugs_train_knn5_pp_fitted
## No Yes
## 1093 33
# check different accuracy measures
summary_binary_class(predicted_classes = drugs_train_knn5_pp_fitted,
real = drugs_train$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.92274 0.21875 0.98835 0.63636
## Neg Pred Value F1 Balanced Accuracy
## 0.93138 0.32558 0.60355
# applying model on the test data.
drugs_test_data_forecasts <-
data.frame(drugs_train_knn5_pp = predict(drugs_train_knn5_pp,
drugs_test_data))
table(drugs_test_data_forecasts)
## drugs_test_data_forecasts
## No Yes
## 372 13
Since the data on which our predictions are to be carried out is a highly unbalanced data, we apply some balancing techniques to solve this issue.
# weighing method
(freqs <- table(drugs_train$consumption_cocaine_last_month))
myWeights <- ifelse(drugs_train$consumption_cocaine_last_month == "yes",
0.5/freqs[2],
0.5/freqs[1]) * nrow(drugs_train)
tabyl(myWeights)
sum(myWeights) == nrow(drugs_train)
# estimating the model with weights
set.seed(987654321)
drugs_logit_train_weighted <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "glm",
family = "binomial",
trControl = ctrl_cv5,
weights = myWeights)
# lets see the result
drugs_logit_train_weighted
## Generalized Linear Model
##
## 1126 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 900, 901, 901, 901, 901
## Resampling results:
##
## ROC Sens Spec Accuracy Kappa
## 0.779139 0.7174757 0.6894737 0.714942 0.181457
# the result is better than the previous statistics we had.
# changing the sampling entry to "down"
ctrl_cv5$sampling <- "down"
set.seed(987654321)
drugs_logit_train_down <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "glm",
family = "binomial",
trControl = ctrl_cv5)
# lets see the result
drugs_logit_train_down
## Generalized Linear Model
##
## 1126 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 900, 901, 901, 901, 901
## Addtional sampling using down-sampling
##
## Resampling results:
##
## ROC Sens Spec Accuracy Kappa
## 0.7462749 0.6650485 0.7389474 0.6714297 0.1648796
# this model has a poorer result in comparison to the weighted method but has a higher specificity.
# up-sampling method
ctrl_cv5$sampling <- "up"
set.seed(987654321)
drugs_logit_train_up <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "glm",
family = "binomial",
trControl = ctrl_cv5)
# lets see the result
drugs_logit_train_up
## Generalized Linear Model
##
## 1126 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 900, 901, 901, 901, 901
## Addtional sampling using up-sampling
##
## Resampling results:
##
## ROC Sens Spec Accuracy Kappa
## 0.7756336 0.7067961 0.6989474 0.706057 0.1767088
# better performance except for specificity
# SMOTE model
ctrl_cv5$sampling <- "smote"
set.seed(987654321)
drugs_logit_train_smote <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "glm",
family = "binomial",
trControl = ctrl_cv5)
# lets see the result
drugs_logit_train_smote
## Generalized Linear Model
##
## 1126 samples
## 16 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 900, 901, 901, 901, 901
## Addtional sampling using SMOTE
##
## Resampling results:
##
## ROC Sens Spec Accuracy Kappa
## 0.782537 0.738835 0.6989474 0.735355 0.2053915
# seems to be better than up/down sampling
# ROSE model
ctrl_cv5$sampling <- "rose"
set.seed(987654321)
drugs_logit_train_rose <-
train(consumption_cocaine_last_month ~ .,
data = drugs_train %>%
dplyr::select(-id),
method = "glm",
family = "binomial",
trControl = ctrl_cv5)
drugs_logit_train_rose
# good result but with a lower accuracy.
# comparing all models on validation stage
drugs_logit_train$results
drugs_logit_train$results[, 2:6]
models_all <- ls(pattern = "drugs_logit_train")
# removing models that are not needed.
models_all<- models_all[-c(7,8)]
get("drugs_logit_train_weighted")
sapply(models_all,
function(x) (get(x))$results[,2:6])%>%
t()
## ROC Sens Spec Accuracy Kappa
## drugs_logit_train 0.7740393 0.9902913 0.06315789 0.9111976 0.07412679
## drugs_logit_train_down 0.7462749 0.6650485 0.7389474 0.6714297 0.1648796
## drugs_logit_train_rose 0.7697752 0.6961165 0.7205263 0.698061 0.1773631
## drugs_logit_train_smote 0.782537 0.738835 0.6989474 0.735355 0.2053915
## drugs_logit_train_up 0.7756336 0.7067961 0.6989474 0.706057 0.1767088
## drugs_logit_train_weighted 0.779139 0.7174757 0.6894737 0.714942 0.181457
# comparing the results on the training and test sample
source("F_summary_binary_class.R")
# we can easily apply this to all the models
# in the training sample
models_all %>%
sapply(function(x) get(x) %>%
predict(newdata = drugs_train) %>%
summary_binary_class(level_positive = "yes",
level_negative = "no",
real = drugs_train$consumption_cocaine_last_month)) %>%
# transpose the result to # have models in rows
t()
## Accuracy Sensitivity Specificity Pos Pred Value
## drugs_logit_train 0.91652 0.08333 0.99417 0.57143
## drugs_logit_train_down 0.69538 0.78125 0.68738 0.18892
## drugs_logit_train_rose 0.71758 0.80208 0.70971 0.20479
## drugs_logit_train_smote 0.75577 0.77083 0.75437 0.22630
## drugs_logit_train_up 0.72824 0.78125 0.72330 0.20833
## drugs_logit_train_weighted 0.72824 0.80208 0.72136 0.21154
## Neg Pred Value F1 Balanced Accuracy
## drugs_logit_train 0.92086 0.14545 0.53875
## drugs_logit_train_down 0.97119 0.30426 0.73431
## drugs_logit_train_rose 0.97467 0.32627 0.75590
## drugs_logit_train_smote 0.97247 0.34988 0.76260
## drugs_logit_train_up 0.97258 0.32895 0.75228
## drugs_logit_train_weighted 0.97507 0.33478 0.76172
# and do the same for the TEST data
models_all %>%
sapply(function(x) get(x) %>%
predict(newdata = drugs_test) %>%
summary_binary_class(level_positive = "yes",
level_negative = "no",
real = drugs_test$consumption_cocaine_last_month)) %>%
# transpose the result to # have models in rows
t()
## Accuracy Sensitivity Specificity Pos Pred Value
## drugs_logit_train 0.90909 0.03226 0.98834 0.20000
## drugs_logit_train_down 0.68717 0.70968 0.68513 0.16923
## drugs_logit_train_rose 0.68984 0.74194 0.68513 0.17557
## drugs_logit_train_smote 0.74866 0.64516 0.75802 0.19417
## drugs_logit_train_up 0.71925 0.67742 0.72303 0.18103
## drugs_logit_train_weighted 0.70588 0.67742 0.70845 0.17355
## Neg Pred Value F1 Balanced Accuracy
## drugs_logit_train 0.91870 0.05556 0.51030
## drugs_logit_train_down 0.96311 0.27329 0.69740
## drugs_logit_train_rose 0.96708 0.28395 0.71353
## drugs_logit_train_smote 0.95941 0.29851 0.70159
## drugs_logit_train_up 0.96124 0.28571 0.70023
## drugs_logit_train_weighted 0.96047 0.27632 0.69294
After applying the various algorithms to train the model and
afterwards predict the output for the dependent variable, the choice of
algorithm for the dataset is the K Nearest Neighbour (KNN) with
preprocessing but no cross validation. For the performance of the model
on the training data, the model trained with smote (Synthetic Minority
Oversampling Technique) performed best with a balanced accuracy of
0.76260.
It is closely followed by the weighted model method. Overall, it is
clear that the resampling method delivered better predictions by taking
care of the issue of unbalanced data.
After applying the various algorithms to train the model and afterwards
predict the output for the dependent variable, the choice of algorithm
for the dataset is rose (Random OverSampling Examples). Applying this
model on the test data gives a balanced accuracy of 0.71353. The other
statistical parameters are also not bad.
# saving predictions in a csv (comma-separated values ) file.
final_test_forecasts <- cbind(drugs_test_data, drugs_test_data_forecasts)
write.csv(final_test_forecasts, "test_data_forecasts.csv", row.names=FALSE)