In this paper I develop prediction models for cocaine consumption. To do this I will use several machine learning models: logit, probit, k-Nearest Neighbors, and Support Vector Machine.
However, due to the strong class imbalance in the data, I will apply the over-sampling method SMOTE in order to obtain a more balanced data set and, consequently, better predictive results.
Let’s load the libraries, set the working directory, set the system language to English, and read the provided training data set as drugs_train.
library(dplyr)
library(magrittr)
library(knitr)
#install.packages("DMwR")
library(readr)
library(caret)
library(AER)
library(lmtest)
library(nnet)
library(verification)
library(janitor)
library(tidyverse)
Sys.setenv(LANG = "en")
setwd("C://Users//Mateusz//Documents//Studia//Master//UW//DS WNE//ML//ML project")
drugs_train <- read_csv("drugs_train.csv")
Let’s convert the character variables into factors and split the data into a training subsample, drugs_my_train (70% of drugs_train), and a test subsample, drugs_my_test (30% of drugs_train), so that we can check the models’ performance later. Note that createDataPartition stratifies on the outcome, so the share of “Yes” cases is preserved in both subsamples:
drugs_categorical_vars <-
sapply(drugs_train[, -1], is.character) %>%
which() %>%
names()
#drugs_categorical_vars
# and apply a conversion in a loop
for (variable in drugs_categorical_vars) {
drugs_train[[variable]] <- as.factor(drugs_train[[variable]])
}
glimpse(drugs_train)
## Rows: 1,500
## Columns: 21
## $ id <chr> "train_0001", "train_0002", "train_0003~
## $ age <fct> 45-54, 25-34, 18-24, 25-34, 18-24, 18-2~
## $ gender <fct> male, male, female, female, male, femal~
## $ education <fct> "Masters degree", "University degree", ~
## $ country <fct> USA, USA, USA, USA, Australia, Australi~
## $ ethnicity <fct> Mixed-Black/Asian, Mixed-Black/Asian, M~
## $ personality_neuroticism <dbl> 57.6, 47.8, 57.6, 71.8, 56.1, 47.8, 75.~
## $ personality_extraversion <dbl> 57.3, 67.0, 43.3, 31.2, 62.3, 20.7, 69.~
## $ personality_openness <dbl> 50.1, 45.7, 55.3, 43.6, 70.2, 57.8, 62.~
## $ personality_agreeableness <dbl> 47.8, 47.8, 45.6, 56.3, 66.1, 41.2, 54.~
## $ personality_conscientiousness <dbl> 53.7, 56.0, 49.9, 31.8, 42.4, 33.6, 38.~
## $ personality_impulsiveness <dbl> 42.8, 33.8, 63.0, 63.0, 50.4, 56.5, 56.~
## $ personality_sensation <dbl> 22.4, 30.8, 62.0, 71.1, 62.0, 62.0, 22.~
## $ consumption_alcohol <fct> used in last week, used in last week, u~
## $ consumption_amphetamines <fct> used over a decade ago, never used, nev~
## $ consumption_caffeine <fct> used in last day, used in last week, us~
## $ consumption_cannabis <fct> used in last week, never used, used in ~
## $ consumption_chocolate <fct> used in last day, used in last day, use~
## $ consumption_mushrooms <fct> never used, never used, used in last ye~
## $ consumption_nicotine <fct> used in last week, never used, used in ~
## $ consumption_cocaine_last_month <fct> No, No, No, No, No, No, Yes, No, Yes, N~
#data partition
options(scipen=999)
#setting seed
set.seed(77777)
drugs_which_training <- createDataPartition(drugs_train$consumption_cocaine_last_month,
p = 0.7,
list = FALSE)
drugs_my_train <- drugs_train[c(drugs_which_training),]
drugs_my_test <- drugs_train[-c(drugs_which_training),]
# descriptive statistics
#age
drugs_my_train %>%
dplyr::select(age) %>%
count(age) %>%
group_by(age)
## # A tibble: 6 x 2
## # Groups: age [6]
## age n
## <fct> <int>
## 1 18-24 372
## 2 25-34 250
## 3 35-44 202
## 4 45-54 174
## 5 55-64 41
## 6 65+ 12
#gender
drugs_my_train %>%
dplyr::select(gender) %>%
count(gender) %>%
group_by(gender)
## # A tibble: 2 x 2
## # Groups: gender [2]
## gender n
## <fct> <int>
## 1 female 515
## 2 male 536
#educ
drugs_my_train %>%
dplyr::select(education) %>%
count(education) %>%
group_by(education)
## # A tibble: 9 x 2
## # Groups: education [9]
## education n
## <fct> <int>
## 1 Doctorate degree 46
## 2 Left school at 16 years 43
## 3 Left school at 17 years 18
## 4 Left school at 18 years 58
## 5 Left school before 16 years 12
## 6 Masters degree 168
## 7 Professional certificate/ diploma 158
## 8 Some college or university, no certificate or degree 289
## 9 University degree 259
#country
drugs_my_train %>%
dplyr::select(country) %>%
count(country) %>%
group_by(country)
## # A tibble: 7 x 2
## # Groups: country [7]
## country n
## <fct> <int>
## 1 Australia 333
## 2 Canada 4
## 3 Ireland 10
## 4 New Zealand 66
## 5 Other 32
## 6 UK 50
## 7 USA 556
#ethnicity
drugs_my_train %>%
dplyr::select(ethnicity) %>%
count(ethnicity) %>%
group_by(ethnicity)
## # A tibble: 7 x 2
## # Groups: ethnicity [7]
## ethnicity n
## <fct> <int>
## 1 Asian 18
## 2 Black 17
## 3 Mixed-Black/Asian 952
## 4 Mixed-White/Asian 14
## 5 Mixed-White/Black 37
## 6 Other 11
## 7 White 2
#summary
summary(drugs_my_train)
## id age gender
## Length:1051 18-24:372 female:515
## Class :character 25-34:250 male :536
## Mode :character 35-44:202
## 45-54:174
## 55-64: 41
## 65+ : 12
##
## education country
## Some college or university, no certificate or degree:289 Australia :333
## University degree :259 Canada : 4
## Masters degree :168 Ireland : 10
## Professional certificate/ diploma :158 New Zealand: 66
## Left school at 18 years : 58 Other : 32
## Doctorate degree : 46 UK : 50
## (Other) : 73 USA :556
## ethnicity personality_neuroticism personality_extraversion
## Asian : 18 Min. : 0.00 Min. : 8.30
## Black : 17 1st Qu.: 41.30 1st Qu.: 39.40
## Mixed-Black/Asian:952 Median : 52.00 Median : 50.10
## Mixed-White/Asian: 14 Mean : 51.74 Mean : 49.92
## Mixed-White/Black: 37 3rd Qu.: 62.30 3rd Qu.: 59.70
## Other : 11 Max. :100.00 Max. :100.00
## White : 2
## personality_openness personality_agreeableness personality_conscientiousness
## Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 41.40 1st Qu.:41.20 1st Qu.: 40.60
## Median : 52.70 Median :49.80 Median : 49.90
## Mean : 52.77 Mean :49.96 Mean : 49.96
## 3rd Qu.: 62.50 3rd Qu.:58.50 3rd Qu.: 60.90
## Max. :100.00 Max. :95.60 Max. :100.00
##
## personality_impulsiveness personality_sensation
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 33.80 1st Qu.: 38.80
## Median : 42.80 Median : 54.00
## Mean : 46.82 Mean : 52.01
## 3rd Qu.: 56.50 3rd Qu.: 71.10
## Max. :100.00 Max. :100.00
##
## consumption_alcohol consumption_amphetamines
## never used : 20 never used :560
## used in last day :275 used in last day : 63
## used in last decade : 44 used in last decade :126
## used in last month :156 used in last month : 42
## used in last week :423 used in last week : 38
## used in last year :113 used in last year :104
## used over a decade ago: 20 used over a decade ago:118
## consumption_caffeine consumption_cannabis
## never used : 12 never used :237
## used in last day :767 used in last day :260
## used in last decade : 14 used in last decade :148
## used in last month : 63 used in last month : 73
## used in last week :156 used in last week : 93
## used in last year : 32 used in last year :120
## used over a decade ago: 7 used over a decade ago:120
## consumption_chocolate consumption_mushrooms
## never used : 18 never used :544
## used in last day :469 used in last day : 4
## used in last decade : 6 used in last decade :141
## used in last month :163 used in last month : 66
## used in last week :362 used in last week : 27
## used in last year : 31 used in last year :152
## used over a decade ago: 2 used over a decade ago:117
## consumption_nicotine consumption_cocaine_last_month
## never used :244 No :962
## used in last day :346 Yes: 89
## used in last decade :112
## used in last month : 61
## used in last week : 79
## used in last year :101
## used over a decade ago:108
#box plots
box_neur <- ggplot(drugs_my_train, aes(y=personality_neuroticism))+
geom_boxplot()+
theme_gray()
box_extra <- ggplot(drugs_my_train, aes(y=personality_extraversion))+
geom_boxplot()+
theme_gray()
box_openness <- ggplot(drugs_my_train, aes(y=personality_openness))+
geom_boxplot()+
theme_gray()
box_agreeableness <- ggplot(drugs_my_train, aes(y=personality_agreeableness))+
geom_boxplot()+
theme_gray()
box_conscientiousness <- ggplot(drugs_my_train, aes(y=personality_conscientiousness))+
geom_boxplot()+
theme_gray()
box_impulsiveness <- ggplot(drugs_my_train, aes(y=personality_impulsiveness))+
geom_boxplot()+
theme_gray()
box_sensation <- ggplot(drugs_my_train, aes(y=personality_sensation))+
geom_boxplot()+
theme_gray()
library(gridExtra)
boxs <- grid.arrange(box_neur, box_extra, box_openness,
box_agreeableness, box_conscientiousness,
box_impulsiveness, box_sensation, ncol = 4)
# histograms
hist_neur <- ggplot(drugs_my_train, aes(x=personality_neuroticism))+
geom_histogram()+
xlab("Personality neuroticism")+
theme_gray()
hist_extra <- ggplot(drugs_my_train, aes(x=personality_extraversion))+
geom_histogram()+
xlab("Personality extraversion")+
theme_gray()
hist_openness <- ggplot(drugs_my_train, aes(x=personality_openness))+
geom_histogram()+
xlab("Personality openness")+
theme_gray()
hist_agreeableness <- ggplot(drugs_my_train, aes(x=personality_agreeableness))+
geom_histogram()+
xlab("Personality aggreeableness")+
theme_gray()
hist_conscientiousness <- ggplot(drugs_my_train, aes(x=personality_conscientiousness))+
geom_histogram()+
xlab("Personality conscientiousness")+
theme_gray()
hist_impulsiveness <- ggplot(drugs_my_train, aes(x=personality_impulsiveness))+
geom_histogram()+
xlab("Personality impulsiveness")+
theme_gray()
hist_sensation <- ggplot(drugs_my_train, aes(x=personality_sensation))+
geom_histogram()+
xlab("Personality sensation")+
theme_gray()
hists <- grid.arrange(hist_neur, hist_extra, hist_openness,
hist_agreeableness, hist_conscientiousness,
hist_impulsiveness, hist_sensation, ncol = 4)
#but first let's check for NA's
any(is.na(drugs_my_train))
## [1] FALSE
#let's set the contrast
options(contrasts = c("contr.treatment", # for non-ordinal factors
"contr.treatment")) # for ordinal factors
#logit - before each estimation I set the same seed in order to make the results reproducible
set.seed(77777)
drugs_logit <- glm(consumption_cocaine_last_month ~ .,
family = binomial(link = "logit"),
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id))
summary(drugs_logit)
##
## Call:
## glm(formula = consumption_cocaine_last_month ~ ., family = binomial(link = "logit"),
## data = drugs_my_train %>% dplyr::select(-id))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.84023 -0.29936 -0.12648 -0.00007 2.64158
##
## Coefficients:
## Estimate
## (Intercept) -36.3055121
## age25-34 -0.3914710
## age35-44 -1.0351050
## age45-54 -2.2307006
## age55-64 -0.1622596
## age65+ -15.4479801
## gendermale 0.0573572
## educationLeft school at 16 years 0.0588432
## educationLeft school at 17 years -0.2959200
## educationLeft school at 18 years -1.3594782
## educationLeft school before 16 years -17.8704993
## educationMasters degree 0.1566757
## educationProfessional certificate/ diploma 0.1755301
## educationSome college or university, no certificate or degree -0.7215369
## educationUniversity degree -0.2731482
## countryCanada -18.1127263
## countryIreland -17.7681065
## countryNew Zealand 0.2746791
## countryOther -1.0803952
## countryUK 1.0164840
## countryUSA -0.1195872
## ethnicityBlack -0.9142367
## ethnicityMixed-Black/Asian -2.4081828
## ethnicityMixed-White/Asian -0.8074202
## ethnicityMixed-White/Black -1.6972824
## ethnicityOther -2.1985035
## ethnicityWhite -21.8267831
## personality_neuroticism 0.0184237
## personality_extraversion 0.0225965
## personality_openness -0.0140442
## personality_agreeableness -0.0245856
## personality_conscientiousness -0.0005938
## personality_impulsiveness -0.0029824
## personality_sensation 0.0053448
## consumption_alcoholused in last day 18.2743197
## consumption_alcoholused in last decade 16.6841924
## consumption_alcoholused in last month 17.9893008
## consumption_alcoholused in last week 18.3622142
## consumption_alcoholused in last year 18.2691458
## consumption_alcoholused over a decade ago 2.0700543
## consumption_amphetaminesused in last day 1.6095070
## consumption_amphetaminesused in last decade 0.8146362
## consumption_amphetaminesused in last month 1.7171346
## consumption_amphetaminesused in last week 2.2455047
## consumption_amphetaminesused in last year 1.1237174
## consumption_amphetaminesused over a decade ago 0.1649665
## consumption_caffeineused in last day -1.8693831
## consumption_caffeineused in last decade -1.7119768
## consumption_caffeineused in last month -2.3187611
## consumption_caffeineused in last week -1.9734750
## consumption_caffeineused in last year -19.3428495
## consumption_caffeineused over a decade ago -17.6031860
## consumption_cannabisused in last day 0.3247062
## consumption_cannabisused in last decade 0.1098149
## consumption_cannabisused in last month 0.3046906
## consumption_cannabisused in last week 0.7734123
## consumption_cannabisused in last year 0.3987780
## consumption_cannabisused over a decade ago -15.4729936
## consumption_chocolateused in last day 18.5661057
## consumption_chocolateused in last decade 0.4615158
## consumption_chocolateused in last month 18.7910568
## consumption_chocolateused in last week 18.0975242
## consumption_chocolateused in last year 18.5715337
## consumption_chocolateused over a decade ago 35.3424751
## consumption_mushroomsused in last day 2.4334294
## consumption_mushroomsused in last decade 0.7483371
## consumption_mushroomsused in last month 1.8366511
## consumption_mushroomsused in last week 0.3576258
## consumption_mushroomsused in last year 0.6985941
## consumption_mushroomsused over a decade ago -1.3053068
## consumption_nicotineused in last day 0.7642505
## consumption_nicotineused in last decade 0.1041174
## consumption_nicotineused in last month -0.0570737
## consumption_nicotineused in last week 0.1128406
## consumption_nicotineused in last year -2.1455159
## consumption_nicotineused over a decade ago 0.1198158
## Std. Error
## (Intercept) 4956.7291926
## age25-34 0.3723988
## age35-44 0.5914976
## age45-54 1.1154791
## age55-64 1.2022579
## age65+ 4137.0273610
## gendermale 0.3370741
## educationLeft school at 16 years 1.2652602
## educationLeft school at 17 years 1.2869500
## educationLeft school at 18 years 1.2112855
## educationLeft school before 16 years 4265.4801884
## educationMasters degree 0.9500422
## educationProfessional certificate/ diploma 0.9875304
## educationSome college or university, no certificate or degree 0.9611212
## educationUniversity degree 0.9636869
## countryCanada 7600.2658529
## countryIreland 4817.4484786
## countryNew Zealand 0.5639500
## countryOther 0.8799397
## countryUK 0.6036812
## countryUSA 0.4011126
## ethnicityBlack 1.6524072
## ethnicityMixed-Black/Asian 0.9826639
## ethnicityMixed-White/Asian 1.2314633
## ethnicityMixed-White/Black 1.1256822
## ethnicityOther 1.4646335
## ethnicityWhite 10762.6060430
## personality_neuroticism 0.0120329
## personality_extraversion 0.0113933
## personality_openness 0.0106299
## personality_agreeableness 0.0107389
## personality_conscientiousness 0.0130459
## personality_impulsiveness 0.0114761
## personality_sensation 0.0087353
## consumption_alcoholused in last day 3591.2585613
## consumption_alcoholused in last decade 3591.2587439
## consumption_alcoholused in last month 3591.2585754
## consumption_alcoholused in last week 3591.2585564
## consumption_alcoholused in last year 3591.2585776
## consumption_alcoholused over a decade ago 4938.8151477
## consumption_amphetaminesused in last day 0.5225222
## consumption_amphetaminesused in last decade 0.4979291
## consumption_amphetaminesused in last month 0.5703810
## consumption_amphetaminesused in last week 0.5678197
## consumption_amphetaminesused in last year 0.4718404
## consumption_amphetaminesused over a decade ago 0.8769268
## consumption_caffeineused in last day 1.3217613
## consumption_caffeineused in last decade 1.8276628
## consumption_caffeineused in last month 1.4413051
## consumption_caffeineused in last week 1.3644459
## consumption_caffeineused in last year 2875.0957878
## consumption_caffeineused over a decade ago 5509.2087165
## consumption_cannabisused in last day 0.7547413
## consumption_cannabisused in last decade 0.7696436
## consumption_cannabisused in last month 0.8414888
## consumption_cannabisused in last week 0.7942740
## consumption_cannabisused in last year 0.7933655
## consumption_cannabisused over a decade ago 1435.5469492
## consumption_chocolateused in last day 3416.4341482
## consumption_chocolateused in last decade 7469.9849985
## consumption_chocolateused in last month 3416.4341575
## consumption_chocolateused in last week 3416.4341521
## consumption_chocolateused in last year 3416.4342590
## consumption_chocolateused over a decade ago 12723.6163447
## consumption_mushroomsused in last day 1.8463911
## consumption_mushroomsused in last decade 0.4615905
## consumption_mushroomsused in last month 0.5747346
## consumption_mushroomsused in last week 0.7672468
## consumption_mushroomsused in last year 0.4584395
## consumption_mushroomsused over a decade ago 1.1852805
## consumption_nicotineused in last day 0.5346443
## consumption_nicotineused in last decade 0.7420426
## consumption_nicotineused in last month 0.7522183
## consumption_nicotineused in last week 0.6667357
## consumption_nicotineused in last year 1.1372076
## consumption_nicotineused over a decade ago 1.2069052
## z value Pr(>|z|)
## (Intercept) -0.007 0.99416
## age25-34 -1.051 0.29316
## age35-44 -1.750 0.08012
## age45-54 -2.000 0.04553
## age55-64 -0.135 0.89264
## age65+ -0.004 0.99702
## gendermale 0.170 0.86488
## educationLeft school at 16 years 0.047 0.96291
## educationLeft school at 17 years -0.230 0.81814
## educationLeft school at 18 years -1.122 0.26172
## educationLeft school before 16 years -0.004 0.99666
## educationMasters degree 0.165 0.86901
## educationProfessional certificate/ diploma 0.178 0.85892
## educationSome college or university, no certificate or degree -0.751 0.45282
## educationUniversity degree -0.283 0.77684
## countryCanada -0.002 0.99810
## countryIreland -0.004 0.99706
## countryNew Zealand 0.487 0.62621
## countryOther -1.228 0.21952
## countryUK 1.684 0.09222
## countryUSA -0.298 0.76560
## ethnicityBlack -0.553 0.58007
## ethnicityMixed-Black/Asian -2.451 0.01426
## ethnicityMixed-White/Asian -0.656 0.51204
## ethnicityMixed-White/Black -1.508 0.13161
## ethnicityOther -1.501 0.13334
## ethnicityWhite -0.002 0.99838
## personality_neuroticism 1.531 0.12574
## personality_extraversion 1.983 0.04733
## personality_openness -1.321 0.18643
## personality_agreeableness -2.289 0.02206
## personality_conscientiousness -0.046 0.96369
## personality_impulsiveness -0.260 0.79496
## personality_sensation 0.612 0.54063
## consumption_alcoholused in last day 0.005 0.99594
## consumption_alcoholused in last decade 0.005 0.99629
## consumption_alcoholused in last month 0.005 0.99600
## consumption_alcoholused in last week 0.005 0.99592
## consumption_alcoholused in last year 0.005 0.99594
## consumption_alcoholused over a decade ago 0.000 0.99967
## consumption_amphetaminesused in last day 3.080 0.00207
## consumption_amphetaminesused in last decade 1.636 0.10183
## consumption_amphetaminesused in last month 3.011 0.00261
## consumption_amphetaminesused in last week 3.955 0.0000767
## consumption_amphetaminesused in last year 2.382 0.01724
## consumption_amphetaminesused over a decade ago 0.188 0.85078
## consumption_caffeineused in last day -1.414 0.15727
## consumption_caffeineused in last decade -0.937 0.34891
## consumption_caffeineused in last month -1.609 0.10766
## consumption_caffeineused in last week -1.446 0.14808
## consumption_caffeineused in last year -0.007 0.99463
## consumption_caffeineused over a decade ago -0.003 0.99745
## consumption_cannabisused in last day 0.430 0.66703
## consumption_cannabisused in last decade 0.143 0.88654
## consumption_cannabisused in last month 0.362 0.71729
## consumption_cannabisused in last week 0.974 0.33019
## consumption_cannabisused in last year 0.503 0.61522
## consumption_cannabisused over a decade ago -0.011 0.99140
## consumption_chocolateused in last day 0.005 0.99566
## consumption_chocolateused in last decade 0.000 0.99995
## consumption_chocolateused in last month 0.006 0.99561
## consumption_chocolateused in last week 0.005 0.99577
## consumption_chocolateused in last year 0.005 0.99566
## consumption_chocolateused over a decade ago 0.003 0.99778
## consumption_mushroomsused in last day 1.318 0.18752
## consumption_mushroomsused in last decade 1.621 0.10497
## consumption_mushroomsused in last month 3.196 0.00140
## consumption_mushroomsused in last week 0.466 0.64113
## consumption_mushroomsused in last year 1.524 0.12755
## consumption_mushroomsused over a decade ago -1.101 0.27078
## consumption_nicotineused in last day 1.429 0.15287
## consumption_nicotineused in last decade 0.140 0.88841
## consumption_nicotineused in last month -0.076 0.93952
## consumption_nicotineused in last week 0.169 0.86561
## consumption_nicotineused in last year -1.887 0.05921
## consumption_nicotineused over a decade ago 0.099 0.92092
##
## (Intercept)
## age25-34
## age35-44 .
## age45-54 *
## age55-64
## age65+
## gendermale
## educationLeft school at 16 years
## educationLeft school at 17 years
## educationLeft school at 18 years
## educationLeft school before 16 years
## educationMasters degree
## educationProfessional certificate/ diploma
## educationSome college or university, no certificate or degree
## educationUniversity degree
## countryCanada
## countryIreland
## countryNew Zealand
## countryOther
## countryUK .
## countryUSA
## ethnicityBlack
## ethnicityMixed-Black/Asian *
## ethnicityMixed-White/Asian
## ethnicityMixed-White/Black
## ethnicityOther
## ethnicityWhite
## personality_neuroticism
## personality_extraversion *
## personality_openness
## personality_agreeableness *
## personality_conscientiousness
## personality_impulsiveness
## personality_sensation
## consumption_alcoholused in last day
## consumption_alcoholused in last decade
## consumption_alcoholused in last month
## consumption_alcoholused in last week
## consumption_alcoholused in last year
## consumption_alcoholused over a decade ago
## consumption_amphetaminesused in last day **
## consumption_amphetaminesused in last decade
## consumption_amphetaminesused in last month **
## consumption_amphetaminesused in last week ***
## consumption_amphetaminesused in last year *
## consumption_amphetaminesused over a decade ago
## consumption_caffeineused in last day
## consumption_caffeineused in last decade
## consumption_caffeineused in last month
## consumption_caffeineused in last week
## consumption_caffeineused in last year
## consumption_caffeineused over a decade ago
## consumption_cannabisused in last day
## consumption_cannabisused in last decade
## consumption_cannabisused in last month
## consumption_cannabisused in last week
## consumption_cannabisused in last year
## consumption_cannabisused over a decade ago
## consumption_chocolateused in last day
## consumption_chocolateused in last decade
## consumption_chocolateused in last month
## consumption_chocolateused in last week
## consumption_chocolateused in last year
## consumption_chocolateused over a decade ago
## consumption_mushroomsused in last day
## consumption_mushroomsused in last decade
## consumption_mushroomsused in last month **
## consumption_mushroomsused in last week
## consumption_mushroomsused in last year
## consumption_mushroomsused over a decade ago
## consumption_nicotineused in last day
## consumption_nicotineused in last decade
## consumption_nicotineused in last month
## consumption_nicotineused in last week
## consumption_nicotineused in last year .
## consumption_nicotineused over a decade ago
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 609.70 on 1050 degrees of freedom
## Residual deviance: 375.75 on 975 degrees of freedom
## AIC: 527.75
##
## Number of Fisher Scoring iterations: 19
#I will not print the summaries of the next models - only that of the best-performing model at the end of the script
lrtest(drugs_logit)
## Likelihood ratio test
##
## Model 1: consumption_cocaine_last_month ~ age + gender + education + country +
## ethnicity + personality_neuroticism + personality_extraversion +
## personality_openness + personality_agreeableness + personality_conscientiousness +
## personality_impulsiveness + personality_sensation + consumption_alcohol +
## consumption_amphetamines + consumption_caffeine + consumption_cannabis +
## consumption_chocolate + consumption_mushrooms + consumption_nicotine
## Model 2: consumption_cocaine_last_month ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 76 -187.88
## 2 1 -304.85 -75 233.95 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# the LR test compares the model with all the variables to the model with
# only a constant; we reject the null hypothesis that all slope coefficients
# are jointly zero, i.e. the regressors are jointly significant
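As a quick check, the LR statistic can be recomputed by hand from the log-likelihoods reported above (rounded values):
# LR = 2 * (logLik_full - logLik_restricted), chi-squared with 75 df under the null
2 * (-187.88 - (-304.85))  # = 233.94, matching the reported Chisq of 233.95 up to rounding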
drugs_logit1_fitted <- predict(drugs_logit,
type = "response")
head(drugs_logit1_fitted)
## 1 2 3
## 0.0384822629834267668225 0.0246378322827037928022 0.0211260253326832521392
## 4 5 6
## 0.0323764206336811660725 0.4985442961268529704633 0.0000000000000002220446
table(drugs_my_train$consumption_cocaine_last_month,
ifelse(drugs_logit1_fitted > 0.5, # condition
"Yes", # what returned if condition TRUE
"No"))
##
## No Yes
## No 945 17
## Yes 69 20
drugs_probit1 <- glm(consumption_cocaine_last_month ~ .,
family = binomial(link = "probit"),
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id))
#summary(drugs_probit1)
lrtest(drugs_probit1)
## Likelihood ratio test
##
## Model 1: consumption_cocaine_last_month ~ age + gender + education + country +
## ethnicity + personality_neuroticism + personality_extraversion +
## personality_openness + personality_agreeableness + personality_conscientiousness +
## personality_impulsiveness + personality_sensation + consumption_alcohol +
## consumption_amphetamines + consumption_caffeine + consumption_cannabis +
## consumption_chocolate + consumption_mushrooms + consumption_nicotine
## Model 2: consumption_cocaine_last_month ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 76 -185.58
## 2 1 -304.85 -75 238.54 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# we reject the null hypothesis
# that the model is NOT jointly significant
# lets calculate fitted values and compare them with real values
drugs_probit1_fitted <- predict(drugs_probit1,
type = "response")
# and compare them with the predictions from logit1
plot(drugs_logit1_fitted,
drugs_probit1_fitted)
cor(drugs_logit1_fitted,
drugs_probit1_fitted)
## [1] 0.9977558
table(ifelse(drugs_logit1_fitted > 0.5,
"Yes",
"No"),
ifelse(drugs_probit1_fitted > 0.5,
"Yes",
"No"))
##
## No Yes
## No 1011 3
## Yes 1 36
Let’s recall the accuracy measures.

### Sensitivity

Following the ML classes, let’s create the confusionMatrix table with the following measures:
(ctable <- confusionMatrix(data = as.factor(ifelse(drugs_logit1_fitted > 0.5,
"Yes",
"No")
),
reference = drugs_my_train$consumption_cocaine_last_month,
positive = "Yes")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 945 69
## Yes 17 20
##
## Accuracy : 0.9182
## 95% CI : (0.8999, 0.934)
## No Information Rate : 0.9153
## P-Value [Acc > NIR] : 0.3964
##
## Kappa : 0.2817
##
## Mcnemar's Test P-Value : 0.00000003809
##
## Sensitivity : 0.22472
## Specificity : 0.98233
## Pos Pred Value : 0.54054
## Neg Pred Value : 0.93195
## Prevalence : 0.08468
## Detection Rate : 0.01903
## Detection Prevalence : 0.03520
## Balanced Accuracy : 0.60352
##
## 'Positive' Class : Yes
##
# the output includes
# - classification table
# TN = True Negative
# FN = False Negative
# TP = True Positive
# FP = False Positive
# n = TN+TP+FN+FP
# Reference
# Prediction No Yes
#----------------------------
# No TN FN
# Yes FP TP
#-----------------------------
# - additional measures
# Accuracy = (TN + TP) / n - accuracy, percent of correct predictions
# Sensitivity = TP / (TP + FN) - sensitivity
# Specificity = TN / (TN + FP) - specificity
# PPV = TP / (TP + FP) - positive predictive value - the share of
# correctly predicted "successes"
# NPV = TN / (TN + FN) - negative predictive value - the share of
# correctly predicted "failures"
# Balanced Accuracy = (sensitivity + specificity)/2
# Precision - the same as PPV
# Recall - the same as sensitivity
# F1 - harmonic mean of Precision and Recall (range between 0 and 1)
# equally weights the precision and recall, which is criticized
# as different errors may have different cost for the company
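As a quick sanity check (not part of the original estimation pipeline), we can reproduce the key measures by hand from the classification table above, where TP = 20, FN = 69, FP = 17 and TN = 945:
# manual computation of the accuracy measures from the confusion matrix counts
TP <- 20; FN <- 69; FP <- 17; TN <- 945
(TP + TN) / (TP + TN + FP + FN)  # Accuracy    = 0.9182
TP / (TP + FN)                   # Sensitivity = 0.2247
TN / (TN + FP)                   # Specificity = 0.9823
TP / (TP + FP)                   # PPV         = 0.5405
TN / (TN + FN)                   # NPV         = 0.9320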
ctable$overall
## Accuracy Kappa AccuracyLower AccuracyUpper
## 0.91817316841104 0.28173871582963 0.89992970609637 0.93403052275286
## AccuracyNull AccuracyPValue McnemarPValue
## 0.91531874405328 0.39637990877588 0.00000003809314
ctable$byClass
## Sensitivity Specificity Pos Pred Value
## 0.22471910 0.98232848 0.54054054
## Neg Pred Value Precision Recall
## 0.93195266 0.54054054 0.22471910
## F1 Prevalence Detection Rate
## 0.31746032 0.08468126 0.01902950
## Detection Prevalence Balanced Accuracy
## 0.03520457 0.60352379
# we can easily get selected measures into a vector
c(ctable$overall[1],
ctable$byClass[c(1:4, 7, 11)])
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.9181732 0.2247191 0.9823285 0.5405405
## Neg Pred Value F1 Balanced Accuracy
## 0.9319527 0.3174603 0.6035238
# lets write a simple function that will
# calculate these measures for any model
summary_binary <- function(predicted_probs,
real,
cutoff = 0.5,
level_positive = "Yes",
level_negative = "No") {
# save the classification table
ctable <- confusionMatrix(as.factor(ifelse(predicted_probs > cutoff,
level_positive,
level_negative)),
real,
level_positive)
# extract selected statistics into a vector
stats <- round(c(ctable$overall[1],
ctable$byClass[c(1:4, 7, 11)]),
5)
# and return them as a function result
return(stats)
}
summary_binary(predicted_probs = drugs_logit1_fitted,
real = drugs_my_train$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.91817 0.22472 0.98233 0.54054
## Neg Pred Value F1 Balanced Accuracy
## 0.93195 0.31746 0.60352
#Let's create the ROC plot (predicted_probabilities_of_success)
roc.plot(ifelse(drugs_my_train$consumption_cocaine_last_month == "Yes", 1, 0),
# predicted probabilities
drugs_logit1_fitted)
# on the vertical axis we have sensitivity (True Positive Rate),
# and on the horizontal axis 1 - specificity (False Positive Rate)
# Also we can calculate the Area Under Curve
roc.area(ifelse(drugs_my_train$consumption_cocaine_last_month == "Yes", 1, 0),
# predicted probabilities
drugs_logit1_fitted)
## $A
## [1] 0.9202621
##
## $n.total
## [1] 1051
##
## $n.events
## [1] 89
##
## $n.noevents
## [1] 962
##
## $p.value
## [1] 0.000000000000000000000000000000000000001056909
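The AUC also has a rank interpretation: it is the probability that a randomly chosen “Yes” case receives a higher fitted probability than a randomly chosen “No” case. A quick base-R check of this equivalence (a sanity check, not part of the original pipeline):
# Mann-Whitney rank formulation of the AUC; should match roc.area()$A above
y  <- ifelse(drugs_my_train$consumption_cocaine_last_month == "Yes", 1, 0)
r  <- rank(drugs_logit1_fitted)
n1 <- sum(y); n0 <- sum(1 - y)
(sum(r[y == 1]) - n1 * (n1 + 1) / 2) / (n1 * n0)  # ~ 0.9203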
To obtain more reliable estimates of the models’ out-of-sample performance, we will use cross-validation methods.
# First, let's check the percentage shares of cocaine consumption in our train and test subsamples
tabyl(drugs_my_train$consumption_cocaine_last_month)
## drugs_my_train$consumption_cocaine_last_month n percent
## No 962 0.91531874
## Yes 89 0.08468126
tabyl(drugs_my_test$consumption_cocaine_last_month)
## drugs_my_test$consumption_cocaine_last_month n percent
## No 411 0.91536748
## Yes 38 0.08463252
# Let's start by setting the train control method to "none"
ctrl_nocv <- trainControl(method = "none")
# and use it in model estimation
set.seed(77777)
drugs_logit2_train <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
# model type
method = "glm",
# family of models
family = "binomial",
# train control
trControl = ctrl_nocv)
#summary(drugs_logit2_train)
# prediction
drugs_logit2_fitted <- predict(drugs_logit2_train,
drugs_my_train)
head(drugs_logit2_fitted)
## [1] No No No No No No
## Levels: No Yes
drugs_logit2_fitted <- predict(drugs_logit2_train,
drugs_my_train,
type = "prob")
head(drugs_logit2_fitted)
## No Yes
## 1 0.9615177 0.0384822629834267668225
## 2 0.9753622 0.0246378322827037928022
## 3 0.9788740 0.0211260253326832521392
## 4 0.9676236 0.0323764206336811660725
## 5 0.5014557 0.4985442961268529704633
## 6 1.0000000 0.0000000000000002220446
Let’s examine the forecast error on the TEST sample:
drugs_logit2_forecasts <- predict(drugs_logit2_train,
drugs_my_test,
type = "prob")
# confusion matrix
confusionMatrix(data = as.factor(ifelse(drugs_logit2_forecasts["Yes"] > 0.5,
"Yes",
"No")),
reference = drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 400 36
## Yes 11 2
##
## Accuracy : 0.8953
## 95% CI : (0.8632, 0.9221)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.9427253
##
## Kappa : 0.0369
##
## Mcnemar's Test P-Value : 0.0004639
##
## Sensitivity : 0.052632
## Specificity : 0.973236
## Pos Pred Value : 0.153846
## Neg Pred Value : 0.917431
## Prevalence : 0.084633
## Detection Rate : 0.004454
## Detection Prevalence : 0.028953
## Balanced Accuracy : 0.512934
##
## 'Positive' Class : Yes
##
# very poor performance - sensitivity on the minority class is only about 5%
# ROC/AUC
roc.area(ifelse(drugs_my_test$consumption_cocaine_last_month == "Yes", 1, 0),
drugs_logit2_forecasts[,"Yes"])$A
## [1] 0.6927263
First, I will use 10-fold cross-validation.
ctrl_cv10 <- trainControl(method = "cv",
number = 10)
set.seed(77777)
drugs_logit2_train2 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "glm",
family = "binomial",
# train control - now WITH cross-validation
trControl = ctrl_cv10)
drugs_logit2_train2
## Generalized Linear Model
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 946, 946, 946, 945, 946, 946, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8924888 0.1383457
drugs_logit2_train2$resample
## Accuracy Kappa Resample
## 1 0.9142857 0.26914153 Fold01
## 2 0.8761905 -0.05568445 Fold02
## 3 0.8952381 0.10672854 Fold03
## 4 0.8867925 0.18980892 Fold04
## 5 0.8761905 -0.05568445 Fold05
## 6 0.8571429 0.13223140 Fold06
## 7 0.9142857 0.42413163 Fold07
## 8 0.8857143 -0.04477612 Fold08
## 9 0.8761905 -0.05568445 Fold09
## 10 0.9428571 0.47324415 Fold10
summary(drugs_logit2_train2$resample$Accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8571 0.8762 0.8863 0.8925 0.9095 0.9429
ctrl_cv10a <- trainControl(method = "cv",
number = 10,
# we enable calculating probabilities
# of individual levels in all validation steps
classProbs = TRUE,
# we change the function for summary measures
summaryFunction = twoClassSummary)
set.seed(77777)
drugs_logit2_train3 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "glm",
family = "binomial",
metric = "ROC",
# and the new training control
trControl = ctrl_cv10a)
drugs_logit2_train3
## Generalized Linear Model
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 946, 946, 946, 945, 946, 946, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7629618 0.9604811 0.1597222
ctrl_cv5x3 <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3)
set.seed(77777)
drugs_logit2_train4 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "glm",
family = "binomial",
# and the new training control
trControl = ctrl_cv5x3)
drugs_logit2_train4
## Generalized Linear Model
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8905748 0.1618484
ctrl_cv5x3a <- trainControl(method = "repeatedcv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
repeats = 3)
set.seed(77777)
drugs_logit2_train5 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "glm",
family = "binomial",
metric = "ROC",
# and the new training control
trControl = ctrl_cv5x3a)
drugs_logit2_train5
## Generalized Linear Model
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7681566 0.9563472 0.1788671
#summary(drugs_logit2_train5)
Let’s check whether the coefficients of all these models are equal:
identical(coef(drugs_logit2_train$finalModel),
coef(drugs_logit2_train2$finalModel))
## [1] TRUE
identical(coef(drugs_logit2_train$finalModel),
coef(drugs_logit2_train3$finalModel))
## [1] TRUE
identical(coef(drugs_logit2_train$finalModel),
coef(drugs_logit2_train4$finalModel))
## [1] TRUE
identical(coef(drugs_logit2_train$finalModel),
coef(drugs_logit2_train5$finalModel))
## [1] TRUE
# All are identical: caret always refits the final model on the full training
# set, so the resampling scheme affects only the performance estimates,
# not the final coefficients
Since our data is imbalanced (91% “No” and 9% “Yes” in the cocaine consumption variable), I will present and use an over-sampling method. With the SMOTE function of the DMwR package I will increase the number of “Yes” observations by 2000% and sample the “No” class at 100% of the newly created minority cases. SMOTE creates new minority observations by interpolating between existing ones and their k nearest neighbors (here k = 5). This operation results in an almost perfectly balanced data set of increased size.
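To see where the resulting class counts come from (assuming DMwR’s documented behavior of perc.over and perc.under): perc.over = 2000 generates 2000/100 = 20 synthetic cases per minority observation, giving 89 + 20 * 89 = 1,869 “Yes” cases; perc.under = 100 then samples 100% of the newly created minority cases from the majority class, i.e. 20 * 89 = 1,780 “No” cases, exactly the counts reported below.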
#install.packages("DMwR")
library(DMwR)
drugs_my_train %>%
dplyr::select(consumption_cocaine_last_month) %>%
group_by(consumption_cocaine_last_month) %>%
summarise(cnt = n()) %>%
mutate(freq = round(cnt / sum(cnt), 3)) %>%
arrange(desc(freq))
## # A tibble: 2 x 3
## consumption_cocaine_last_month cnt freq
## <fct> <int> <dbl>
## 1 No 962 0.915
## 2 Yes 89 0.085
#SMOTE works by choosing one of your observations
#representing the minority class, finding its nearest
#neighbors (you can specify how many),
#and using the relationship between the
#chosen observation and the neighbor(s)
#to generate a new example of the minority class.
#https://towardsdatascience.com/balancing-act-classification-with-imbalanced-data-cea06df39781
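To make the mechanism concrete, here is a minimal sketch of the interpolation step for one numeric feature (an illustration with made-up values, not DMwR's exact implementation):
# SMOTE-style interpolation: a synthetic point lies on the segment between
# a minority observation and one of its nearest minority-class neighbours
x        <- 62.0      # e.g. personality_sensation of a "Yes" case (hypothetical)
neighbor <- 71.1      # the same feature for one of its nearest "Yes" neighbours
gap      <- runif(1)  # random weight in [0, 1]
synthetic <- x + gap * (neighbor - x)
synthetic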
#The DMwR package has to be installed from the GitHub mirror of its CRAN archive,
#because it is no longer available for my R version:
#install.packages("remotes")
#remotes::install_github("cran/DMwR")
#library(DMwR)
smote_dataset <- as.data.frame(drugs_my_train)
train_balanced <- SMOTE(consumption_cocaine_last_month ~ .,
data = smote_dataset %>%
dplyr::select(-id),
perc.over = 2000, k=5, perc.under = 100)
train_balanced %>%
dplyr::select(consumption_cocaine_last_month) %>%
group_by(consumption_cocaine_last_month) %>%
summarise(cnt = n()) %>%
mutate(freq = round(cnt / sum(cnt), 3)) %>%
arrange(desc(freq))
## # A tibble: 2 x 3
## consumption_cocaine_last_month cnt freq
## <fct> <int> <dbl>
## 1 Yes 1869 0.512
## 2 No 1780 0.488
library(tidyverse)
train_balanced <- as_tibble(train_balanced)
drugs_my_train %>%
dplyr::select(education) %>%
group_by(education) %>%
summarise(cnt = n()) %>%
mutate(freq = round(cnt / sum(cnt), 3)) %>%
arrange(desc(freq))
## # A tibble: 9 x 3
## education cnt freq
## <fct> <int> <dbl>
## 1 Some college or university, no certificate or degree 289 0.275
## 2 University degree 259 0.246
## 3 Masters degree 168 0.16
## 4 Professional certificate/ diploma 158 0.15
## 5 Left school at 18 years 58 0.055
## 6 Doctorate degree 46 0.044
## 7 Left school at 16 years 43 0.041
## 8 Left school at 17 years 18 0.017
## 9 Left school before 16 years 12 0.011
train_balanced %>%
dplyr::select(education) %>%
group_by(education) %>%
summarise(cnt = n()) %>%
mutate(freq = round(cnt / sum(cnt), 3)) %>%
arrange(desc(freq))
## # A tibble: 9 x 3
## education cnt freq
## <fct> <int> <dbl>
## 1 Some college or university, no certificate or degree 1119 0.307
## 2 University degree 780 0.214
## 3 Masters degree 573 0.157
## 4 Professional certificate/ diploma 560 0.153
## 5 Left school at 18 years 188 0.052
## 6 Doctorate degree 182 0.05
## 7 Left school at 16 years 144 0.039
## 8 Left school at 17 years 80 0.022
## 9 Left school before 16 years 23 0.006
#hist comparison
hist_neur <- ggplot(drugs_my_train, aes(x=personality_neuroticism))+
geom_histogram()+
xlab("Personality neuroticism")+
theme_gray()
hist_extra <- ggplot(drugs_my_train, aes(x=personality_extraversion))+
geom_histogram()+
xlab("Personality extraversion")+
theme_gray()
hist_openness <- ggplot(drugs_my_train, aes(x=personality_openness))+
geom_histogram()+
xlab("Personality openness")+
theme_gray()
hist_agreeableness <- ggplot(drugs_my_train, aes(x=personality_agreeableness))+
geom_histogram()+
xlab("Personality aggreeableness")+
theme_gray()
hist_conscientiousness <- ggplot(drugs_my_train, aes(x=personality_conscientiousness))+
geom_histogram()+
xlab("Personality conscientiousness")+
theme_gray()
hist_impulsiveness <- ggplot(drugs_my_train, aes(x=personality_impulsiveness))+
geom_histogram()+
xlab("Personality impulsiveness")+
theme_gray()
hist_sensation <- ggplot(drugs_my_train, aes(x=personality_sensation))+
geom_histogram()+
xlab("Personality sensation")+
theme_gray()
#balanced hists
hist_neur_balanced <- ggplot(train_balanced, aes(x=personality_neuroticism))+
geom_histogram()+
xlab("Personality neuroticism balanced") +
theme_gray()
hist_extra_balanced <- ggplot(train_balanced, aes(x=personality_extraversion))+
geom_histogram()+
xlab("Personality extraversion balanced") +
theme_gray()
hist_openness_balanced <- ggplot(train_balanced, aes(x=personality_openness))+
geom_histogram()+
xlab("Personality openness balanced") +
theme_gray()
hist_agreeableness_balanced <- ggplot(train_balanced, aes(x=personality_agreeableness))+
geom_histogram()+
xlab("Personality agreeableness balanced") +
theme_gray()
hist_conscientiousness_balanced <- ggplot(train_balanced, aes(x=personality_conscientiousness))+
geom_histogram()+
xlab("Personality conscientiousness balanced") +
theme_gray()
hist_impulsiveness_balanced <- ggplot(train_balanced, aes(x=personality_impulsiveness))+
geom_histogram()+
xlab("Personality impulsiveness balanced") +
theme_gray()
hist_sensation_balanced <- ggplot(train_balanced, aes(x=personality_sensation))+
geom_histogram()+
xlab("Personality sensation balanced") +
theme_gray()
#comparison
hists_balanced <- grid.arrange(hist_neur, hist_neur_balanced,
hist_extra, hist_extra_balanced,
hist_openness, hist_openness_balanced,
hist_agreeableness, hist_agreeableness_balanced,
hist_conscientiousness, hist_conscientiousness_balanced,
hist_impulsiveness, hist_impulsiveness_balanced,
hist_sensation, hist_sensation_balanced,
ncol = 2)
# Balanced logit
drugs_logit_balanced1 <- glm(consumption_cocaine_last_month ~ .,
family = binomial(link = "logit"),
data = train_balanced)
#summary(drugs_logit_balanced1)
lrtest(drugs_logit_balanced1)
## Likelihood ratio test
##
## Model 1: consumption_cocaine_last_month ~ age + gender + education + country +
## ethnicity + personality_neuroticism + personality_extraversion +
## personality_openness + personality_agreeableness + personality_conscientiousness +
## personality_impulsiveness + personality_sensation + consumption_alcohol +
## consumption_amphetamines + consumption_caffeine + consumption_cannabis +
## consumption_chocolate + consumption_mushrooms + consumption_nicotine
## Model 2: consumption_cocaine_last_month ~ 1
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 76 -1234.3
## 2 1 -2528.2 -75 2587.9 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drugs_logit_balanced1_fitted1 <- predict(drugs_logit_balanced1,
type = "response")
head(drugs_logit_balanced1_fitted1)
## 1 2 3 4
## 0.1168222281481750 0.4157261815679076 0.0000000001631131 0.0000000246541759
## 5 6
## 0.1326055810786278 0.0064752428665410
table(train_balanced$consumption_cocaine_last_month,
ifelse(drugs_logit_balanced1_fitted1 > 0.5, # condition
"Yes", # what returned if condition TRUE
"No"))
##
## No Yes
## No 1453 327
## Yes 205 1664
(ctable_balanced <- confusionMatrix(data = as.factor(ifelse(drugs_logit_balanced1_fitted1 > 0.5,
"Yes",
"No")
),
reference = train_balanced$consumption_cocaine_last_month,
positive = "Yes")
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1453 205
## Yes 327 1664
##
## Accuracy : 0.8542
## 95% CI : (0.8423, 0.8655)
## No Information Rate : 0.5122
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7078
##
## Mcnemar's Test P-Value : 0.0000001554
##
## Sensitivity : 0.8903
## Specificity : 0.8163
## Pos Pred Value : 0.8358
## Neg Pred Value : 0.8764
## Prevalence : 0.5122
## Detection Rate : 0.4560
## Detection Prevalence : 0.5456
## Balanced Accuracy : 0.8533
##
## 'Positive' Class : Yes
##
str(ctable_balanced)
## List of 6
## $ positive: chr "Yes"
## $ table : 'table' int [1:2, 1:2] 1453 327 205 1664
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Prediction: chr [1:2] "No" "Yes"
## .. ..$ Reference : chr [1:2] "No" "Yes"
## $ overall : Named num [1:7] 0.854 0.708 0.842 0.866 0.512 ...
## ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
## $ byClass : Named num [1:11] 0.89 0.816 0.836 0.876 0.836 ...
## ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
## $ mode : chr "sens_spec"
## $ dots : list()
## - attr(*, "class")= chr "confusionMatrix"
ctable_balanced$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.854206631954 0.707762801790 0.842336036626 0.865506690588 0.512195121951
## AccuracyPValue McnemarPValue
## 0.000000000000 0.000000155424
ctable_balanced$byClass
## Sensitivity Specificity Pos Pred Value
## 0.8903157 0.8162921 0.8357609
## Neg Pred Value Precision Recall
## 0.8763571 0.8357609 0.8903157
## F1 Prevalence Detection Rate
## 0.8621762 0.5121951 0.4560153
## Detection Prevalence Balanced Accuracy
## 0.5456289 0.8533039
c(ctable_balanced$overall[1],
ctable_balanced$byClass[c(1:4, 7, 11)])
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.8542066 0.8903157 0.8162921 0.8357609
## Neg Pred Value F1 Balanced Accuracy
## 0.8763571 0.8621762 0.8533039
summary_binary_balanced <- function(predicted_probs,
real,
cutoff = 0.5,
level_positive = "Yes",
level_negative = "No") {
ctable_balanced <- confusionMatrix(as.factor(ifelse(predicted_probs > cutoff,
level_positive,
level_negative)),
real,
level_positive)
stats <- round(c(ctable_balanced$overall[1],
ctable_balanced$byClass[c(1:4, 7, 11)]),
5)
return(stats)
}
summary_binary(predicted_probs = drugs_logit_balanced1_fitted1,
real = train_balanced$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.85421 0.89032 0.81629 0.83576
## Neg Pred Value F1 Balanced Accuracy
## 0.87636 0.86218 0.85330
roc.plot(ifelse(train_balanced$consumption_cocaine_last_month == "Yes", 1, 0),
drugs_logit_balanced1_fitted1)
roc.area(ifelse(train_balanced$consumption_cocaine_last_month == "Yes", 1, 0),
drugs_logit_balanced1_fitted1)
## $A
## [1] 0.9234091
##
## $n.total
## [1] 3649
##
## $n.events
## [1] 1869
##
## $n.noevents
## [1] 1780
##
## $p.value
## [1] 0
tabyl(train_balanced$consumption_cocaine_last_month)
## train_balanced$consumption_cocaine_last_month n percent
## No 1780 0.4878049
## Yes 1869 0.5121951
tabyl(drugs_my_test$consumption_cocaine_last_month)
## drugs_my_test$consumption_cocaine_last_month n percent
## No 411 0.91536748
## Yes 38 0.08463252
ctrl_nocv <- trainControl(method = "none")
#drugs_logit_balanced2
set.seed(77777)
drugs_logit_balanced2 <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
# model type
method = "glm",
# family of models
family = "binomial",
# train control
trControl = ctrl_nocv)
#summary(drugs_logit_balanced2)
# prediction
drugs_logit_balanced2_fitted2 <- predict(drugs_logit_balanced2,
train_balanced)
drugs_logit_balanced2_fitted2 <- predict(drugs_logit_balanced2,
train_balanced,
type = "prob")
head(drugs_logit_balanced2_fitted2)
## No Yes
## 1 0.8831778 0.1168222281481750
## 2 0.5842738 0.4157261815679076
## 3 1.0000000 0.0000000001631131
## 4 1.0000000 0.0000000246541759
## 5 0.8673944 0.1326055810786278
## 6 0.9935248 0.0064752428665410
# cross validation
ctrl_cv10 <- trainControl(method = "cv",
number = 10)
set.seed(77777)
drugs_logit2_train2_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "glm",
family = "binomial",
# train control - now WITH cross-validation
trControl = ctrl_cv10)
drugs_logit2_train2_balanced
## Generalized Linear Model
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3285, 3284, 3284, 3284, 3284, 3284, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8396733 0.6786115
# TEST
# let's see the forecast error on the TEST sample
set.seed(77777)
drugs_logit2_balanced2_forecasts2 <- predict(drugs_logit2_train2_balanced,
drugs_my_test,
type = "prob")
confusionMatrix(data = as.factor(ifelse(drugs_logit2_balanced2_forecasts2["Yes"] > 0.5,
"Yes",
"No")),
reference = drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 330 16
## Yes 81 22
##
## Accuracy : 0.784
## 95% CI : (0.743, 0.8212)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.215
##
## Mcnemar's Test P-Value : 0.00000000008128
##
## Sensitivity : 0.57895
## Specificity : 0.80292
## Pos Pred Value : 0.21359
## Neg Pred Value : 0.95376
## Prevalence : 0.08463
## Detection Rate : 0.04900
## Detection Prevalence : 0.22940
## Balanced Accuracy : 0.69093
##
## 'Positive' Class : Yes
##
#----------
drugs_logit2_train2_balanced$resample
## Accuracy Kappa Resample
## 1 0.8076923 0.6139862 Fold01
## 2 0.8520548 0.7032878 Fold02
## 3 0.8438356 0.6870158 Fold03
## 4 0.8410959 0.6818264 Fold04
## 5 0.8465753 0.6919646 Fold05
## 6 0.8602740 0.7201131 Fold06
## 7 0.8630137 0.7251175 Fold07
## 8 0.8356164 0.6704091 Fold08
## 9 0.8164384 0.6326037 Fold09
## 10 0.8301370 0.6597913 Fold10
summary(drugs_logit2_train2_balanced$resample$Accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8077 0.8315 0.8425 0.8397 0.8507 0.8630
ctrl_cv10a <- trainControl(method = "cv",
number = 10,
# we enable calculating probabilities
# of individual levels in all validation steps
classProbs = TRUE,
# we change the function for summary measures
summaryFunction = twoClassSummary)
set.seed(77777)
drugs_logit2_train3_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "glm",
family = "binomial",
metric = "ROC",
# new training control
trControl = ctrl_cv10a)
drugs_logit2_train3_balanced
## Generalized Linear Model
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3285, 3284, 3284, 3284, 3284, 3284, ...
## Resampling results:
##
## ROC Sens Spec
## 0.9157245 0.7994382 0.8780116
# TEST
drugs_logit2_balanced2_forecasts3 <- predict(drugs_logit2_train3_balanced,
drugs_my_test,
type = "prob")
confusionMatrix(data = as.factor(ifelse(drugs_logit2_balanced2_forecasts3["Yes"] > 0.5,
"Yes",
"No")),
reference = drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 330 16
## Yes 81 22
##
## Accuracy : 0.784
## 95% CI : (0.743, 0.8212)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.215
##
## Mcnemar's Test P-Value : 0.00000000008128
##
## Sensitivity : 0.57895
## Specificity : 0.80292
## Pos Pred Value : 0.21359
## Neg Pred Value : 0.95376
## Prevalence : 0.08463
## Detection Rate : 0.04900
## Detection Prevalence : 0.22940
## Balanced Accuracy : 0.69093
##
## 'Positive' Class : Yes
##
# 5x3 CV
ctrl_cv5x3 <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3)
set.seed(77777)
drugs_logit2_train4_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "glm",
family = "binomial",
# and the new training control
trControl = ctrl_cv5x3)
drugs_logit2_train4_balanced
## Generalized Linear Model
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919, 2920, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8380373 0.6753288
# TEST
drugs_logit2_balanced2_forecasts4 <- predict(drugs_logit2_train4_balanced,
drugs_my_test,
type = "prob")
confusionMatrix(data = as.factor(ifelse(drugs_logit2_balanced2_forecasts4["Yes"] > 0.5,
"Yes",
"No")),
reference = drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 330 16
## Yes 81 22
##
## Accuracy : 0.784
## 95% CI : (0.743, 0.8212)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.215
##
## Mcnemar's Test P-Value : 0.00000000008128
##
## Sensitivity : 0.57895
## Specificity : 0.80292
## Pos Pred Value : 0.21359
## Neg Pred Value : 0.95376
## Prevalence : 0.08463
## Detection Rate : 0.04900
## Detection Prevalence : 0.22940
## Balanced Accuracy : 0.69093
##
## 'Positive' Class : Yes
##
# 5x3 CVa
ctrl_cv5x3a <- trainControl(method = "repeatedcv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
repeats = 3)
set.seed(77777)
drugs_logit2_train5_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "glm",
family = "binomial",
metric = "ROC",
# and the new training control
trControl = ctrl_cv5x3a)
# note: this prints the unbalanced 5x3 CV model again for reference;
# the balanced model's test-set performance is evaluated below
drugs_logit2_train5
## Generalized Linear Model
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results:
##
## ROC Sens Spec
## 0.7681566 0.9563472 0.1788671
# TEST
drugs_logit2_balanced2_forecasts5 <- predict(drugs_logit2_train5_balanced,
drugs_my_test,
type = "prob")
confusionMatrix(data = as.factor(ifelse(drugs_logit2_balanced2_forecasts5["Yes"] > 0.5,
"Yes",
"No")),
reference = drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 330 16
## Yes 81 22
##
## Accuracy : 0.784
## 95% CI : (0.743, 0.8212)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.215
##
## Mcnemar's Test P-Value : 0.00000000008128
##
## Sensitivity : 0.57895
## Specificity : 0.80292
## Pos Pred Value : 0.21359
## Neg Pred Value : 0.95376
## Prevalence : 0.08463
## Detection Rate : 0.04900
## Detection Prevalence : 0.22940
## Balanced Accuracy : 0.69093
##
## 'Positive' Class : Yes
##
#summary(drugs_logit2_train5_balanced)
drugs_my_test_forecasts_balanced_logit <-
data.frame(
drugs_logit2_train = predict(drugs_logit2_train,
drugs_my_test),
drugs_logit2_train2 = predict(drugs_logit2_train2,
drugs_my_test),
drugs_logit2_train3 = predict(drugs_logit2_train3,
drugs_my_test),
drugs_logit2_train4 = predict(drugs_logit2_train4,
drugs_my_test),
drugs_logit2_train5 = predict(drugs_logit2_train5,
drugs_my_test),
drugs_logit_balanced2 = predict(drugs_logit_balanced2,
drugs_my_test),
drugs_logit2_train2_balanced = predict(drugs_logit2_train2_balanced,
drugs_my_test),
drugs_logit2_train3_balanced = predict(drugs_logit2_train3_balanced,
drugs_my_test),
drugs_logit2_train4_balanced = predict(drugs_logit2_train4_balanced,
drugs_my_test),
drugs_logit2_train5_balanced = predict(drugs_logit2_train5_balanced,
drugs_my_test)
)
head(drugs_my_test_forecasts_balanced_logit)
## drugs_logit2_train drugs_logit2_train2 drugs_logit2_train3
## 1 No No No
## 2 No No No
## 3 Yes Yes Yes
## 4 Yes Yes Yes
## 5 No No No
## 6 No No No
## drugs_logit2_train4 drugs_logit2_train5 drugs_logit_balanced2
## 1 No No No
## 2 No No No
## 3 Yes Yes Yes
## 4 Yes Yes Yes
## 5 No No Yes
## 6 No No Yes
## drugs_logit2_train2_balanced drugs_logit2_train3_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes Yes
## 6 Yes Yes
## drugs_logit2_train4_balanced drugs_logit2_train5_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes Yes
## 6 Yes Yes
#View(drugs_my_test_forecasts_balanced_logit)
# and let's apply the summary_binary_class()
# function to each of the columns with sapply
source("F_summary_binary_class.R")
sapply(drugs_my_test_forecasts_balanced_logit,
function(x) summary_binary_class(predicted_classes = x,
real = drugs_my_test$consumption_cocaine_last_month)) %>%
# transpose the results to have statistics in columns
# for easier comparison
t()
## Accuracy Sensitivity Specificity Pos Pred Value
## drugs_logit2_train 0.89532 0.05263 0.97324 0.15385
## drugs_logit2_train2 0.89532 0.05263 0.97324 0.15385
## drugs_logit2_train3 0.89532 0.05263 0.97324 0.15385
## drugs_logit2_train4 0.89532 0.05263 0.97324 0.15385
## drugs_logit2_train5 0.89532 0.05263 0.97324 0.15385
## drugs_logit_balanced2 0.78396 0.57895 0.80292 0.21359
## drugs_logit2_train2_balanced 0.78396 0.57895 0.80292 0.21359
## drugs_logit2_train3_balanced 0.78396 0.57895 0.80292 0.21359
## drugs_logit2_train4_balanced 0.78396 0.57895 0.80292 0.21359
## drugs_logit2_train5_balanced 0.78396 0.57895 0.80292 0.21359
## Neg Pred Value F1 Balanced Accuracy
## drugs_logit2_train 0.91743 0.07843 0.51293
## drugs_logit2_train2 0.91743 0.07843 0.51293
## drugs_logit2_train3 0.91743 0.07843 0.51293
## drugs_logit2_train4 0.91743 0.07843 0.51293
## drugs_logit2_train5 0.91743 0.07843 0.51293
## drugs_logit_balanced2 0.95376 0.31206 0.69093
## drugs_logit2_train2_balanced 0.95376 0.31206 0.69093
## drugs_logit2_train3_balanced 0.95376 0.31206 0.69093
## drugs_logit2_train4_balanced 0.95376 0.31206 0.69093
## drugs_logit2_train5_balanced 0.95376 0.31206 0.69093
I have chosen the drugs_logit2_train5_balanced model as the best one: its balanced accuracy equals that of the other balanced models, but it uses the most thorough cross-validation scheme (5-fold CV repeated 3 times). Note that its summary below contains several very large coefficients paired with huge standard errors (e.g. the alcohol and chocolate dummies), a typical symptom of quasi-complete separation for sparse factor levels, so those p-values should be read with caution. Let’s inspect the summary and then predict the “true” test values
summary(drugs_logit2_train5_balanced)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.14775 -0.37509 0.08391 0.56318 2.55106
##
## Coefficients:
## Estimate
## (Intercept) -37.614730
## `age25-34` 0.241677
## `age35-44` -0.437101
## `age45-54` -1.889068
## `age55-64` 0.744122
## `age65+` -14.066560
## gendermale -0.198283
## `educationLeft school at 16 years` -0.024668
## `educationLeft school at 17 years` -0.639289
## `educationLeft school at 18 years` -0.778369
## `educationLeft school before 16 years` -17.432335
## `educationMasters degree` -0.256845
## `educationProfessional certificate/ diploma` 0.272043
## `educationSome college or university, no certificate or degree` -0.549238
## `educationUniversity degree` -0.403273
## countryCanada -17.560343
## countryIreland -17.191089
## `countryNew Zealand` 0.510793
## countryOther -0.629489
## countryUK 1.104946
## countryUSA -0.237527
## ethnicityBlack -1.206872
## `ethnicityMixed-Black/Asian` -2.631905
## `ethnicityMixed-White/Asian` -0.485873
## `ethnicityMixed-White/Black` -0.721205
## ethnicityOther -1.832894
## ethnicityWhite -22.899911
## personality_neuroticism 0.032606
## personality_extraversion 0.039787
## personality_openness -0.025441
## personality_agreeableness -0.014062
## personality_conscientiousness -0.015726
## personality_impulsiveness 0.001109
## personality_sensation 0.012192
## `consumption_alcoholused in last day` 18.628331
## `consumption_alcoholused in last decade` 17.349230
## `consumption_alcoholused in last month` 18.466127
## `consumption_alcoholused in last week` 18.778757
## `consumption_alcoholused in last year` 18.378253
## `consumption_alcoholused over a decade ago` 2.671865
## `consumption_amphetaminesused in last day` 1.203393
## `consumption_amphetaminesused in last decade` 0.501242
## `consumption_amphetaminesused in last month` 1.574163
## `consumption_amphetaminesused in last week` 1.930688
## `consumption_amphetaminesused in last year` 1.018548
## `consumption_amphetaminesused over a decade ago` 0.285439
## `consumption_caffeineused in last day` -1.468155
## `consumption_caffeineused in last decade` -0.333180
## `consumption_caffeineused in last month` -0.820009
## `consumption_caffeineused in last week` -1.058324
## `consumption_caffeineused in last year` -20.055021
## `consumption_caffeineused over a decade ago` -16.542650
## `consumption_cannabisused in last day` 0.438330
## `consumption_cannabisused in last decade` 0.402835
## `consumption_cannabisused in last month` 0.314303
## `consumption_cannabisused in last week` 1.265524
## `consumption_cannabisused in last year` 0.947356
## `consumption_cannabisused over a decade ago` -15.572733
## `consumption_chocolateused in last day` 19.748120
## `consumption_chocolateused in last decade` 1.834968
## `consumption_chocolateused in last month` 20.399245
## `consumption_chocolateused in last week` 19.698788
## `consumption_chocolateused in last year` 20.017342
## `consumption_chocolateused over a decade ago` 39.332130
## `consumption_mushroomsused in last day` 3.136166
## `consumption_mushroomsused in last decade` 0.863555
## `consumption_mushroomsused in last month` 1.498854
## `consumption_mushroomsused in last week` 0.547453
## `consumption_mushroomsused in last year` 0.926488
## `consumption_mushroomsused over a decade ago` -1.927140
## `consumption_nicotineused in last day` 0.282236
## `consumption_nicotineused in last decade` -0.540618
## `consumption_nicotineused in last month` 0.016831
## `consumption_nicotineused in last week` 0.254224
## `consumption_nicotineused in last year` -2.706797
## `consumption_nicotineused over a decade ago` -0.253583
## Std. Error
## (Intercept) 1341.091793
## `age25-34` 0.123807
## `age35-44` 0.174146
## `age45-54` 0.312797
## `age55-64` 0.396726
## `age65+` 1447.907205
## gendermale 0.109639
## `educationLeft school at 16 years` 0.411946
## `educationLeft school at 17 years` 0.437377
## `educationLeft school at 18 years` 0.372905
## `educationLeft school before 16 years` 1115.490632
## `educationMasters degree` 0.295901
## `educationProfessional certificate/ diploma` 0.302409
## `educationSome college or university, no certificate or degree` 0.282966
## `educationUniversity degree` 0.291114
## countryCanada 1968.464271
## countryIreland 1271.015131
## `countryNew Zealand` 0.204474
## countryOther 0.267520
## countryUK 0.232742
## countryUSA 0.128073
## ethnicityBlack 0.717344
## `ethnicityMixed-Black/Asian` 0.364660
## `ethnicityMixed-White/Asian` 0.503709
## `ethnicityMixed-White/Black` 0.408734
## ethnicityOther 0.573595
## ethnicityWhite 3765.847186
## personality_neuroticism 0.005040
## personality_extraversion 0.004568
## personality_openness 0.004336
## personality_agreeableness 0.004244
## personality_conscientiousness 0.005231
## personality_impulsiveness 0.004922
## personality_sensation 0.003386
## `consumption_alcoholused in last day` 945.650914
## `consumption_alcoholused in last decade` 945.650995
## `consumption_alcoholused in last month` 945.650918
## `consumption_alcoholused in last week` 945.650911
## `consumption_alcoholused in last year` 945.650927
## `consumption_alcoholused over a decade ago` 1276.983107
## `consumption_amphetaminesused in last day` 0.189866
## `consumption_amphetaminesused in last decade` 0.158706
## `consumption_amphetaminesused in last month` 0.232139
## `consumption_amphetaminesused in last week` 0.219670
## `consumption_amphetaminesused in last year` 0.165966
## `consumption_amphetaminesused over a decade ago` 0.259809
## `consumption_caffeineused in last day` 0.525274
## `consumption_caffeineused in last decade` 0.701784
## `consumption_caffeineused in last month` 0.549249
## `consumption_caffeineused in last week` 0.536378
## `consumption_caffeineused in last year` 619.493606
## `consumption_caffeineused over a decade ago` 1305.976077
## `consumption_cannabisused in last day` 0.203153
## `consumption_cannabisused in last decade` 0.221557
## `consumption_cannabisused in last month` 0.255325
## `consumption_cannabisused in last week` 0.224559
## `consumption_cannabisused in last year` 0.225867
## `consumption_cannabisused over a decade ago` 351.726651
## `consumption_chocolateused in last day` 950.931513
## `consumption_chocolateused in last decade` 2100.778802
## `consumption_chocolateused in last month` 950.931520
## `consumption_chocolateused in last week` 950.931515
## `consumption_chocolateused in last year` 950.931570
## `consumption_chocolateused over a decade ago` 2749.031232
## `consumption_mushroomsused in last day` 1.056964
## `consumption_mushroomsused in last decade` 0.152582
## `consumption_mushroomsused in last month` 0.183597
## `consumption_mushroomsused in last week` 0.270585
## `consumption_mushroomsused in last year` 0.148769
## `consumption_mushroomsused over a decade ago` 0.350111
## `consumption_nicotineused in last day` 0.171001
## `consumption_nicotineused in last decade` 0.234817
## `consumption_nicotineused in last month` 0.227817
## `consumption_nicotineused in last week` 0.205562
## `consumption_nicotineused in last year` 0.357388
## `consumption_nicotineused over a decade ago` 0.314303
## z value
## (Intercept) -0.028
## `age25-34` 1.952
## `age35-44` -2.510
## `age45-54` -6.039
## `age55-64` 1.876
## `age65+` -0.010
## gendermale -1.809
## `educationLeft school at 16 years` -0.060
## `educationLeft school at 17 years` -1.462
## `educationLeft school at 18 years` -2.087
## `educationLeft school before 16 years` -0.016
## `educationMasters degree` -0.868
## `educationProfessional certificate/ diploma` 0.900
## `educationSome college or university, no certificate or degree` -1.941
## `educationUniversity degree` -1.385
## countryCanada -0.009
## countryIreland -0.014
## `countryNew Zealand` 2.498
## countryOther -2.353
## countryUK 4.748
## countryUSA -1.855
## ethnicityBlack -1.682
## `ethnicityMixed-Black/Asian` -7.217
## `ethnicityMixed-White/Asian` -0.965
## `ethnicityMixed-White/Black` -1.764
## ethnicityOther -3.195
## ethnicityWhite -0.006
## personality_neuroticism 6.470
## personality_extraversion 8.710
## personality_openness -5.868
## personality_agreeableness -3.314
## personality_conscientiousness -3.006
## personality_impulsiveness 0.225
## personality_sensation 3.601
## `consumption_alcoholused in last day` 0.020
## `consumption_alcoholused in last decade` 0.018
## `consumption_alcoholused in last month` 0.020
## `consumption_alcoholused in last week` 0.020
## `consumption_alcoholused in last year` 0.019
## `consumption_alcoholused over a decade ago` 0.002
## `consumption_amphetaminesused in last day` 6.338
## `consumption_amphetaminesused in last decade` 3.158
## `consumption_amphetaminesused in last month` 6.781
## `consumption_amphetaminesused in last week` 8.789
## `consumption_amphetaminesused in last year` 6.137
## `consumption_amphetaminesused over a decade ago` 1.099
## `consumption_caffeineused in last day` -2.795
## `consumption_caffeineused in last decade` -0.475
## `consumption_caffeineused in last month` -1.493
## `consumption_caffeineused in last week` -1.973
## `consumption_caffeineused in last year` -0.032
## `consumption_caffeineused over a decade ago` -0.013
## `consumption_cannabisused in last day` 2.158
## `consumption_cannabisused in last decade` 1.818
## `consumption_cannabisused in last month` 1.231
## `consumption_cannabisused in last week` 5.636
## `consumption_cannabisused in last year` 4.194
## `consumption_cannabisused over a decade ago` -0.044
## `consumption_chocolateused in last day` 0.021
## `consumption_chocolateused in last decade` 0.001
## `consumption_chocolateused in last month` 0.021
## `consumption_chocolateused in last week` 0.021
## `consumption_chocolateused in last year` 0.021
## `consumption_chocolateused over a decade ago` 0.014
## `consumption_mushroomsused in last day` 2.967
## `consumption_mushroomsused in last decade` 5.660
## `consumption_mushroomsused in last month` 8.164
## `consumption_mushroomsused in last week` 2.023
## `consumption_mushroomsused in last year` 6.228
## `consumption_mushroomsused over a decade ago` -5.504
## `consumption_nicotineused in last day` 1.650
## `consumption_nicotineused in last decade` -2.302
## `consumption_nicotineused in last month` 0.074
## `consumption_nicotineused in last week` 1.237
## `consumption_nicotineused in last year` -7.574
## `consumption_nicotineused over a decade ago` -0.807
## Pr(>|z|)
## (Intercept) 0.977624
## `age25-34` 0.050932
## `age35-44` 0.012074
## `age45-54` 0.000000001548037705
## `age55-64` 0.060702
## `age65+` 0.992249
## gendermale 0.070528
## `educationLeft school at 16 years` 0.952251
## `educationLeft school at 17 years` 0.143839
## `educationLeft school at 18 years` 0.036860
## `educationLeft school before 16 years` 0.987532
## `educationMasters degree` 0.385389
## `educationProfessional certificate/ diploma` 0.368341
## `educationSome college or university, no certificate or degree` 0.052257
## `educationUniversity degree` 0.165967
## countryCanada 0.992882
## countryIreland 0.989209
## `countryNew Zealand` 0.012487
## countryOther 0.018620
## countryUK 0.000002059330222405
## countryUSA 0.063650
## ethnicityBlack 0.092488
## `ethnicityMixed-Black/Asian` 0.000000000000529823
## `ethnicityMixed-White/Asian` 0.334750
## `ethnicityMixed-White/Black` 0.077650
## ethnicityOther 0.001396
## ethnicityWhite 0.995148
## personality_neuroticism 0.000000000098154652
## personality_extraversion < 0.0000000000000002
## personality_openness 0.000000004422215731
## personality_agreeableness 0.000921
## personality_conscientiousness 0.002644
## personality_impulsiveness 0.821687
## personality_sensation 0.000317
## `consumption_alcoholused in last day` 0.984284
## `consumption_alcoholused in last decade` 0.985363
## `consumption_alcoholused in last month` 0.984420
## `consumption_alcoholused in last week` 0.984157
## `consumption_alcoholused in last year` 0.984494
## `consumption_alcoholused over a decade ago` 0.998331
## `consumption_amphetaminesused in last day` 0.000000000232605068
## `consumption_amphetaminesused in last decade` 0.001587
## `consumption_amphetaminesused in last month` 0.000000000011925115
## `consumption_amphetaminesused in last week` < 0.0000000000000002
## `consumption_amphetaminesused in last year` 0.000000000840499628
## `consumption_amphetaminesused over a decade ago` 0.271921
## `consumption_caffeineused in last day` 0.005190
## `consumption_caffeineused in last decade` 0.634957
## `consumption_caffeineused in last month` 0.135447
## `consumption_caffeineused in last week` 0.048485
## `consumption_caffeineused in last year` 0.974174
## `consumption_caffeineused over a decade ago` 0.989894
## `consumption_cannabisused in last day` 0.030956
## `consumption_cannabisused in last decade` 0.069033
## `consumption_cannabisused in last month` 0.218325
## `consumption_cannabisused in last week` 0.000000017444085214
## `consumption_cannabisused in last year` 0.000027370299605261
## `consumption_cannabisused over a decade ago` 0.964685
## `consumption_chocolateused in last day` 0.983431
## `consumption_chocolateused in last decade` 0.999303
## `consumption_chocolateused in last month` 0.982885
## `consumption_chocolateused in last week` 0.983473
## `consumption_chocolateused in last year` 0.983206
## `consumption_chocolateused over a decade ago` 0.988585
## `consumption_mushroomsused in last day` 0.003006
## `consumption_mushroomsused in last decade` 0.000000015170214722
## `consumption_mushroomsused in last month` 0.000000000000000325
## `consumption_mushroomsused in last week` 0.043050
## `consumption_mushroomsused in last year` 0.000000000473326200
## `consumption_mushroomsused over a decade ago` 0.000000037049797314
## `consumption_nicotineused in last day` 0.098842
## `consumption_nicotineused in last decade` 0.021319
## `consumption_nicotineused in last month` 0.941106
## `consumption_nicotineused in last week` 0.216189
## `consumption_nicotineused in last year` 0.000000000000036238
## `consumption_nicotineused over a decade ago` 0.419775
##
## (Intercept)
## `age25-34` .
## `age35-44` *
## `age45-54` ***
## `age55-64` .
## `age65+`
## gendermale .
## `educationLeft school at 16 years`
## `educationLeft school at 17 years`
## `educationLeft school at 18 years` *
## `educationLeft school before 16 years`
## `educationMasters degree`
## `educationProfessional certificate/ diploma`
## `educationSome college or university, no certificate or degree` .
## `educationUniversity degree`
## countryCanada
## countryIreland
## `countryNew Zealand` *
## countryOther *
## countryUK ***
## countryUSA .
## ethnicityBlack .
## `ethnicityMixed-Black/Asian` ***
## `ethnicityMixed-White/Asian`
## `ethnicityMixed-White/Black` .
## ethnicityOther **
## ethnicityWhite
## personality_neuroticism ***
## personality_extraversion ***
## personality_openness ***
## personality_agreeableness ***
## personality_conscientiousness **
## personality_impulsiveness
## personality_sensation ***
## `consumption_alcoholused in last day`
## `consumption_alcoholused in last decade`
## `consumption_alcoholused in last month`
## `consumption_alcoholused in last week`
## `consumption_alcoholused in last year`
## `consumption_alcoholused over a decade ago`
## `consumption_amphetaminesused in last day` ***
## `consumption_amphetaminesused in last decade` **
## `consumption_amphetaminesused in last month` ***
## `consumption_amphetaminesused in last week` ***
## `consumption_amphetaminesused in last year` ***
## `consumption_amphetaminesused over a decade ago`
## `consumption_caffeineused in last day` **
## `consumption_caffeineused in last decade`
## `consumption_caffeineused in last month`
## `consumption_caffeineused in last week` *
## `consumption_caffeineused in last year`
## `consumption_caffeineused over a decade ago`
## `consumption_cannabisused in last day` *
## `consumption_cannabisused in last decade` .
## `consumption_cannabisused in last month`
## `consumption_cannabisused in last week` ***
## `consumption_cannabisused in last year` ***
## `consumption_cannabisused over a decade ago`
## `consumption_chocolateused in last day`
## `consumption_chocolateused in last decade`
## `consumption_chocolateused in last month`
## `consumption_chocolateused in last week`
## `consumption_chocolateused in last year`
## `consumption_chocolateused over a decade ago`
## `consumption_mushroomsused in last day` **
## `consumption_mushroomsused in last decade` ***
## `consumption_mushroomsused in last month` ***
## `consumption_mushroomsused in last week` *
## `consumption_mushroomsused in last year` ***
## `consumption_mushroomsused over a decade ago` ***
## `consumption_nicotineused in last day` .
## `consumption_nicotineused in last decade` *
## `consumption_nicotineused in last month`
## `consumption_nicotineused in last week`
## `consumption_nicotineused in last year` ***
## `consumption_nicotineused over a decade ago`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5056.4 on 3648 degrees of freedom
## Residual deviance: 2468.5 on 3573 degrees of freedom
## AIC: 2620.5
##
## Number of Fisher Scoring iterations: 17
drugs_true_test <- read_csv("drugs_test.csv")
## Rows: 385 Columns: 20
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (13): id, age, gender, education, country, ethnicity, consumption_alcoho...
## dbl (7): personality_neuroticism, personality_extraversion, personality_ope...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
drugs_true_test_drugs_logit2_train5_balanced = predict(drugs_logit2_train5_balanced,
drugs_true_test)
head(drugs_true_test_drugs_logit2_train5_balanced)
## [1] Yes No No No No Yes
## Levels: No Yes
table(drugs_true_test_drugs_logit2_train5_balanced)
## drugs_true_test_drugs_logit2_train5_balanced
## No Yes
## 299 86
#looks promising
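If these predictions were to be exported for submission, readr’s write_csv() could be used - a minimal sketch (the file name is illustrative):
# write_csv(data.frame(id = drugs_true_test$id,
#                      consumption_cocaine_last_month = drugs_true_test_drugs_logit2_train5_balanced),
#           "logit_balanced_predictions.csv")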
Oversampling with the SMOTE function increased the balanced accuracy (BA) considerably: the imbalanced logit and probit models reached only about 51% BA, while the balanced models reach about 69% - a clear improvement. Let’s see how other classification methods perform on the same data sets.
library(readr)
library(class)
require(caret)
require(dplyr)
library(AER)
library(lmtest)
library(nnet)
library(verification)
library(janitor)
# load also the functions which we created
# in lab 5 (stored in separate files
# in the folder called functions)
source("F_summary_binary.R")
# 2nd variant if only classes predicted,
# not probabilities - check its definition!
source("F_summary_binary_class.R")
First, let’s set the train control method to “none” and fit a baseline kNN model
ctrl_nocv <- trainControl(method = "none")
# and run the train() function
set.seed(77777)
drugs_my_train_knn5 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
# model type - now knn!!
method = "knn",
# train control
trControl = ctrl_nocv)
drugs_my_train_knn5
## k-Nearest Neighbors
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
# the default parameter value in this case is 5
# it is saved in the "finalModel" element
# results of train()
drugs_my_train_knn5$finalModel
## 5-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 962 89
drugs_my_train_knn5$finalModel$k
## [1] 5
# another k ---------------------
sqrt(nrow(drugs_my_train))
## [1] 32.41913
# let's use 33 - the first odd number larger than
# the square root of n
k_value <- data.frame(k = 33)
set.seed(77777)
drugs_my_train_knn33 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
# model type - now knn!!
method = "knn",
# train control
trControl = ctrl_nocv,
# we give the parameter(s)
# required by the model
tuneGrid = k_value)
drugs_my_train_knn33$finalModel$k
## [1] 33
# let's calculate fitted values - prediction on the training sample
drugs_my_train_knn33_fitted <- predict(drugs_my_train_knn33,
drugs_my_train)
# it may take a while - prediction might be
# time-consuming for knn!
# Let's see the frequencies of fitted values
table(drugs_my_train_knn33_fitted)
## drugs_my_train_knn33_fitted
## No Yes
## 1051 0
# check different accuracy measures
summary_binary_class(predicted_classes = drugs_my_train_knn33_fitted,
real = drugs_my_train$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.91532 0.00000 1.00000 NaN
## Neg Pred Value F1 Balanced Accuracy
## 0.91532 NA 0.50000
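# Pos Pred Value is NaN because not a single observation is
# predicted as 'Yes', so TP/(TP + FP) = 0/0; F1 is undefined
# for the same reason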
# level_positive = "Yes" by default
# let's compare prediction results in the test sample
drugs_my_train_knn33_forecasts <- predict(drugs_my_train_knn33,
drugs_my_test)
table(drugs_my_train_knn33_forecasts)
## drugs_my_train_knn33_forecasts
## No Yes
## 449 0
# check different measures
summary_binary_class(predicted_classes = drugs_my_train_knn33_forecasts,
real = drugs_my_test$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.91537 0.00000 1.00000 NaN
## Neg Pred Value F1 Balanced Accuracy
## 0.91537 NA 0.50000
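With such strong imbalance, the default 0.5 cut-off yields only “No” predictions. A sketch of recovering some sensitivity by lowering the cut-off on predicted probabilities (the 0.2 threshold is an arbitrary illustrative value):
knn33_probs <- predict(drugs_my_train_knn33,
                       drugs_my_test,
                       type = "prob")
table(ifelse(knn33_probs[, "Yes"] > 0.2, "Yes", "No"))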
Let’s also check the predictions of the default k = 5 model
drugs_my_train_knn5_fitted <- predict(drugs_my_train_knn5,
drugs_my_train)
# check different accuracy measures
summary_binary_class(predicted_classes = drugs_my_train_knn5_fitted,
real = drugs_my_train$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.91722 0.06742 0.99584 0.60000
## Neg Pred Value F1 Balanced Accuracy
## 0.92027 0.12121 0.53163
# lets compare prediction results in the test sample
drugs_my_train_knn5_forecasts <- predict(drugs_my_train_knn5,
drugs_my_test)
summary_binary_class(predicted_classes = drugs_my_train_knn5_forecasts,
real = drugs_my_test$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.90423 0.02632 0.98540 0.14286
## Neg Pred Value F1 Balanced Accuracy
## 0.91629 0.04444 0.50586
Let’s create a data frame of candidate values of k and tune the model over them
different_k <- data.frame(k = seq(1, 99, 4))
# define the training control -
# use 5-fold cross validation
ctrl_cv5 <- trainControl(method = "cv",
number = 5)
set.seed(77777)
drugs_my_train_knn_tuned <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "knn",
# validation used!
trControl = ctrl_cv5,
# parameters to be compared
tuneGrid = different_k)
# now validation is applied to EVERY
# SINGLE value of k, which takes a few seconds
# let's check the results
drugs_my_train_knn_tuned
## k-Nearest Neighbors
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 841, 841, 840, 841, 841
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8563078 0.037138919
## 5 0.9086617 0.022829645
## 9 0.9134146 -0.003641882
## 13 0.9153193 0.000000000
## 17 0.9153193 0.000000000
## 21 0.9153193 0.000000000
## 25 0.9153193 0.000000000
## 29 0.9153193 0.000000000
## 33 0.9153193 0.000000000
## 37 0.9153193 0.000000000
## 41 0.9153193 0.000000000
## 45 0.9153193 0.000000000
## 49 0.9153193 0.000000000
## 53 0.9153193 0.000000000
## 57 0.9153193 0.000000000
## 61 0.9153193 0.000000000
## 65 0.9153193 0.000000000
## 69 0.9153193 0.000000000
## 73 0.9153193 0.000000000
## 77 0.9153193 0.000000000
## 81 0.9153193 0.000000000
## 85 0.9153193 0.000000000
## 89 0.9153193 0.000000000
## 93 0.9153193 0.000000000
## 97 0.9153193 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 97.
#------------------------------
set.seed(77777)
drugs_my_train_knn_tuned_scaled <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
# model type - now knn!!
method = "knn",
# validation used!
trControl = ctrl_cv5,
# parameters to be compared
tuneGrid = different_k,
# data transformation
preProcess = c("range"))
drugs_my_train_knn_tuned_scaled
## k-Nearest Neighbors
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 841, 841, 840, 841, 841
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8696141 0.093055336
## 5 0.9086572 0.048567860
## 9 0.9153193 0.016860465
## 13 0.9143670 -0.001820941
## 17 0.9153193 0.000000000
## 21 0.9153193 0.000000000
## 25 0.9153193 0.000000000
## 29 0.9153193 0.000000000
## 33 0.9153193 0.000000000
## 37 0.9153193 0.000000000
## 41 0.9153193 0.000000000
## 45 0.9153193 0.000000000
## 49 0.9153193 0.000000000
## 53 0.9153193 0.000000000
## 57 0.9153193 0.000000000
## 61 0.9153193 0.000000000
## 65 0.9153193 0.000000000
## 69 0.9153193 0.000000000
## 73 0.9153193 0.000000000
## 77 0.9153193 0.000000000
## 81 0.9153193 0.000000000
## 85 0.9153193 0.000000000
## 89 0.9153193 0.000000000
## 93 0.9153193 0.000000000
## 97 0.9153193 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 97.
# accuracy immediately reaches the no-information rate (0.9153)
# and stays flat: with this imbalance the tuned models simply
# predict 'No' for everyone, so accuracy is uninformative here
ctrl_cv5a <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary)
set.seed(77777)
drugs_my_train_knn_tuned_scaled2 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "knn",
trControl = ctrl_cv5a,
# parameters to be compared
tuneGrid = different_k,
# data transformation
preProcess = c("range"),
metric = "ROC")
drugs_my_train_knn_tuned_scaled2
## k-Nearest Neighbors
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 841, 841, 840, 841, 841
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.5411563 0.9365609 0.14575163
## 5 0.6675596 0.9885525 0.04444444
## 9 0.7060753 0.9989583 0.01111111
## 13 0.7290792 0.9989583 0.00000000
## 17 0.7530143 1.0000000 0.00000000
## 21 0.7516826 1.0000000 0.00000000
## 25 0.7543753 1.0000000 0.00000000
## 29 0.7781210 1.0000000 0.00000000
## 33 0.7829416 1.0000000 0.00000000
## 37 0.7830257 1.0000000 0.00000000
## 41 0.7995605 1.0000000 0.00000000
## 45 0.7966958 1.0000000 0.00000000
## 49 0.7926542 1.0000000 0.00000000
## 53 0.8008991 1.0000000 0.00000000
## 57 0.8005916 1.0000000 0.00000000
## 61 0.7976926 1.0000000 0.00000000
## 65 0.7975848 1.0000000 0.00000000
## 69 0.8000026 1.0000000 0.00000000
## 73 0.7979342 1.0000000 0.00000000
## 77 0.7999412 1.0000000 0.00000000
## 81 0.7980627 1.0000000 0.00000000
## 85 0.7977914 1.0000000 0.00000000
## 89 0.7955148 1.0000000 0.00000000
## 93 0.7949000 1.0000000 0.00000000
## 97 0.7941367 1.0000000 0.00000000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 53.
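The tuning profile can also be inspected visually - caret ships a ggplot method for train objects (a one-line sketch):
ggplot(drugs_my_train_knn_tuned_scaled2) +
  theme_bw()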
source("F_own_summary_functions.R")
ctrl_cv5b <- trainControl(method = "cv",
number = 5,
# probabilities of each level
# predicted in cross-validation
classProbs = TRUE,
# and summary function
# that includes F1
summaryFunction = mySummary)
set.seed(77777)
drugs_my_train_knn_tuned_scaled3 <-
train(consumption_cocaine_last_month ~ .,
data = drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "knn",
trControl = ctrl_cv5b,
# parameters to be compared
tuneGrid = different_k,
# data transformation
preProcess = c("range"),
metric = "F1")
drugs_my_train_knn_tuned_scaled3
## k-Nearest Neighbors
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 841, 841, 840, 841, 841
## Resampling results across tuning parameters:
##
## k ROC Accuracy Kappa Sensitivity Specificity
## 1 0.5411563 0.8696141 0.093055336 0.9365609 0.14575163
## 5 0.6675596 0.9086572 0.048567860 0.9885525 0.04444444
## 9 0.7060753 0.9153193 0.016860465 0.9989583 0.01111111
## 13 0.7290792 0.9143670 -0.001820941 0.9989583 0.00000000
## 17 0.7530143 0.9153193 0.000000000 1.0000000 0.00000000
## 21 0.7516826 0.9153193 0.000000000 1.0000000 0.00000000
## 25 0.7543753 0.9153193 0.000000000 1.0000000 0.00000000
## 29 0.7781210 0.9153193 0.000000000 1.0000000 0.00000000
## 33 0.7829416 0.9153193 0.000000000 1.0000000 0.00000000
## 37 0.7830257 0.9153193 0.000000000 1.0000000 0.00000000
## 41 0.7995605 0.9153193 0.000000000 1.0000000 0.00000000
## 45 0.7966958 0.9153193 0.000000000 1.0000000 0.00000000
## 49 0.7926542 0.9153193 0.000000000 1.0000000 0.00000000
## 53 0.8008991 0.9153193 0.000000000 1.0000000 0.00000000
## 57 0.8005916 0.9153193 0.000000000 1.0000000 0.00000000
## 61 0.7976926 0.9153193 0.000000000 1.0000000 0.00000000
## 65 0.7975848 0.9153193 0.000000000 1.0000000 0.00000000
## 69 0.8000026 0.9153193 0.000000000 1.0000000 0.00000000
## 73 0.7979342 0.9153193 0.000000000 1.0000000 0.00000000
## 77 0.7999412 0.9153193 0.000000000 1.0000000 0.00000000
## 81 0.7980627 0.9153193 0.000000000 1.0000000 0.00000000
## 85 0.7977914 0.9153193 0.000000000 1.0000000 0.00000000
## 89 0.7955148 0.9153193 0.000000000 1.0000000 0.00000000
## 93 0.7949000 0.9153193 0.000000000 1.0000000 0.00000000
## 97 0.7941367 0.9153193 0.000000000 1.0000000 0.00000000
## Pos Pred Value Neg Pred Value Precision Recall F1
## 0.9221980 0.2003259 0.9221980 0.9365609 0.9291554
## 0.9179701 0.2172619 0.9179701 0.9885525 0.9519275
## 0.9161160 0.5000000 0.9161160 0.9989583 0.9557419
## 0.9152373 0.0000000 0.9152373 0.9989583 0.9552656
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## 0.9153193 NaN 0.9153193 1.0000000 0.9557867
## Balanced Accuracy
## 0.5411563
## 0.5164985
## 0.5050347
## 0.4994792
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
## 0.5000000
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was k = 97.
Let’s compare the forecasts of all the kNN variants on the test sample
drugs_my_test_forecasts <-
data.frame(drugs_my_train_knn5 = predict(drugs_my_train_knn5,
drugs_my_test),
drugs_my_train_knn39 = predict(drugs_my_train_knn33, # column labelled knn39 in the output, but this is the k = 33 model
drugs_my_test),
drugs_my_train_knn_tuned = predict(drugs_my_train_knn_tuned,
drugs_my_test),
drugs_my_train_knn_tuned_scaled = predict(drugs_my_train_knn_tuned_scaled,
drugs_my_test),
drugs_my_train_knn_tuned_scaled2 = predict(drugs_my_train_knn_tuned_scaled2,
drugs_my_test),
drugs_my_train_knn_tuned_scaled3 = predict(drugs_my_train_knn_tuned_scaled3,
drugs_my_test)
)
head(drugs_my_test_forecasts)
## drugs_my_train_knn5 drugs_my_train_knn39 drugs_my_train_knn_tuned
## 1 No No No
## 2 No No No
## 3 Yes No No
## 4 No No No
## 5 No No No
## 6 No No No
## drugs_my_train_knn_tuned_scaled drugs_my_train_knn_tuned_scaled2
## 1 No No
## 2 No No
## 3 No No
## 4 No No
## 5 No No
## 6 No No
## drugs_my_train_knn_tuned_scaled3
## 1 No
## 2 No
## 3 No
## 4 No
## 5 No
## 6 No
sapply(drugs_my_test_forecasts,
function(x) summary_binary_class(predicted_classes = x,
real = drugs_my_test$consumption_cocaine_last_month)) %>%
# transpose the results to have statistics in columns
# for easier comparison
t()
## Accuracy Sensitivity Specificity
## drugs_my_train_knn5 0.90423 0.02632 0.9854
## drugs_my_train_knn39 0.91537 0.00000 1.0000
## drugs_my_train_knn_tuned 0.91537 0.00000 1.0000
## drugs_my_train_knn_tuned_scaled 0.91537 0.00000 1.0000
## drugs_my_train_knn_tuned_scaled2 0.91537 0.00000 1.0000
## drugs_my_train_knn_tuned_scaled3 0.91537 0.00000 1.0000
## Pos Pred Value Neg Pred Value F1
## drugs_my_train_knn5 0.14286 0.91629 0.04444
## drugs_my_train_knn39 NaN 0.91537 NA
## drugs_my_train_knn_tuned NaN 0.91537 NA
## drugs_my_train_knn_tuned_scaled NaN 0.91537 NA
## drugs_my_train_knn_tuned_scaled2 NaN 0.91537 NA
## drugs_my_train_knn_tuned_scaled3 NaN 0.91537 NA
## Balanced Accuracy
## drugs_my_train_knn5 0.50586
## drugs_my_train_knn39 0.50000
## drugs_my_train_knn_tuned 0.50000
## drugs_my_train_knn_tuned_scaled 0.50000
## drugs_my_train_knn_tuned_scaled2 0.50000
## drugs_my_train_knn_tuned_scaled3 0.50000
As in the logit estimations, I will now fit the kNN models on the SMOTE-balanced data (train_balanced).
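As a reminder, train_balanced was created earlier with SMOTE. A minimal sketch of such a call (assuming the DMwR package; the exact parameters used earlier may differ):
# library(DMwR)
# set.seed(77777)
# train_balanced <- SMOTE(consumption_cocaine_last_month ~ .,
#                         data = as.data.frame(drugs_my_train %>% dplyr::select(-id)),
#                         # both percentages are illustrative values
#                         perc.over = 200,
#                         perc.under = 200)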
set.seed(77777)
drugs_my_train_knn5_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "knn",
trControl = ctrl_nocv)
drugs_my_train_knn5_balanced
## k-Nearest Neighbors
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
drugs_my_train_knn5_balanced$finalModel
## 5-nearest neighbor model
## Training set outcome distribution:
##
## No Yes
## 1780 1869
drugs_my_train_knn5_balanced$finalModel$k
## [1] 5
# another k ---------------------
sqrt(nrow(train_balanced))
## [1] 60.40695
# let's use 61 - the first odd number larger than
# the square root of n
k_value_balanced <- data.frame(k = 61)
set.seed(77777)
drugs_my_train_knn61_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "knn",
trControl = ctrl_nocv,
tuneGrid = k_value_balanced)
drugs_my_train_knn61_balanced$finalModel$k
## [1] 61
# let's calculate fitted values - prediction on the training sample
drugs_my_train_knn61_fitted_balanced <- predict(drugs_my_train_knn61_balanced,
train_balanced)
# Let's see the frequencies of fitted values
table(drugs_my_train_knn61_fitted_balanced)
## drugs_my_train_knn61_fitted_balanced
## No Yes
## 1128 2521
# check various accuracy measures
summary_binary_class(predicted_classes = drugs_my_train_knn61_fitted_balanced,
real = train_balanced$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.76048 0.94061 0.57135 0.69734
## Neg Pred Value F1 Balanced Accuracy
## 0.90160 0.80091 0.75598
# let's compare prediction results in the test sample
drugs_my_train_knn61_forecasts_balanced <- predict(drugs_my_train_knn61_balanced,
drugs_my_test)
table(drugs_my_train_knn61_forecasts_balanced)
## drugs_my_train_knn61_forecasts_balanced
## No Yes
## 239 210
summary_binary_class(predicted_classes = drugs_my_train_knn61_forecasts_balanced,
real = drugs_my_test$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.56793 0.71053 0.55474 0.12857
## Neg Pred Value F1 Balanced Accuracy
## 0.95397 0.21774 0.63264
#--------------------------------
drugs_my_train_knn5_fitted_balanced <- predict(drugs_my_train_knn5_balanced,
train_balanced)
summary_binary_class(predicted_classes = drugs_my_train_knn5_fitted_balanced,
real = train_balanced$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.93532 0.96897 0.90000 0.91051
## Neg Pred Value F1 Balanced Accuracy
## 0.96506 0.93883 0.93448
# accuracy measures improved significantly!!!
# let's compare prediction results in the test sample
drugs_my_train_knn5_forecasts_balanced <- predict(drugs_my_train_knn5_balanced,
drugs_my_test)
summary_binary_class(predicted_classes = drugs_my_train_knn5_forecasts_balanced,
real = drugs_my_test$consumption_cocaine_last_month)
## Accuracy Sensitivity Specificity Pos Pred Value
## 0.68597 0.57895 0.69586 0.14966
## Neg Pred Value F1 Balanced Accuracy
## 0.94702 0.23784 0.63741
#-------------------------------
different_k_balanced <- data.frame(k = seq(1, 99, 4))
# define the training control -
# use 5-fold cross validation
ctrl_cv5 <- trainControl(method = "cv",
number = 5)
set.seed(77777)
drugs_my_train_knn_tuned_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
# model type - now knn!!
method = "knn",
# validation used!
trControl = ctrl_cv5,
# parameters to be compared
tuneGrid = different_k_balanced)
# now validation is applied to EVERY
# SINGLE value of k, which takes a few seconds
# let's check the results
drugs_my_train_knn_tuned_balanced
## k-Nearest Neighbors
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9599861 0.9198338
## 5 0.8859970 0.7710440
## 9 0.8536603 0.7055890
## 13 0.8328309 0.6633875
## 17 0.8155676 0.6283827
## 21 0.8035087 0.6039289
## 25 0.7961095 0.5890504
## 29 0.7848691 0.5660772
## 33 0.7761024 0.5482614
## 37 0.7700727 0.5360536
## 41 0.7637706 0.5233645
## 45 0.7615769 0.5189247
## 49 0.7528072 0.5011551
## 53 0.7473289 0.4899339
## 57 0.7429434 0.4809958
## 61 0.7380100 0.4710150
## 65 0.7319830 0.4587752
## 69 0.7281459 0.4508836
## 73 0.7251314 0.4447271
## 77 0.7226653 0.4396636
## 81 0.7223921 0.4390928
## 85 0.7188286 0.4318911
## 89 0.7210207 0.4362742
## 93 0.7204709 0.4351047
## 97 0.7199241 0.4340137
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
#------------------------------
set.seed(77777)
drugs_my_train_knn_tuned_scaled_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "knn",
trControl = ctrl_cv5,
tuneGrid = different_k_balanced,
preProcess = c("range"))
drugs_my_train_knn_tuned_scaled_balanced
## k-Nearest Neighbors
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9125787 0.8252070
## 5 0.8451619 0.6904442
## 9 0.8350193 0.6699822
## 13 0.8394096 0.6786761
## 17 0.8311855 0.6621513
## 21 0.8254336 0.6505970
## 25 0.8273484 0.6544527
## 29 0.8251547 0.6501029
## 33 0.8281699 0.6560386
## 37 0.8243340 0.6484461
## 41 0.8218693 0.6434782
## 45 0.8166635 0.6330322
## 49 0.8155680 0.6307687
## 53 0.8100844 0.6198853
## 57 0.8117290 0.6231597
## 61 0.8073443 0.6143785
## 65 0.8114561 0.6226228
## 69 0.8081662 0.6161317
## 73 0.8081673 0.6160707
## 77 0.8076190 0.6149561
## 81 0.8043283 0.6084044
## 85 0.8007633 0.6013076
## 89 0.7985723 0.5968965
## 93 0.7966567 0.5930853
## 97 0.7952831 0.5902953
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# here, unlike on the imbalanced data, the best accuracy is
# obtained for k = 1 and accuracy decreases with k
# (see the caveat on SMOTE and resampling further below)
ctrl_cv5a <- trainControl(method = "cv",
number = 5,
# probabilities of each level
# predicted in cross-validation
classProbs = TRUE,
# and summary function
# that includes ROC
summaryFunction = twoClassSummary)
set.seed(77777)
drugs_my_train_knn_tuned_scaled2_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "knn",
trControl = ctrl_cv5a,
tuneGrid = different_k_balanced,
preProcess = c("range"),
metric = "ROC")
drugs_my_train_knn_tuned_scaled2_balanced
## k-Nearest Neighbors
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 1 0.9129468 0.9280899 0.8978036
## 5 0.9264344 0.8617978 0.8293128
## 9 0.9179745 0.8393258 0.8309143
## 13 0.9173271 0.8376404 0.8410919
## 17 0.9126358 0.8252809 0.8368023
## 21 0.9120165 0.8174157 0.8330676
## 25 0.9108359 0.8207865 0.8335995
## 29 0.9103890 0.8207865 0.8293114
## 33 0.9115698 0.8179775 0.8378790
## 37 0.9115677 0.8191011 0.8293157
## 41 0.9085246 0.8146067 0.8287809
## 45 0.9060900 0.8078652 0.8250348
## 49 0.9038696 0.8028090 0.8277129
## 53 0.9027160 0.8022472 0.8175395
## 57 0.9016041 0.8028090 0.8202119
## 61 0.9003120 0.7983146 0.8159324
## 65 0.8990699 0.8033708 0.8191467
## 69 0.8978033 0.8050562 0.8111153
## 73 0.8973118 0.8016854 0.8143281
## 77 0.8961138 0.8000000 0.8148686
## 81 0.8950017 0.7983146 0.8100472
## 85 0.8931619 0.7966292 0.8046895
## 89 0.8918578 0.7932584 0.8036229
## 93 0.8901854 0.7926966 0.8004129
## 97 0.8900391 0.7887640 0.8014781
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
ctrl_cv5b <- trainControl(method = "cv",
number = 5,
# probabilities of each level
# predicted in cross-validation
classProbs = TRUE,
# and summary function
# that includes F1
summaryFunction = mySummary)
set.seed(77777)
drugs_my_train_knn_tuned_scaled3_balanced <-
train(consumption_cocaine_last_month ~ .,
data = train_balanced,
method = "knn",
trControl = ctrl_cv5b,
tuneGrid = different_k_balanced,
preProcess = c("range"),
metric = "F1")
drugs_my_train_knn_tuned_scaled3_balanced
## k-Nearest Neighbors
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: re-scaling to [0, 1] (75)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919
## Resampling results across tuning parameters:
##
## k ROC Accuracy Kappa Sensitivity Specificity Pos Pred Value
## 1 0.9129468 0.9125787 0.8252070 0.9280899 0.8978036 0.8966884
## 5 0.9264344 0.8451619 0.6904442 0.8617978 0.8293128 0.8280164
## 9 0.9179745 0.8350193 0.6699822 0.8393258 0.8309143 0.8256993
## 13 0.9173271 0.8394096 0.6786761 0.8376404 0.8410919 0.8345987
## 17 0.9126358 0.8311855 0.6621513 0.8252809 0.8368023 0.8286890
## 21 0.9120165 0.8254336 0.6505970 0.8174157 0.8330676 0.8235378
## 25 0.9108359 0.8273484 0.6544527 0.8207865 0.8335995 0.8245569
## 29 0.9103890 0.8251547 0.6501029 0.8207865 0.8293114 0.8208898
## 33 0.9115698 0.8281699 0.6560386 0.8179775 0.8378790 0.8277818
## 37 0.9115677 0.8243340 0.6484461 0.8191011 0.8293157 0.8205509
## 41 0.9085246 0.8218693 0.6434782 0.8146067 0.8287809 0.8193639
## 45 0.9060900 0.8166635 0.6330322 0.8078652 0.8250348 0.8148293
## 49 0.9038696 0.8155680 0.6307687 0.8028090 0.8277129 0.8161240
## 53 0.9027160 0.8100844 0.6198853 0.8022472 0.8175395 0.8073928
## 57 0.9016041 0.8117290 0.6231597 0.8028090 0.8202119 0.8100278
## 61 0.9003120 0.8073443 0.6143785 0.7983146 0.8159324 0.8052955
## 65 0.8990699 0.8114561 0.6226228 0.8033708 0.8191467 0.8089259
## 69 0.8978033 0.8081662 0.6161317 0.8050562 0.8111153 0.8025758
## 73 0.8973118 0.8081673 0.6160707 0.8016854 0.8143281 0.8046126
## 77 0.8961138 0.8076190 0.6149561 0.8000000 0.8148686 0.8046241
## 81 0.8950017 0.8043283 0.6084044 0.7983146 0.8100472 0.8003485
## 85 0.8931619 0.8007633 0.6013076 0.7966292 0.8046895 0.7955719
## 89 0.8918578 0.7985723 0.5968965 0.7932584 0.8036229 0.7939245
## 93 0.8901854 0.7966567 0.5930853 0.7926966 0.8004129 0.7911669
## 97 0.8900391 0.7952831 0.5902953 0.7887640 0.8014781 0.7913221
## Neg Pred Value Precision Recall F1 Balanced Accuracy
## 0.9291886 0.8966884 0.9280899 0.9120126 0.9129468
## 0.8630398 0.8280164 0.8617978 0.8445148 0.8455553
## 0.8444004 0.8256993 0.8393258 0.8323926 0.8351200
## 0.8447821 0.8345987 0.8376404 0.8358764 0.8393662
## 0.8342635 0.8286890 0.8252809 0.8267418 0.8310416
## 0.8274305 0.8235378 0.8174157 0.8203879 0.8252417
## 0.8301899 0.8245569 0.8207865 0.8225959 0.8271930
## 0.8293044 0.8208898 0.8207865 0.8208095 0.8250490
## 0.8286355 0.8277818 0.8179775 0.8228133 0.8279283
## 0.8279705 0.8205509 0.8191011 0.8198107 0.8242084
## 0.8244659 0.8193639 0.8146067 0.8168969 0.8216938
## 0.8185602 0.8148293 0.8078652 0.8112714 0.8164500
## 0.8151600 0.8161240 0.8028090 0.8093741 0.8152609
## 0.8128353 0.8073928 0.8022472 0.8047367 0.8098934
## 0.8138170 0.8100278 0.8028090 0.8062329 0.8115104
## 0.8095852 0.8052955 0.7983146 0.8016785 0.8071235
## 0.8140542 0.8089259 0.8033708 0.8060636 0.8112587
## 0.8138944 0.8025758 0.8050562 0.8036917 0.8080857
## 0.8119401 0.8046126 0.8016854 0.8030065 0.8080068
## 0.8105868 0.8046241 0.8000000 0.8022549 0.8074343
## 0.8083524 0.8003485 0.7983146 0.7992445 0.8041809
## 0.8060077 0.7955719 0.7966292 0.7960027 0.8006594
## 0.8033568 0.7939245 0.7932584 0.7934608 0.7984407
## 0.8024606 0.7911669 0.7926966 0.7917404 0.7965548
## 0.7994490 0.7913221 0.7887640 0.7898954 0.7951211
##
## F1 was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
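One caveat: SMOTE was applied before cross-validation, so synthetic observations derived from the same original case can land in both the training and the validation folds, which makes very small k (here k = 1) look deceptively good. A cleaner sketch uses caret’s built-in sampling option, which applies SMOTE inside each fold (assuming the DMwR package is installed; knn_inner_smote is an illustrative name, not an object used later):
ctrl_cv5_smote <- trainControl(method = "cv",
                               number = 5,
                               classProbs = TRUE,
                               summaryFunction = twoClassSummary,
                               # SMOTE re-applied within each fold
                               sampling = "smote")
set.seed(77777)
knn_inner_smote <- train(consumption_cocaine_last_month ~ .,
                         data = drugs_my_train %>% dplyr::select(-id),
                         method = "knn",
                         tuneGrid = different_k,
                         preProcess = c("range"),
                         metric = "ROC",
                         trControl = ctrl_cv5_smote)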
Let’s take a look at the forecasts of the balanced kNN models on the test sample
drugs_my_test_forecasts_balanced <-
data.frame(drugs_my_train_knn5_balanced = predict(drugs_my_train_knn5_balanced,
drugs_my_test),
drugs_my_train_knn61_balanced = predict(drugs_my_train_knn61_balanced,
drugs_my_test),
drugs_my_train_knn_tuned_balanced = predict(drugs_my_train_knn_tuned_balanced,
drugs_my_test),
drugs_my_train_knn_tuned_scaled_balanced = predict(drugs_my_train_knn_tuned_scaled_balanced,
drugs_my_test),
drugs_my_train_knn_tuned_scaled2_balanced = predict(drugs_my_train_knn_tuned_scaled2_balanced,
drugs_my_test),
drugs_my_train_knn_tuned_scaled3_balanced = predict(drugs_my_train_knn_tuned_scaled3_balanced,
drugs_my_test)
)
head(drugs_my_test_forecasts_balanced)
## drugs_my_train_knn5_balanced drugs_my_train_knn61_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 No Yes
## 5 Yes Yes
## 6 Yes Yes
## drugs_my_train_knn_tuned_balanced drugs_my_train_knn_tuned_scaled_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes No
## 6 Yes No
## drugs_my_train_knn_tuned_scaled2_balanced
## 1 No
## 2 No
## 3 Yes
## 4 No
## 5 No
## 6 No
## drugs_my_train_knn_tuned_scaled3_balanced
## 1 No
## 2 No
## 3 Yes
## 4 Yes
## 5 No
## 6 No
sapply(drugs_my_test_forecasts_balanced,
function(x) summary_binary_class(predicted_classes = x,
real = drugs_my_test$consumption_cocaine_last_month)) %>%
# transpose the results to have statistics in columns
# for easier comparison
t()
## Accuracy Sensitivity Specificity
## drugs_my_train_knn5_balanced 0.67706 0.57895 0.68613
## drugs_my_train_knn61_balanced 0.56793 0.71053 0.55474
## drugs_my_train_knn_tuned_balanced 0.65924 0.55263 0.66910
## drugs_my_train_knn_tuned_scaled_balanced 0.61247 0.55263 0.61800
## drugs_my_train_knn_tuned_scaled2_balanced 0.76392 0.44737 0.79319
## drugs_my_train_knn_tuned_scaled3_balanced 0.61247 0.55263 0.61800
## Pos Pred Value Neg Pred Value F1
## drugs_my_train_knn5_balanced 0.14570 0.94631 0.23280
## drugs_my_train_knn61_balanced 0.12857 0.95397 0.21774
## drugs_my_train_knn_tuned_balanced 0.13376 0.94178 0.21538
## drugs_my_train_knn_tuned_scaled_balanced 0.11798 0.93727 0.19444
## drugs_my_train_knn_tuned_scaled2_balanced 0.16667 0.93948 0.24286
## drugs_my_train_knn_tuned_scaled3_balanced 0.11798 0.93727 0.19444
## Balanced Accuracy
## drugs_my_train_knn5_balanced 0.63254
## drugs_my_train_knn61_balanced 0.63264
## drugs_my_train_knn_tuned_balanced 0.61087
## drugs_my_train_knn_tuned_scaled_balanced 0.58532
## drugs_my_train_knn_tuned_scaled2_balanced 0.62028
## drugs_my_train_knn_tuned_scaled3_balanced 0.58532
Let’s generate the predictions for the true test sample
drugs_my_test_knn_tuned_scaled3_balanced = predict(drugs_my_train_knn_tuned_scaled3_balanced,
drugs_true_test)
head(drugs_my_test_knn_tuned_scaled3_balanced)
## [1] Yes No No No No Yes
## Levels: No Yes
table(drugs_my_test_knn_tuned_scaled3_balanced)
## drugs_my_test_knn_tuned_scaled3_balanced
## No Yes
## 252 133
Again, oversampling with SMOTE increased the balanced accuracy (BA) considerably. Two of the balanced kNN models reach about 62-63% BA - much better than the imbalanced kNN models, though still below the best balanced logit model (69%). Let’s now take a look at the Support Vector Machine (SVM) classification method.
This method finds a line or, more generally, a hyperplane that separates the classes with the maximum possible margin.
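To build some intuition first, here is a toy sketch of such a boundary fitted on just two of the numeric predictors (illustrative only, not part of the modelling pipeline):
library(kernlab)
# linear-kernel SVM on two personality scores only
toy_svm <- ksvm(consumption_cocaine_last_month ~ personality_sensation + personality_impulsiveness,
                data = as.data.frame(drugs_my_train),
                kernel = "vanilladot",
                C = 1)
# 2D plot of the fitted decision boundary
plot(toy_svm, data = as.data.frame(drugs_my_train))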
library(kernlab)
library(verification)
library(ggplot2)
library(tidyverse)
library(caret)
Let’s first fit a simple SVM with a linear kernel
set.seed(77777)
ctrl_nocv <- trainControl(method = "none")
# model is trained on the train sample
# initially with the linear kernel function
# method = "svmLinear"
set.seed(77777)
svm_Linear1 <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "svmLinear",
trControl = ctrl_nocv)
# check the result
svm_Linear1
## Support Vector Machines with Linear Kernel
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
svm_Linear1$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 260
##
## Objective Function Value : -178
## Training error : 0.084681
# here C = 1 (default for the linear kernel)
# forecasts
drugs_my_test$fore_svm_Linear1 <- predict(svm_Linear1,
newdata = drugs_my_test %>%
# we exclude id
dplyr::select(-id))
head(drugs_my_test)
## # A tibble: 6 x 22
## id age gender education country ethnicity personality_neur~
## <chr> <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 train_0001 45-54 male Masters degree USA Mixed-Bla~ 57.6
## 2 train_0002 25-34 male University degree USA Mixed-Bla~ 47.8
## 3 train_0009 18-24 female University degree Austra~ Mixed-Bla~ 60.8
## 4 train_0012 18-24 female Some college or ~ Austra~ Mixed-Bla~ 66.6
## 5 train_0013 35-44 female Professional cer~ USA Mixed-Bla~ 69.7
## 6 train_0014 18-24 female Left school at 1~ Other Mixed-Bla~ 49.2
## # ... with 15 more variables: personality_extraversion <dbl>,
## # personality_openness <dbl>, personality_agreeableness <dbl>,
## # personality_conscientiousness <dbl>, personality_impulsiveness <dbl>,
## # personality_sensation <dbl>, consumption_alcohol <fct>,
## # consumption_amphetamines <fct>, consumption_caffeine <fct>,
## # consumption_cannabis <fct>, consumption_chocolate <fct>,
## # consumption_mushrooms <fct>, consumption_nicotine <fct>, ...
table(drugs_my_test$fore_svm_Linear1)
##
## No Yes
## 449 0
# all observations predicted as 'No'
# confusion matrix
confusionMatrix(drugs_my_test$fore_svm_Linear1, # forecasts
drugs_my_test$consumption_cocaine_last_month, # real values
# 'Yes' is treated as the positive class,
# consistently with the earlier models
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 411 38
## Yes 0 0
##
## Accuracy : 0.9154
## 95% CI : (0.8857, 0.9394)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000000001947
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.91537
## Prevalence : 0.08463
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Yes
##
modelLookup("svmLinear")
## model parameter label forReg forClass probModel
## 1 svmLinear C Cost TRUE TRUE TRUE
set.seed(77777)
ctrl_cv5x3 <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3)
parametersC <- data.frame(C = c(0.001, 0.01, 0.02, 0.05,
0.1, 0.2, 0.5, 1, 2, 5))
set.seed(77777)
svm_Linear2 <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
dplyr::select(-id),
method = "svmLinear",
tuneGrid = parametersC,
trControl = ctrl_cv5x3)
svm_Linear2
## Support Vector Machines with Linear Kernel
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.001 0.9153219 0.000000000
## 0.010 0.9146899 -0.001157184
## 0.020 0.9146899 -0.001157184
## 0.050 0.9137376 -0.002926955
## 0.100 0.9070724 -0.003010243
## 0.200 0.9061185 0.010749087
## 0.500 0.9054821 0.043975949
## 1.000 0.9029363 0.074897105
## 2.000 0.9000897 0.094891631
## 5.000 0.8988259 0.119849143
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.001.
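With accuracy as the selection metric on imbalanced data, C = 0.001 wins simply because it reproduces the all-“No” classifier. A sketch of re-running the same tuning on the SMOTE-balanced sample, mirroring the logit and kNN sections (svm_Linear2_balanced is an illustrative name, not an object used later):
set.seed(77777)
svm_Linear2_balanced <- train(consumption_cocaine_last_month ~ .,
                              data = train_balanced,
                              method = "svmLinear",
                              tuneGrid = parametersC,
                              trControl = ctrl_cv5x3)
svm_Linear2_balanced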
Now let’s take a look at the SVM with a polynomial kernel
modelLookup("svmPoly")
## model parameter label forReg forClass probModel
## 1 svmPoly degree Polynomial Degree TRUE TRUE TRUE
## 2 svmPoly scale Scale TRUE TRUE TRUE
## 3 svmPoly C Cost TRUE TRUE TRUE
svm_parametersPoly <- expand.grid(C = c(0.001, 1),
degree = 2:5,
scale = 1)
svm_parametersPoly
## C degree scale
## 1 0.001 2 1
## 2 1.000 2 1
## 3 0.001 3 1
## 4 1.000 3 1
## 5 0.001 4 1
## 6 1.000 4 1
## 7 0.001 5 1
## 8 1.000 5 1
set.seed(77777)
svm_poly <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
dplyr::select(-id),
method = "svmPoly",
tuneGrid = svm_parametersPoly,
trControl = ctrl_cv5x3)
svm_poly
## Support Vector Machines with Polynomial Kernel
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results across tuning parameters:
##
## C degree Accuracy Kappa
## 0.001 2 0.8896299 0.13495794
## 0.001 3 0.8918461 0.08281164
## 0.001 4 0.8972399 0.01734128
## 0.001 5 0.9013624 0.02520789
## 1.000 2 0.8839185 0.18198722
## 1.000 3 0.8918461 0.08281164
## 1.000 4 0.8972399 0.01734128
## 1.000 5 0.9013624 0.02520789
##
## Tuning parameter 'scale' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 5, scale = 1 and C = 0.001.
drugs_my_test$fore_svm_poly <- predict(svm_poly,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_poly)
##
## No Yes
## 447 2
confusionMatrix(drugs_my_test$fore_svm_poly,
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 410 37
## Yes 1 1
##
## Accuracy : 0.9154
## 95% CI : (0.8857, 0.9394)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0.0419
##
## Mcnemar's Test P-Value : 0.00000001365
##
## Sensitivity : 0.026316
## Specificity : 0.997567
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.917226
## Prevalence : 0.084633
## Detection Rate : 0.002227
## Detection Prevalence : 0.004454
## Balanced Accuracy : 0.511941
##
## 'Positive' Class : Yes
##
# poor results
# ----- predictions for the provided test set (drugs_test.csv)
drugs_true_test <- read_csv("drugs_test.csv")
#View(drugs_true_test)
drugs_my_test_SVM <- predict(svm_poly, drugs_true_test %>%
                               dplyr::select(-id))
head(drugs_my_test_SVM)
## [1] No No No No No No
## Levels: No Yes
table(drugs_my_test_SVM)
## drugs_my_test_SVM
## No Yes
## 383 2
# for comparison: the actual class distribution in my test subsample
table(drugs_my_test$consumption_cocaine_last_month)
##
## No Yes
## 411 38
Now let’s implement the SVM with a radial kernel.
set.seed(77777)
svm_Radial1 <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "svmRadial",
trControl = ctrl_nocv)
# the result
svm_Radial1$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.25
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00853834775091178
##
## Number of Support Vectors : 387
##
## Objective Function Value : -43.4055
## Training error : 0.084681
# the training error (0.0847) looks slightly better than for the linear kernel,
# but with classes this imbalanced that alone says little
# this algorithm has two hyperparameters:
# - cost C (here by default C = 0.25)
# - sigma, the smoothing parameter of the radial basis kernel
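The default sigma reported above is not tuned - kernlab estimates it from the data. One can inspect that estimate directly; a small sketch:
# a sketch, assuming kernlab is attached (caret loads it for the svm* methods);
# sigest() returns the 0.1, 0.5 and 0.9 quantiles of a distance-based statistic,
# and both ksvm's "automatic" sigma and caret's default grid derive from this range
library(kernlab)
set.seed(77777)
sigma_range <- sigest(consumption_cocaine_last_month ~ .,
                      data = as.data.frame(drugs_my_train %>%
                                             dplyr::select(-id)))
sigma_range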
# let's compare the forecasts
drugs_my_test$fore_svm_Radial1 <- predict(svm_Radial1,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Radial1)
##
## No Yes
## 449 0
# confusion matrix for the test sample
confusionMatrix(drugs_my_test$fore_svm_Radial1,
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 411 38
## Yes 0 0
##
## Accuracy : 0.9154
## 95% CI : (0.8857, 0.9394)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000000001947
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.91537
## Prevalence : 0.08463
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Yes
##
# but is this the optimal set of hyperparameters?
# one can check this with cross-validation
# below 6 values of C and 5 sigmas are combined
parametersC_sigma <-
expand.grid(C = c(0.01, 0.05, 0.1, 0.5, 1, 5),
sigma = c(0.05, 0.1, 0.2, 0.5, 1))
head(parametersC_sigma)
## C sigma
## 1 0.01 0.05
## 2 0.05 0.05
## 3 0.10 0.05
## 4 0.50 0.05
## 5 1.00 0.05
## 6 5.00 0.05
nrow(parametersC_sigma)
## [1] 30
set.seed(77777)
svm_Radial2 <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
dplyr::select(-id),
method = "svmRadial",
tuneGrid = parametersC_sigma,
trControl = ctrl_cv5x3)
svm_Radial2
## Support Vector Machines with Radial Basis Function Kernel
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 0.01 0.05 0.9153219 0.000000000
## 0.01 0.10 0.9153219 0.000000000
## 0.01 0.20 0.9153219 0.000000000
## 0.01 0.50 0.9153219 0.000000000
## 0.01 1.00 0.9153219 0.000000000
## 0.05 0.05 0.9153219 0.000000000
## 0.05 0.10 0.9153219 0.000000000
## 0.05 0.20 0.9153219 0.000000000
## 0.05 0.50 0.9153219 0.000000000
## 0.05 1.00 0.9153219 0.000000000
## 0.10 0.05 0.9153219 0.000000000
## 0.10 0.10 0.9153219 0.000000000
## 0.10 0.20 0.9153219 0.000000000
## 0.10 0.50 0.9153219 0.000000000
## 0.10 1.00 0.9153219 0.000000000
## 0.50 0.05 0.9153219 0.000000000
## 0.50 0.10 0.9153219 0.000000000
## 0.50 0.20 0.9153219 0.000000000
## 0.50 0.50 0.9153219 0.000000000
## 0.50 1.00 0.9153219 0.000000000
## 1.00 0.05 0.9153219 0.000000000
## 1.00 0.10 0.9153219 0.000000000
## 1.00 0.20 0.9153219 0.000000000
## 1.00 0.50 0.9153219 0.000000000
## 1.00 1.00 0.9153219 0.000000000
## 5.00 0.05 0.9143695 -0.001769771
## 5.00 0.10 0.9153219 0.000000000
## 5.00 0.20 0.9153219 0.000000000
## 5.00 0.50 0.9153219 0.000000000
## 5.00 1.00 0.9153219 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 1 and C = 0.01.
# accuracy never moves above the no-information rate and Kappa is 0 throughout,
# so the "optimal" sigma = 1 and C = 0.01 is an arbitrary pick - nothing was learned
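A possible culprit is predictor scale: the radial kernel is distance-based, and the 0-100 personality scores dwarf the 0/1 dummies created from the factors. A sketch of the same tuning with centering and scaling added (an assumption - this variant was not run in this paper):
# a sketch; preProcess = c("center", "scale") standardizes the predictors
# inside each CV fold before the radial kernel distances are computed
set.seed(77777)
svm_Radial2_scaled <- train(consumption_cocaine_last_month ~ .,
                            drugs_my_train %>%
                              dplyr::select(-id),
                            method = "svmRadial",
                            preProcess = c("center", "scale"),
                            tuneGrid = parametersC_sigma,
                            trControl = ctrl_cv5x3)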
drugs_my_test$fore_svm_Radial2 <- predict(svm_Radial2,
newdata = drugs_my_test)
table(drugs_my_test$fore_svm_Radial2)
##
## No Yes
## 449 0
# in fact only the 'No' class is predicted again -
# the confusion matrix below confirms it
# confusion matrix on the test sample
confusionMatrix(drugs_my_test$fore_svm_Radial2,
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 411 38
## Yes 0 0
##
## Accuracy : 0.9154
## 95% CI : (0.8857, 0.9394)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000000001947
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.91537
## Prevalence : 0.08463
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Yes
##
# again poor results
# let's try a wider parameter grid
parametersC_sigma2 <-
expand.grid(C = c(1, 5, 10, 25, 50, 100),
sigma = c(0.001, 0.01, 0.1, 1, 10, 100, 1000))
set.seed(77777)
svm_Radial2_C2 <- train(consumption_cocaine_last_month ~ .,
drugs_my_train %>%
# we exclude id
dplyr::select(-id),
method = "svmRadial",
tuneGrid = parametersC_sigma2,
trControl = ctrl_cv5x3)
svm_Radial2_C2
## Support Vector Machines with Radial Basis Function Kernel
##
## 1051 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 841, 841, 840, 841, 841, 840, ...
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 1 0.001 0.9153219 0.000000000
## 1 0.010 0.9153219 0.000000000
## 1 0.100 0.9153219 0.000000000
## 1 1.000 0.9153219 0.000000000
## 1 10.000 0.9153219 0.000000000
## 1 100.000 0.9153219 0.000000000
## 1 1000.000 0.9153219 0.000000000
## 5 0.001 0.9115153 -0.006284598
## 5 0.010 0.9070844 0.063076403
## 5 0.100 0.9153219 0.000000000
## 5 1.000 0.9153219 0.000000000
## 5 10.000 0.9153219 0.000000000
## 5 100.000 0.9153219 0.000000000
## 5 1000.000 0.9153219 0.000000000
## 10 0.001 0.9112054 0.007931627
## 10 0.010 0.9048637 0.119927809
## 10 0.100 0.9153219 0.000000000
## 10 1.000 0.9153219 0.000000000
## 10 10.000 0.9153219 0.000000000
## 10 100.000 0.9153219 0.000000000
## 10 1000.000 0.9153219 0.000000000
## 25 0.001 0.9099249 0.016740712
## 25 0.010 0.9035848 0.129620973
## 25 0.100 0.9153219 0.000000000
## 25 1.000 0.9153219 0.000000000
## 25 10.000 0.9153219 0.000000000
## 25 100.000 0.9153219 0.000000000
## 25 1000.000 0.9153219 0.000000000
## 50 0.001 0.9061199 0.058412063
## 50 0.010 0.9035848 0.129620973
## 50 0.100 0.9153219 0.000000000
## 50 1.000 0.9153219 0.000000000
## 50 10.000 0.9153219 0.000000000
## 50 100.000 0.9153219 0.000000000
## 50 1000.000 0.9153219 0.000000000
## 100 0.001 0.9042212 0.123802672
## 100 0.010 0.9035848 0.129620973
## 100 0.100 0.9153219 0.000000000
## 100 1.000 0.9153219 0.000000000
## 100 10.000 0.9153219 0.000000000
## 100 100.000 0.9153219 0.000000000
## 100 1000.000 0.9153219 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 1000 and C = 1.
# CV selected sigma = 1000 and C = 1, yet Kappa is 0 across the whole grid -
# again nothing beyond the majority class is learned
# confusion matrix on the test sample
drugs_my_test$fore_svm_Radial2_C2 <- predict(svm_Radial2_C2,
newdata = drugs_my_test)
table(drugs_my_test$fore_svm_Radial2_C2)
##
## No Yes
## 449 0
confusionMatrix(predict(svm_Radial2_C2,
newdata = drugs_my_test),
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 411 38
## Yes 0 0
##
## Accuracy : 0.9154
## 95% CI : (0.8857, 0.9394)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 0.543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.000000001947
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.91537
## Prevalence : 0.08463
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : Yes
##
Again, the results are extremely poor. Let’s take a look at the performance of the SVM methods on balanced data. I will again use the SMOTE-balanced data set train_balanced.
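For reference, a minimal sketch of how such a balanced set can be produced with DMwR::SMOTE; the perc.over and perc.under values below are illustrative assumptions, not necessarily those used to build train_balanced earlier in this paper.
# a sketch only - train_balanced itself was created earlier in the paper
library(DMwR)
set.seed(77777)
train_balanced_sketch <- SMOTE(consumption_cocaine_last_month ~ .,
                               data = as.data.frame(drugs_my_train %>%
                                                      dplyr::select(-id)),
                               perc.over = 200,   # synthesize 200% new 'Yes' cases
                               perc.under = 150)  # down-sample the 'No' class
table(train_balanced_sketch$consumption_cocaine_last_month)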
set.seed(77777)
ctrl_nocv <- trainControl(method = "none")
set.seed(77777)
svm_Linear1_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmLinear",
trControl = ctrl_nocv)
# check the result
svm_Linear1_balanced
## Support Vector Machines with Linear Kernel
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: None
svm_Linear1_balanced$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 1377
##
## Objective Function Value : -1311.955
## Training error : 0.151
# here C = 1 (default for the linear kernel)
# forecasts
drugs_my_test$fore_svm_Linear1_balanced <- predict(svm_Linear1_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Linear1_balanced)
##
## No Yes
## 336 113
# both classes are now predicted - looks much better
Below is the confusion matrix for these forecasts; then I will repeat the whole tuning procedure on the balanced data set.
confusionMatrix(drugs_my_test$fore_svm_Linear1_balanced, # forecasts
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 322 14
## Yes 89 24
##
## Accuracy : 0.7706
## 95% CI : (0.7289, 0.8087)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2189
##
## Mcnemar's Test P-Value : 0.0000000000003067
##
## Sensitivity : 0.63158
## Specificity : 0.78345
## Pos Pred Value : 0.21239
## Neg Pred Value : 0.95833
## Prevalence : 0.08463
## Detection Rate : 0.05345
## Detection Prevalence : 0.25167
## Balanced Accuracy : 0.70752
##
## 'Positive' Class : Yes
##
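Worth noting: over-sampling is not the only way to fight the imbalance. kernlab’s ksvm also accepts class weights, which caret passes through to the fitting call; a sketch (the weights here are illustrative assumptions):
# a sketch; class.weights is forwarded by caret::train to kernlab::ksvm,
# making errors on the rare 'Yes' class roughly ten times more costly
set.seed(77777)
svm_Linear_weighted <- train(consumption_cocaine_last_month ~ .,
                             drugs_my_train %>%
                               dplyr::select(-id),
                             method = "svmLinear",
                             class.weights = c("No" = 1, "Yes" = 10),
                             trControl = ctrl_nocv)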
set.seed(77777)
ctrl_cv5x3 <- trainControl(method = "repeatedcv",
number = 5,
repeats = 3)
parametersC <- data.frame(C = c(0.001, 0.01, 0.02, 0.05,
0.1, 0.2, 0.5, 1, 2, 5))
set.seed(77777)
svm_Linear2_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmLinear",
tuneGrid = parametersC,
trControl = ctrl_cv5x3)
svm_Linear2_balanced
## Support Vector Machines with Linear Kernel
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919, 2920, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.001 0.8375801 0.6732123
## 0.010 0.8386775 0.6760589
## 0.020 0.8397745 0.6783688
## 0.050 0.8406884 0.6802832
## 0.100 0.8419673 0.6828985
## 0.200 0.8420582 0.6830678
## 0.500 0.8413275 0.6816174
## 1.000 0.8427896 0.6845471
## 2.000 0.8426069 0.6841927
## 5.000 0.8426978 0.6843877
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1.
svm_Linear2_balanced$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 1377
##
## Objective Function Value : -1311.955
## Training error : 0.151
# cost C = 1, i.e. exactly the same model as svm_Linear1_balanced above
# (same 1377 support vectors and objective function value)
# forecasts
drugs_my_test$fore_svm_Linear2_balanced <- predict(svm_Linear2_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Linear2_balanced)
##
## No Yes
## 336 113
# confusion matrix
confusionMatrix(drugs_my_test$fore_svm_Linear2_balanced, # forecasts
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 322 14
## Yes 89 24
##
## Accuracy : 0.7706
## 95% CI : (0.7289, 0.8087)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2189
##
## Mcnemar's Test P-Value : 0.0000000000003067
##
## Sensitivity : 0.63158
## Specificity : 0.78345
## Pos Pred Value : 0.21239
## Neg Pred Value : 0.95833
## Prevalence : 0.08463
## Detection Rate : 0.05345
## Detection Prevalence : 0.25167
## Balanced Accuracy : 0.70752
##
## 'Positive' Class : Yes
##
# ----- SVM with polynomial kernel (balanced data)
modelLookup("svmPoly")
## model parameter label forReg forClass probModel
## 1 svmPoly degree Polynomial Degree TRUE TRUE TRUE
## 2 svmPoly scale Scale TRUE TRUE TRUE
## 3 svmPoly C Cost TRUE TRUE TRUE
svm_parametersPoly <- expand.grid(C = c(0.001, 1),
degree = 2:5,
scale = 1)
svm_parametersPoly
## C degree scale
## 1 0.001 2 1
## 2 1.000 2 1
## 3 0.001 3 1
## 4 1.000 3 1
## 5 0.001 4 1
## 6 1.000 4 1
## 7 0.001 5 1
## 8 1.000 5 1
set.seed(77777)
svm_poly_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmPoly",
tuneGrid = svm_parametersPoly,
trControl = ctrl_cv5x3)
svm_poly_balanced
## Support Vector Machines with Polynomial Kernel
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919, 2920, ...
## Resampling results across tuning parameters:
##
## C degree Accuracy Kappa
## 0.001 2 0.9324919 0.8648752
## 0.001 3 0.9558781 0.9116591
## 0.001 4 0.9600803 0.9200251
## 0.001 5 0.9564265 0.9126658
## 1.000 2 0.9347755 0.8695766
## 1.000 3 0.9558781 0.9116591
## 1.000 4 0.9600803 0.9200251
## 1.000 5 0.9564265 0.9126658
##
## Tuning parameter 'scale' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 4, scale = 1 and C = 0.001.
drugs_my_test$fore_svm_poly_balanced <- predict(svm_poly_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_poly_balanced)
##
## No Yes
## 301 148
confusionMatrix(drugs_my_test$fore_svm_poly_balanced,
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 285 16
## Yes 126 22
##
## Accuracy : 0.6837
## 95% CI : (0.6385, 0.7265)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1177
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.57895
## Specificity : 0.69343
## Pos Pred Value : 0.14865
## Neg Pred Value : 0.94684
## Prevalence : 0.08463
## Detection Rate : 0.04900
## Detection Prevalence : 0.32962
## Balanced Accuracy : 0.63619
##
## 'Positive' Class : Yes
##
# ----- SVM with radial kernel (balanced data)
set.seed(77777)
svm_Radial1_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmRadial",
trControl = ctrl_nocv)
# the result
svm_Radial1_balanced$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.25
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.00825663697982993
##
## Number of Support Vectors : 1941
##
## Objective Function Value : -371.4748
## Training error : 0.107975
# also looks better than the linear
# this algorithm has two parameters:
# - cost C (here by default C = 0.25)
# and sigma - smoothing parameter for
# the radial basis kernel
# let's compare the forecasts
drugs_my_test$fore_svm_Radial1_balanced <- predict(svm_Radial1_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Radial1_balanced)
##
## No Yes
## 314 135
# confusion matrix for the test sample
confusionMatrix(drugs_my_test$fore_svm_Radial1_balanced,
drugs_my_test$consumption_cocaine_last_month,
positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 301 13
## Yes 110 25
##
## Accuracy : 0.7261
## 95% CI : (0.6823, 0.7668)
## No Information Rate : 0.9154
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1808
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.65789
## Specificity : 0.73236
## Pos Pred Value : 0.18519
## Neg Pred Value : 0.95860
## Prevalence : 0.08463
## Detection Rate : 0.05568
## Detection Prevalence : 0.30067
## Balanced Accuracy : 0.69513
##
## 'Positive' Class : Yes
##
# ----- radial kernel tuned with cross-validation (balanced data)
set.seed(77777)
svm_Radial2_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmRadial",
tuneGrid = parametersC_sigma,
trControl = ctrl_cv5x3)
# this takes quite a long time to run
svm_Radial2_balanced
## Support Vector Machines with Radial Basis Function Kernel
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919, 2920, ...
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 0.01 0.05 0.5121950 0.000000000
## 0.01 0.10 0.5121950 0.000000000
## 0.01 0.20 0.5121950 0.000000000
## 0.01 0.50 0.5121950 0.000000000
## 0.01 1.00 0.5121950 0.000000000
## 0.05 0.05 0.5692880 0.119563161
## 0.05 0.10 0.5121950 0.000000000
## 0.05 0.20 0.5121950 0.000000000
## 0.05 0.50 0.5121950 0.000000000
## 0.05 1.00 0.5121950 0.000000000
## 0.10 0.05 0.6737897 0.336717293
## 0.10 0.10 0.5168539 0.009780721
## 0.10 0.20 0.5121950 0.000000000
## 0.10 0.50 0.5121950 0.000000000
## 0.10 1.00 0.5121950 0.000000000
## 0.50 0.05 0.9280196 0.855510890
## 0.50 0.10 0.8743963 0.747085737
## 0.50 0.20 0.7403885 0.473776882
## 0.50 0.50 0.7265036 0.445252516
## 0.50 1.00 0.7255897 0.443376096
## 1.00 0.05 0.9501232 0.899993989
## 1.00 0.10 0.9032606 0.805487632
## 1.00 0.20 0.8861791 0.770944653
## 1.00 0.50 0.8854485 0.769465454
## 1.00 1.00 0.8854485 0.769465454
## 5.00 0.05 0.9533206 0.906433362
## 5.00 0.10 0.9058186 0.810664149
## 5.00 0.20 0.8863618 0.771314309
## 5.00 0.50 0.8854485 0.769465454
## 5.00 1.00 0.8854485 0.769465454
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.05 and C = 5.
# also looks good
# optimal values: sigma = 0.05 and C = 5
# then forecasts are generated
drugs_my_test$fore_svm_Radial2_balanced <- predict(svm_Radial2_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Radial2_balanced)
##
## No Yes
## 265 184
# confusion matrix on the test sample
confusionMatrix(drugs_my_test$fore_svm_Radial2_balanced,
                drugs_my_test$consumption_cocaine_last_month,
                positive = "Yes")
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  No Yes
##        No  254  11
##        Yes 157  27
##
##                Accuracy : 0.6258
##                  95% CI : (0.5792, 0.6708)
##     No Information Rate : 0.9154
##     P-Value [Acc > NIR] : 1
##
##                   Kappa : 0.1198
##
##  Mcnemar's Test P-Value : <0.0000000000000002
##
##             Sensitivity : 0.71053
##             Specificity : 0.61800
##          Pos Pred Value : 0.14674
##          Neg Pred Value : 0.95849
##              Prevalence : 0.08463
##          Detection Rate : 0.06013
##    Detection Prevalence : 0.40980
##       Balanced Accuracy : 0.66427
##
##        'Positive' Class : Yes
##
parametersC_sigma2 <-
expand.grid(C = c(1, 5, 10, 25, 50, 100),
sigma = c(0.001, 0.01, 0.1, 1, 10, 100, 1000))
set.seed(77777)
svm_Radial2_C2_balanced <- train(consumption_cocaine_last_month ~ .,
train_balanced,
method = "svmRadial",
tuneGrid = parametersC_sigma2,
trControl = ctrl_cv5x3)
svm_Radial2_C2_balanced
## Support Vector Machines with Radial Basis Function Kernel
##
## 3649 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 2919, 2919, 2920, 2919, 2919, 2920, ...
## Resampling results across tuning parameters:
##
## C sigma Accuracy Kappa
## 1 0.001 0.8460749 0.6904714
## 1 0.010 0.9198865 0.8392675
## 1 0.100 0.9032606 0.8054876
## 1 1.000 0.8854485 0.7694655
## 1 10.000 0.8854485 0.7694655
## 1 100.000 0.8854485 0.7694655
## 1 1000.000 0.8854485 0.7694655
## 5 0.001 0.8569486 0.7127565
## 5 0.010 0.9453728 0.8906137
## 5 0.100 0.9058186 0.8106641
## 5 1.000 0.8854485 0.7694655
## 5 10.000 0.8854485 0.7694655
## 5 100.000 0.8854485 0.7694655
## 5 1000.000 0.8854485 0.7694655
## 10 0.001 0.8684587 0.7359447
## 10 0.010 0.9524984 0.9049280
## 10 0.100 0.9058186 0.8106641
## 10 1.000 0.8854485 0.7694655
## 10 10.000 0.8854485 0.7694655
## 10 100.000 0.8854485 0.7694655
## 10 1000.000 0.8854485 0.7694655
## 25 0.001 0.8904733 0.7802217
## 25 0.010 0.9537764 0.9075101
## 25 0.100 0.9058186 0.8106641
## 25 1.000 0.8854485 0.7694655
## 25 10.000 0.8854485 0.7694655
## 25 100.000 0.8854485 0.7694655
## 25 1000.000 0.8854485 0.7694655
## 50 0.001 0.9125786 0.8246716
## 50 0.010 0.9541423 0.9082414
## 50 0.100 0.9058186 0.8106641
## 50 1.000 0.8854485 0.7694655
## 50 10.000 0.8854485 0.7694655
## 50 100.000 0.8854485 0.7694655
## 50 1000.000 0.8854485 0.7694655
## 100 0.001 0.9241809 0.8480363
## 100 0.010 0.9539594 0.9078764
## 100 0.100 0.9058186 0.8106641
## 100 1.000 0.8854485 0.7694655
## 100 10.000 0.8854485 0.7694655
## 100 100.000 0.8854485 0.7694655
## 100 1000.000 0.8854485 0.7694655
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01 and C = 50.
drugs_my_test$fore_svm_Radial2_C2_balanced <- predict(svm_Radial2_C2_balanced,
newdata = drugs_my_test %>%
dplyr::select(-id))
table(drugs_my_test$fore_svm_Radial2_C2_balanced)
##
##  No Yes
## 360  89
confusionMatrix(drugs_my_test$fore_svm_Radial2_C2_balanced,
                drugs_my_test$consumption_cocaine_last_month,
                positive = "Yes")
## Confusion Matrix and Statistics
##
##           Reference
## Prediction  No Yes
##        No  339  21
##        Yes  72  17
##
##                Accuracy : 0.7929
##                  95% CI : (0.7524, 0.8295)
##     No Information Rate : 0.9154
##     P-Value [Acc > NIR] : 1
##
##                   Kappa : 0.1692
##
##  Mcnemar's Test P-Value : 0.000000216
##
##             Sensitivity : 0.44737
##             Specificity : 0.82482
##          Pos Pred Value : 0.19101
##          Neg Pred Value : 0.94167
##              Prevalence : 0.08463
##          Detection Rate : 0.03786
##    Detection Prevalence : 0.19822
##       Balanced Accuracy : 0.63609
##
##        'Positive' Class : Yes
##
##
drugs_my_test_forecasts_balanced <-
data.frame(
drugs_my_train_svm_Linear1 = predict(svm_Linear1,
drugs_my_test),
drugs_my_train_svm_poly = predict(svm_poly,
drugs_my_test),
drugs_my_train_svm_Radial1 = predict(svm_Radial1,
drugs_my_test),
drugs_my_train_svm_Radial2 = predict(svm_Radial2,
drugs_my_test),
drugs_my_train_svm_Radial2_C2 = predict(svm_Radial2_C2,
drugs_my_test),
drugs_my_train_svm_Linear1_balanced = predict(svm_Linear1_balanced,
drugs_my_test),
drugs_my_train_svm_Linear2_balanced = predict(svm_Linear2_balanced,
drugs_my_test),
drugs_my_train_svm_poly_balanced = predict(svm_poly_balanced,
drugs_my_test),
drugs_my_train_svm_Radial1_balanced = predict(svm_Radial1_balanced,
drugs_my_test),
drugs_my_train_svm_Radial2_balanced = predict(svm_Radial2_balanced,
drugs_my_test),
drugs_my_train_svm_Radial2_C2_balanced = predict(svm_Radial2_C2_balanced,
drugs_my_test)
)
head(drugs_my_test_forecasts_balanced)
## drugs_my_train_svm_Linear1 drugs_my_train_svm_poly drugs_my_train_svm_Radial1
## 1 No No No
## 2 No No No
## 3 No No No
## 4 No No No
## 5 No No No
## 6 No No No
## drugs_my_train_svm_Radial2 drugs_my_train_svm_Radial2_C2
## 1 No No
## 2 No No
## 3 No No
## 4 No No
## 5 No No
## 6 No No
## drugs_my_train_svm_Linear1_balanced drugs_my_train_svm_Linear2_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 Yes Yes
## 6 Yes Yes
## drugs_my_train_svm_poly_balanced drugs_my_train_svm_Radial1_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 No No
## 6 No Yes
## drugs_my_train_svm_Radial2_balanced drugs_my_train_svm_Radial2_C2_balanced
## 1 No No
## 2 No No
## 3 Yes Yes
## 4 Yes Yes
## 5 No No
## 6 No No
# and let's apply the summary_binary_class()
# function to each of the columns with sapply
sapply(drugs_my_test_forecasts_balanced,
function(x) summary_binary_class(predicted_classes = x,
real = drugs_my_test$consumption_cocaine_last_month)) %>%
# transpose the results to have statistics in columns
# for easier comparison
t()
## Accuracy Sensitivity Specificity
## drugs_my_train_svm_Linear1 0.91537 0.00000 1.00000
## drugs_my_train_svm_poly 0.91537 0.02632 0.99757
## drugs_my_train_svm_Radial1 0.91537 0.00000 1.00000
## drugs_my_train_svm_Radial2 0.91537 0.00000 1.00000
## drugs_my_train_svm_Radial2_C2 0.91537 0.00000 1.00000
## drugs_my_train_svm_Linear1_balanced 0.77060 0.63158 0.78345
## drugs_my_train_svm_Linear2_balanced 0.77060 0.63158 0.78345
## drugs_my_train_svm_poly_balanced 0.68374 0.57895 0.69343
## drugs_my_train_svm_Radial1_balanced 0.72606 0.65789 0.73236
## drugs_my_train_svm_Radial2_balanced 0.62584 0.71053 0.61800
## drugs_my_train_svm_Radial2_C2_balanced 0.79287 0.44737 0.82482
## Pos Pred Value Neg Pred Value F1
## drugs_my_train_svm_Linear1 NaN 0.91537 NA
## drugs_my_train_svm_poly 0.50000 0.91723 0.05000
## drugs_my_train_svm_Radial1 NaN 0.91537 NA
## drugs_my_train_svm_Radial2 NaN 0.91537 NA
## drugs_my_train_svm_Radial2_C2 NaN 0.91537 NA
## drugs_my_train_svm_Linear1_balanced 0.21239 0.95833 0.31788
## drugs_my_train_svm_Linear2_balanced 0.21239 0.95833 0.31788
## drugs_my_train_svm_poly_balanced 0.14865 0.94684 0.23656
## drugs_my_train_svm_Radial1_balanced 0.18519 0.95860 0.28902
## drugs_my_train_svm_Radial2_balanced 0.14674 0.95849 0.24324
## drugs_my_train_svm_Radial2_C2_balanced 0.19101 0.94167 0.26772
## Balanced Accuracy
## drugs_my_train_svm_Linear1 0.50000
## drugs_my_train_svm_poly 0.51194
## drugs_my_train_svm_Radial1 0.50000
## drugs_my_train_svm_Radial2 0.50000
## drugs_my_train_svm_Radial2_C2 0.50000
## drugs_my_train_svm_Linear1_balanced 0.70752
## drugs_my_train_svm_Linear2_balanced 0.70752
## drugs_my_train_svm_poly_balanced 0.63619
## drugs_my_train_svm_Radial1_balanced 0.69513
## drugs_my_train_svm_Radial2_balanced 0.66427
## drugs_my_train_svm_Radial2_C2_balanced 0.63609
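To make the comparison explicit rather than eyeballed, the models can be ranked by balanced accuracy; a small sketch, assuming (as the printout above suggests) that summary_binary_class() returns a named numeric vector:
# svm_stats is a hypothetical name; it simply stores the transposed matrix above
svm_stats <- sapply(drugs_my_test_forecasts_balanced,
                    function(x) summary_binary_class(predicted_classes = x,
                                                     real = drugs_my_test$consumption_cocaine_last_month)) %>%
  t()
# rank models from best to worst balanced accuracy
sort(svm_stats[, "Balanced Accuracy"], decreasing = TRUE)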
And finally, the predictions of the chosen model for the original (provided) test sample:
summary(svm_Linear2_balanced)
## Length Class Mode
## 1 ksvm S4
drugs_my_test_svm_Linear2_balanced = predict(svm_Linear2_balanced,
drugs_true_test)
head(drugs_my_test_svm_Linear2_balanced)
## [1] Yes No No No No Yes
## Levels: No Yes
table(drugs_my_test_svm_Linear2_balanced)
## drugs_my_test_svm_Linear2_balanced
## No Yes
## 290 95
# fingers crossed for the results of predictions!!!
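A sketch of saving these final predictions to a file; the file name and column layout below are assumptions and should be matched to the submission requirements:
# hypothetical file name and column names - adjust to the required format
drugs_submission <- data.frame(
  id = drugs_true_test$id,
  consumption_cocaine_last_month = drugs_my_test_svm_Linear2_balanced)
write_csv(drugs_submission, "drugs_test_predictions.csv")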
Applying various Machine Learning methods, even quite sophisticated ones, may not improve a model’s performance at all when the data are heavily imbalanced: all of the SVMs trained on the unbalanced data predicted the majority class almost exclusively. Balancing the data - here with the SMOTE over-sampling algorithm - is therefore the first step to consider.
Once the data are balanced, a whole variety of ML classification algorithms can be applied, as I did above.
The results showed that the best trained model is svm_Linear2_balanced, a linear-kernel Support Vector Machine tuned with 5-fold cross-validation repeated 3 times. It reached the highest balanced accuracy on my test subsample (0.70752, tied with svm_Linear1_balanced, to which it is identical because cross-validation selected the default cost C = 1).