Initial data mutations include:
1. substituting the "unknown" level of poutcome (i.e., previous campaign outcome) with a more meaningful factor label
2. imputing the mode on contact, job, and education, grouping by combinations of at most two other variables
3. converting all character columns to factor datatypes
#recode the "unknown" previous-outcome label to "nonexistent"
bank2 %>%
mutate(poutcome = as.character(poutcome)) %>%
mutate(poutcome = ifelse(poutcome == "unknown","nonexistent",poutcome)) %>%
mutate(poutcome = as.factor(poutcome)) -> bank2
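The character round-trip above could equivalently be collapsed with forcats (bundled with the tidyverse), assuming poutcome is already a factor; a one-line sketch, not run here:
#bank2 <- bank2 %>% mutate(poutcome = forcats::fct_recode(poutcome, nonexistent = "unknown"))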
#imputing values manually
#helper: returns the modal (most frequent) value of a vector
most <- function(vec){
mode <- names(which.max(table(vec)))
return(mode)
}
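A quick sanity check of the helper on a toy vector:
most(c("a","b","b","unknown"))
## [1] "b"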
bank2 %>%
group_by(job,marital) %>%
#impute the modal contact within each job x marital group; if "unknown" is
#itself the group mode it survives (39 rows do, per the summary below)
mutate(contact2 = ifelse(contact=="unknown",most(contact),contact)) %>%
ungroup() -> bank2
bank2 %>%
group_by(marital,housing) %>%
#impute the modal job within each marital x housing group
mutate(job2 = ifelse(job=="unknown",most(job),job)) %>%
ungroup() -> bank2
bank2 %>%
group_by(marital,housing) %>%
#impute the modal education within each marital x housing group
mutate(education2 = ifelse(education=="unknown",most(education),education)) %>%
ungroup() -> bank2
#end manual imputing
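The three imputation blocks above repeat one pattern; a hypothetical helper (a sketch using dplyr name injection, not part of the pipeline above) could collapse them:
#sketch only; equivalent to the three blocks above
impute_mode <- function(df, var, groups) {
  df %>%
    group_by(across(all_of(groups))) %>%
    mutate("{var}2" := ifelse(.data[[var]] == "unknown", most(.data[[var]]), .data[[var]])) %>%
    ungroup()
}
#bank2 %>%
#  impute_mode("contact", c("job","marital")) %>%
#  impute_mode("job", c("marital","housing")) %>%
#  impute_mode("education", c("marital","housing")) -> bank2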
bank2 %>%
mutate_if(is.character,as.factor) -> bank2.f.2
#names(bank2.f.2)
tgt = c('age','job2','education2','y','marital','housing','loan','default','contact2','day','month','campaign','poutcome','pdays_f','balance2','duration2')
#DATA PRE-PROCESSING
# one-hot encoding of categorical variables
# scale numeric values
bank.flt <- bank2.f.2[,tgt]
This leaves 15 predictors plus the response. Here is a summary of the predictors:
summary(subset(bank.flt,select = -c(y)))
## age job2 education2 marital
## Min. :18.00 blue-collar:9745 primary : 6851 divorced: 5207
## 1st Qu.:33.00 management :9733 secondary:25059 married :27214
## Median :39.00 technician :7597 tertiary :13301 single :12790
## Mean :40.94 admin. :5171
## 3rd Qu.:48.00 services :4154
## Max. :95.00 retired :2264
## (Other) :6547
## housing loan default contact2 day
## no :20081 no :37967 no :44396 cellular :42266 Min. : 1.00
## yes:25130 yes: 7244 yes: 815 telephone: 2906 1st Qu.: 8.00
## unknown : 39 Median :16.00
## Mean :15.81
## 3rd Qu.:21.00
## Max. :31.00
##
## month campaign poutcome pdays_f
## may :13766 Min. : 1.000 failure : 4901 max : 308
## jul : 6895 1st Qu.: 1.000 nonexistent:36959 med1: 2835
## aug : 6247 Median : 2.000 other : 1840 med2: 3153
## jun : 5341 Mean : 2.764 success : 1511 min : 1961
## nov : 3970 3rd Qu.: 3.000 new :36954
## apr : 2932 Max. :63.000
## (Other): 6060
## balance2 duration2
## Min. :-8019.0 Min. : 0.0
## 1st Qu.: 72.0 1st Qu.:103.0
## Median : 448.0 Median :180.0
## Mean : 788.3 Mean :244.3
## 3rd Qu.: 1345.0 3rd Qu.:319.0
## Max. : 8971.0 Max. :773.2
##
sapply(bank.flt,class)
## age job2 education2 y marital housing loan
## "integer" "factor" "factor" "factor" "factor" "factor" "factor"
## default contact2 day month campaign poutcome pdays_f
## "factor" "factor" "integer" "factor" "integer" "factor" "factor"
## balance2 duration2
## "numeric" "numeric"
Next comes pre-processing specifically for the SVM. Even though e1071's svm() scales inputs automatically, I do it manually here because I don't want the function to scale the dummy variables, which it would, since they will be recognized as numeric columns from here on out.
#df with dummy variables and numeric
#convert all factor variables to dummy except for the response ("y")
test.dummy <- caret::dummyVars( ~. , bank.flt[,-4], sep='_')
df <- predict(test.dummy,bank.flt)
df <- data.frame(df)
#repatriate the 'y'
df$y <- bank.flt$y
Now each level of each factor variable in the original dataset will be cast to its own column. Note that keeping every level, rather than dropping one per factor, leaves the dummy columns linearly dependent (the dummy-variable trap); this is generally harmless for SVMs but matters for linear regression-style models.
#how many columns to expect? 47 dummies (sum of factor levels, excl. y) + 5 numeric + y = 53
bank.flt |>
select_if(is.factor) |>
summarize(across(everything(), ~nlevels(.)))
## # A tibble: 1 × 11
## job2 education2 y marital housing loan default contact2 month poutcome
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 11 3 2 3 2 2 2 3 12 4
## # ℹ 1 more variable: pdays_f <int>
#compare the number of columns from dummyVars to the levels count in the pipe operation
dim(df)
## [1] 45211 53
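If a full-rank encoding were preferred (dropping one dummy per factor to sidestep the dummy-variable trap), caret supports it directly. A sketch, not run here:
#test.dummy.fr <- caret::dummyVars( ~. , bank.flt[,-4], sep='_', fullRank=TRUE)
#df.fr <- data.frame(predict(test.dummy.fr, bank.flt))
#dim(df.fr)  #expect 42 columns: 52 predictors minus one per each of the 10 factors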
The final step of data preprocessing is scaling the numeric variables. Before doing this, however, it's appropriate to split the data into training and test sets, so that the test set avoids any influence from the training set: the training set's scaling range would otherwise bleed into the scaled test data, so the two sets are scaled independently.
In the code below, I split both the untreated and treated dataframes into training and test sets. Then, in the "treated" dataframe (the one with the pre-processing), I scale the numeric values. I don't do this to the test/train splits of the untreated dataset, since I will feed it as-is to the svm function and let it do its own pre-processing.
# SPLITTING THE DATASET
set.seed(123)
#I create a test/training split on the original and dummied dataset to check whether the pre-processing had a negative effect compared with what the algorithm would do on its own.
train_ix <- caret::createDataPartition(bank.flt$y, p=.8,list=FALSE,times=1)
train_ix_d <- caret::createDataPartition(df$y, p=.8,list=FALSE,times=1)
#sampling for testing purposes
train_ix_sm <- sample(train_ix,10000, replace = FALSE)
train_sm <- bank.flt[train_ix_sm,]
#original df
bank2_train <- bank.flt[train_ix,]
bank2_test <- bank.flt[-train_ix,]
#dummy converted df
df_train <- df[train_ix_d,]
df_test <- df[-train_ix_d,]
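#sanity check: createDataPartition stratifies on y, so both splits should
#show roughly the same class ratio
prop.table(table(bank2_train$y))
prop.table(table(bank2_test$y))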
#SCALE THE NUMERIC VALUES
#remove columns by a character vector of names; the resulting df holds only the non-numeric columns (dummies plus y)
bank.flt |>
select_if(is.numeric) |>
names() -> numeric.names
df_train |>
select(-any_of(numeric.names)) -> df_train_ex
#48 cols remain (47 dummies + y)
df_test |>
select(-any_of(numeric.names)) -> df_test_ex
train.scale <- scale(df_train[numeric.names])
test.scale <- scale(df_test[numeric.names])
final_train <- cbind(data.frame(train.scale),df_train_ex)
final_test <- cbind(data.frame(test.scale),df_test_ex)
#sampling final_train for speed; note train_ix_sm indexes the full dataset
#and can exceed nrow(final_train), so sample fresh row positions instead
final_train_sm <- final_train[sample(nrow(final_train),10000, replace = FALSE),]
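An alternative convention, not used here, is to scale the test set with the training set's center and scale, which scale() stores as attributes; a sketch:
#ctr <- attr(train.scale, "scaled:center")
#scl <- attr(train.scale, "scaled:scale")
#test.scale.alt <- scale(df_test[numeric.names], center = ctr, scale = scl)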
Verify that all columns except the response are of numeric datatype.
#verify that cols are numeric and that 'y' is categorical
sapply(final_train, class)
## age day campaign
## "numeric" "numeric" "numeric"
## balance2 duration2 job2_admin.
## "numeric" "numeric" "numeric"
## job2_blue.collar job2_entrepreneur job2_housemaid
## "numeric" "numeric" "numeric"
## job2_management job2_retired job2_self.employed
## "numeric" "numeric" "numeric"
## job2_services job2_student job2_technician
## "numeric" "numeric" "numeric"
## job2_unemployed education2_primary education2_secondary
## "numeric" "numeric" "numeric"
## education2_tertiary marital_divorced marital_married
## "numeric" "numeric" "numeric"
## marital_single housing_no housing_yes
## "numeric" "numeric" "numeric"
## loan_no loan_yes default_no
## "numeric" "numeric" "numeric"
## default_yes contact2_cellular contact2_telephone
## "numeric" "numeric" "numeric"
## contact2_unknown month_apr month_aug
## "numeric" "numeric" "numeric"
## month_dec month_feb month_jan
## "numeric" "numeric" "numeric"
## month_jul month_jun month_mar
## "numeric" "numeric" "numeric"
## month_may month_nov month_oct
## "numeric" "numeric" "numeric"
## month_sep poutcome_failure poutcome_nonexistent
## "numeric" "numeric" "numeric"
## poutcome_other poutcome_success pdays_f_max
## "numeric" "numeric" "numeric"
## pdays_f_med1 pdays_f_med2 pdays_f_min
## "numeric" "numeric" "numeric"
## pdays_f_new y
## "numeric" "factor"
After scaling, the numeric columns should have a mean of ~ 0. The means below are on the order of 1e-16, i.e., zero up to floating-point error, so the scaling checks out.
final_train |>
select(all_of(numeric.names)) |>
summarize(across(everything(), ~mean(.)))
## age day campaign balance2 duration2
## 1 2.800792e-16 -5.995073e-17 -5.825874e-17 -1.80449e-18 6.456245e-17
Moving on to fitting SVM models with various kernels: I'll fit linear and RBF kernels on this dataset. The task is binary classification.
#neither scaled nor one-hot encoded; linear kernel
#train_sm is the sampled untreated/original df
lin.mdl.hc <- svm(y ~ ., data = train_sm, type = "C-classification", kernel = "linear", scale = TRUE, cost = 5)
#clock the below (timing sketch follows): original, unscaled, categorical dataset
#support vectors: 1137
lin.mdl.hc1 <- svm(y ~ ., data = train_sm, type = "C-classification", kernel = "linear", scale = TRUE, cost = 50)
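To actually clock a fit, the call can be wrapped in system.time (a sketch, commented out like the other slow chunks):
#system.time(svm(y ~ ., data = train_sm, type = "C-classification", kernel = "linear", scale = TRUE, cost = 50))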
I’ll compile the results via the confusionMatrix function and produce them at the end alongside the other models.
#cost = 5
lin.mdl.1.pred <- predict(lin.mdl.hc,bank2_test)
cm.lin.1 <- confusionMatrix(lin.mdl.1.pred, bank2_test$y, positive = "yes")
#linear model with cost = 50
lin.mdl.2.pred <- predict(lin.mdl.hc1,bank2_test)
cm.lin.2 <- confusionMatrix(lin.mdl.2.pred, bank2_test$y, positive = "yes")
I'll compare the two models above against one fit on the fully pre-processed dataset, to check that the manual pre-processing performs at least as well as what the algorithm would do on its own.
#scaled and one-hot encoded; linear kernel; cost = 5
lin.model.trt <- svm(y ~ ., data = final_train_sm, type = "C-classification", kernel = "linear", scale = FALSE, cost = 5)
#cost = 5
lin.mdl.trt.1.pred <- predict(lin.model.trt,final_test)
cm.lin.trt.1 <- confusionMatrix(lin.mdl.trt.1.pred, final_test$y, positive = "yes")
Compile all the results into the df that already holds the model results from assignment #2.
results.df[9,"model"] = 'svm linear c5 original df'
results.df[10,"model"] = 'svm linear c50 original df'
results.df[11,"model"] = 'svm linear c5 treated df'
results.df[9,"F1"] = cm.lin.1$byClass['F1']
results.df[10,"F1"] = cm.lin.2$byClass['F1']
results.df[11,"F1"] = cm.lin.trt.1$byClass['F1']
results.df[9,"Recall"] = cm.lin.1$byClass['Recall']
results.df[10,"Recall"] = cm.lin.2$byClass['Recall']
results.df[11,"Recall"] = cm.lin.trt.1$byClass['Recall']
The model fit on the pre-processed dataset performed slightly better (F1 0.292 vs. 0.275). NOTE: the models here were fit on a sample of the training data, 10,000 out of ~36,000 records, to shorten processing time.
I'll compare the above with the results of a tuned RBF kernel, tuned via tune.svm on the training dataset. (The tuning code below is commented out to reduce knitting time; the best parameters, taken from the summary of the tuned object, are hard-coded into the svm call further down.)
#TUNE PARAMETERS
#try tuning linear, then polynomial and lastly RBF
#example 1
#polynomial involves gamma, coef0 and degree
#radial: gamma
#tune.rad <- tune.svm(y ~ ., data = final_train, gamma = 10^(-2:-4), cost = 10^(1:2), kernel = "radial", scale = FALSE)
#summary(tune.rad)
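If the tuning chunk is run, the winning settings can be read off the tune object directly rather than transcribed from summary():
#tune.rad$best.parameters   #one-row data.frame with the best gamma and cost
#tune.rad$best.model        #the svm already refit on those parameters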
I'll plug in the parameters that produced the lowest classification error rate.
best.rad <- svm(y ~ ., data = final_train, type = "C-classification", kernel = "radial", cost = 100, gamma = 10^-2, scale = FALSE)
rbf.pred <- predict(best.rad,final_test)
cm.rbf <- confusionMatrix(rbf.pred, final_test$y, positive = "yes")
Record the results from the RBF into the model aggregator.
results.df[12,"model"] = 'svm rbf'
results.df[12,"F1"] = cm.rbf$byClass['F1']
results.df[12,"Recall"] = cm.rbf$byClass['Recall']
View the results across all models:
results.df
## model F1 ROC_AUC Recall
## 1 dt original 0.4140088 0.7455501 0.3131504
## 2 dt mutated 0.4140088 0.7455501 0.3131504
## 3 tuned dt 0.4726651 0.7931427 0.3926206
## 4 dt balanced classes 0.4865156 NA 0.8533586
## 5 rf default 0.5000000 NA 0.4143803
## 6 rf tuned 0.5405405 NA 0.4730369
## 7 rf balanced dataset 0.5773585 NA 0.8684957
## 8 dt balanced classes 0.4806630 NA 0.4115421
## 9 svm linear c5 original df 0.2748320 NA 0.1740776
## 10 svm linear c50 original df 0.2748320 NA 0.1740776
## 11 svm linear c5 treated df 0.2915751 NA 0.1882687
## 12 svm rbf 0.1876833 NA 0.1210974