knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 8, fig.height = 5)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
theme_set(theme_minimal())
set.seed(123)
candidates <- c(
params$bank_path,
"data/BankData.csv",
"BankData.csv",
file.path(Sys.getenv("HOME"), "Downloads", "BankData.csv")
)
existing <- candidates[file.exists(candidates)]
if (length(existing) == 0) {
stop("BankData.csv not found. Set params$bank_path or place the file next to this .Rmd.\nTried:\n - ",
paste(candidates, collapse = "\n - "))
}
BankData <- read.csv(existing[1], stringsAsFactors = FALSE)
stopifnot(is.data.frame(BankData), nrow(BankData) > 0)
bank <- BankData %>%
mutate(
across(starts_with("bool"), ~ ifelse(. %in% c("t","T","true","TRUE",1), 1L, 0L)),
approval = factor(approval)
)
numeric_cols <- bank %>% select(where(is.numeric)) %>% names()
cat_cols <- c("approval", names(select(BankData, starts_with("bool"))))
bank %>%
pivot_longer(all_of(numeric_cols), names_to = "var", values_to = "value") %>%
ggplot(aes(value)) + geom_histogram(bins = 30) +
facet_wrap(~ var, scales = "free") + labs(title = "Numeric — Histograms")
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_bin()`).

bank %>%
pivot_longer(all_of(numeric_cols), names_to = "var", values_to = "value") %>%
ggplot(aes(value)) + geom_density() +
facet_wrap(~ var, scales = "free") + labs(title = "Numeric — Densities")
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_density()`).

BankData %>%
pivot_longer(all_of(cat_cols), names_to = "var", values_to = "value") %>%
mutate(value = factor(value)) %>%
ggplot(aes(value)) + geom_bar() +
facet_wrap(~ var, scales = "free_y") +
labs(title = "Categorical/Booleans — Bars", x = NULL, y = "Count")

bank <- bank %>%
mutate(
cont1_z = as.numeric(scale(cont1)),
credit.score_mm = (credit.score - min(credit.score, na.rm=TRUE))/
(max(credit.score, na.rm=TRUE) - min(credit.score, na.rm=TRUE)),
cont6_dec = cont6 / (10 ^ ceiling(log10(max(abs(cont6), na.rm=TRUE))))
)
summary(select(bank, cont1, cont1_z, credit.score, credit.score_mm, cont6, cont6_dec))
## cont1 cont1_z credit.score credit.score_mm
## Min. :13.75 Min. :-1.4901 Min. :583.7 Min. :0.0000
## 1st Qu.:22.60 1st Qu.:-0.7498 1st Qu.:666.7 1st Qu.:0.3737
## Median :28.46 Median :-0.2599 Median :697.3 Median :0.5110
## Mean :31.57 Mean : 0.0000 Mean :696.4 Mean :0.5070
## 3rd Qu.:38.23 3rd Qu.: 0.5571 3rd Qu.:726.4 3rd Qu.:0.6418
## Max. :80.25 Max. : 4.0711 Max. :806.0 Max. :1.0000
## NA's :12 NA's :12
## cont6 cont6_dec
## Min. : 0.0 Min. :0.000000
## 1st Qu.: 0.0 1st Qu.:0.000000
## Median : 5.0 Median :0.000050
## Mean : 1017.4 Mean :0.010174
## 3rd Qu.: 395.5 3rd Qu.:0.003955
## Max. :100000.0 Max. :1.000000
##
bank %>%
select(cont1, cont1_z, credit.score, credit.score_mm, cont6, cont6_dec) %>%
pivot_longer(everything(), names_to = "var", values_to = "value") %>%
ggplot(aes(value)) + geom_histogram(bins = 30) +
facet_wrap(~ var, scales = "free") + labs(title = "Original vs Normalized")
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

bank <- bank %>%
mutate(credit_bin = factor(ntile(credit.score, 3), labels = c("Low","Mid","High")))
table(bank$credit_bin)
##
## Low Mid High
## 230 230 230
ggplot(bank, aes(credit.score, fill = credit_bin)) +
geom_histogram(bins = 30, alpha = 0.85) +
labs(title = "Credit Score Bins (Equal Depth)")

bin_stats <- bank %>% group_by(credit_bin) %>%
summarise(credit_smooth_val = median(credit.score, na.rm = TRUE), .groups = "drop")
bank <- bank %>% left_join(bin_stats, by = "credit_bin") %>%
rename(credit.score_smoothed = credit_smooth_val)
bank %>%
select(credit.score, credit.score_smoothed, credit_bin) %>%
pivot_longer(-credit_bin, names_to = "kind", values_to = "value") %>%
ggplot(aes(value, fill = kind)) + geom_density(alpha = 0.6) +
facet_wrap(~ credit_bin, scales = "free_x") +
labs(title = "Smoothing by Bin Medians")

bd <- BankData %>%
mutate(
across(starts_with("bool"), ~ ifelse(. %in% c("t","T","true","TRUE",1), 1L, 0L)),
approval = factor(approval)
) %>% drop_na()
ctrl10 <- trainControl(method = "cv", number = 10)
svm_cv <- train(
approval ~ ., data = bd, method = "svmLinear",
trControl = ctrl10, preProcess = c("center","scale")
)
svm_cv
## Support Vector Machines with Linear Kernel
##
## 666 samples
## 12 predictor
## 2 classes: '-', '+'
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 599, 599, 599, 600, 599, 600, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8633424 0.7286267
##
## Tuning parameter 'C' was held constant at a value of 1
gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_grid <- train(
approval ~ ., data = bd, method = "svmLinear",
trControl = ctrl10, preProcess = c("center","scale"),
tuneGrid = gridC
)
svm_grid
## Support Vector Machines with Linear Kernel
##
## 666 samples
## 12 predictor
## 2 classes: '-', '+'
##
## Pre-processing: centered (12), scaled (12)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 601, 600, 599, 599, 599, 599, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.5510427 0.00000000
## 3.162278e-05 0.5510427 0.00000000
## 1.000000e-04 0.5510427 0.00000000
## 3.162278e-04 0.5706506 0.04748434
## 1.000000e-03 0.8379623 0.67022353
## 3.162278e-03 0.8650569 0.73168190
## 1.000000e-02 0.8650343 0.73176280
## 3.162278e-02 0.8635191 0.72869969
## 1.000000e-01 0.8635191 0.72869969
## 3.162278e-01 0.8635191 0.72869969
## 1.000000e+00 0.8635191 0.72869969
## 3.162278e+00 0.8635191 0.72869969
## 1.000000e+01 0.8635191 0.72869969
## 3.162278e+01 0.8635191 0.72869969
## 1.000000e+02 0.8635191 0.72869969
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.003162278.
svm_grid$bestTune
max(svm_grid$results$Accuracy)
## [1] 0.8650569
data(starwars, package = "dplyr")
sw_raw <- starwars %>%
select(-films, -vehicles, -starships, -name) %>% drop_na()
pred_cat <- sw_raw %>% select(where(is.character)) %>% names()
pred_cat <- setdiff(pred_cat, "gender")
dv <- dummyVars(~ ., data = sw_raw[, pred_cat, drop = FALSE], fullRank = TRUE)
X_cat <- as.data.frame(predict(dv, sw_raw[, pred_cat, drop = FALSE]))
X_num <- sw_raw %>% select(-all_of(pred_cat), -gender)
sw_ml <- bind_cols(X_num, X_cat, gender = factor(sw_raw$gender))
head(sw_ml)
svm_sw <- train(
gender ~ ., data = sw_ml, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
preProcess = c("center","scale")
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: skin_colorred, homeworldDathomir,
## homeworldSerenno, speciesZabrak
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: `skin_colorbrown mottle`,
## skin_colortan, skin_colorunknown, `homeworldConcord Dawn`, homeworldKashyyyk,
## `homeworldMon Cala`, `speciesMon Calamari`, speciesWookiee
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: skin_colorgreen, eye_colorred,
## homeworldBespin, homeworldRyloth, homeworldTrandosha, speciesTrandoshan,
## `speciesTwi'lek`
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: `hair_colorbrown, grey`,
## homeworldKamino
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: homeworldCerea, homeworldDorin,
## `speciesKel Dor`
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: `homeworldHaruun Kal`
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: skin_colorwhite
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: `eye_colorblue-gray`,
## homeworldSocorro, homeworldStewjon
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: hair_colorgrey, skin_colorbrown,
## homeworldEndor, speciesEwok, speciesGungan
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
svm_sw
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 60 predictors
## 2 classes: 'feminine', 'masculine'
##
## Pre-processing: centered (60), scaled (60)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26, 25, 26, 26, 27, 27, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9416667 0.7857143
##
## Tuning parameter 'C' was held constant at a value of 1
X <- sw_ml %>% select(-gender)
pca <- prcomp(X, scale. = TRUE)
cumvar <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
plot(cumvar, type = "b", ylab = "Cumulative explained variance", xlab = "PCs",
main = "Choose PCs (~95%)"); abline(h = 0.95, lty = 2)

pp <- preProcess(X, method = c("center","scale","pca"), thresh = 0.95)
pp$numComp
## [1] 21
X_pca <- predict(pp, X)
sw_pca <- bind_cols(X_pca, gender = sw_ml$gender)
head(sw_pca)
idx <- createDataPartition(sw_pca$gender, p = 0.7, list = FALSE)
sw_train <- sw_pca[idx, ]
sw_test <- sw_pca[-idx, ]
gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_pca_cvgrid <- train(
gender ~ ., data = sw_train, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
tuneGrid = gridC
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
svm_pca_cvgrid$bestTune
pred_test <- predict(svm_pca_cvgrid, sw_test)
confusionMatrix(sw_test$gender, pred_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction feminine masculine
## feminine 1 0
## masculine 0 6
##
## Accuracy : 1
## 95% CI : (0.5904, 1)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.3399
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.1429
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : feminine
##
svm_pca_cv <- train(
gender ~ ., data = sw_pca, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
tuneGrid = gridC
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
svm_pca_cv
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 21 predictors
## 2 classes: 'feminine', 'masculine'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 26, 26, 26, 25, 27, 26, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.8166667 0.0000000
## 3.162278e-05 0.8166667 0.0000000
## 1.000000e-04 0.8166667 0.0000000
## 3.162278e-04 0.8166667 0.0000000
## 1.000000e-03 0.8166667 0.0000000
## 3.162278e-03 0.8166667 0.0000000
## 1.000000e-02 0.8166667 0.0000000
## 3.162278e-02 0.8166667 0.0000000
## 1.000000e-01 0.9666667 0.8333333
## 3.162278e-01 0.9333333 0.7333333
## 1.000000e+00 0.9333333 0.7333333
## 3.162278e+00 0.9333333 0.7333333
## 1.000000e+01 0.9333333 0.7333333
## 3.162278e+01 0.9333333 0.7333333
## 1.000000e+02 0.9333333 0.7333333
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
svm_pca_loocv <- train(
gender ~ ., data = sw_pca, method = "svmLinear",
trControl = trainControl(method = "LOOCV"),
tuneGrid = gridC
)
svm_pca_loocv
## Support Vector Machines with Linear Kernel
##
## 29 samples
## 21 predictors
## 2 classes: 'feminine', 'masculine'
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.7931034 0.0000000
## 3.162278e-05 0.7931034 0.0000000
## 1.000000e-04 0.7931034 0.0000000
## 3.162278e-04 0.7931034 0.0000000
## 1.000000e-03 0.7931034 0.0000000
## 3.162278e-03 0.7931034 0.0000000
## 1.000000e-02 0.7931034 0.0000000
## 3.162278e-02 0.7931034 0.0000000
## 1.000000e-01 0.9655172 0.8880309
## 3.162278e-01 0.9310345 0.7898551
## 1.000000e+00 0.9310345 0.7898551
## 3.162278e+00 0.9310345 0.7898551
## 1.000000e+01 0.9310345 0.7898551
## 3.162278e+01 0.9310345 0.7898551
## 1.000000e+02 0.9310345 0.7898551
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
data(Sacramento, package = "caret")
sac <- Sacramento %>%
select(-zip, -city) %>%
drop_na()
table(sac$type); prop.table(table(sac$type))
##
## Condo Multi_Family Residential
## 53 13 866
##
## Condo Multi_Family Residential
## 0.05686695 0.01394850 0.92918455
sac %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "var", values_to = "value") %>%
ggplot(aes(value)) + geom_histogram(bins = 30) +
facet_wrap(~ var, scales = "free") +
labs(title = "Sacramento — Numeric Distributions")

sac1 <- sac %>%
mutate(
price_log = log(price),
sqft_log = log(sqft)
) %>%
select(-price, -sqft)
idx <- createDataPartition(sac1$type, p = 0.7, list = FALSE)
tr <- sac1[idx, ]
te <- sac1[-idx, ]
gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_sac <- train(
type ~ ., data = tr, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
preProcess = c("center","scale"),
tuneGrid = gridC
)
svm_sac
## Support Vector Machines with Linear Kernel
##
## 655 samples
## 6 predictor
## 3 classes: 'Condo', 'Multi_Family', 'Residential'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 589, 589, 589, 590, 590, 590, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.9267366 0.00000000
## 3.162278e-05 0.9267366 0.00000000
## 1.000000e-04 0.9267366 0.00000000
## 3.162278e-04 0.9267366 0.00000000
## 1.000000e-03 0.9267366 0.00000000
## 3.162278e-03 0.9267366 0.00000000
## 1.000000e-02 0.9267366 0.00000000
## 3.162278e-02 0.9267366 0.00000000
## 1.000000e-01 0.9267366 0.00000000
## 3.162278e-01 0.9267366 0.00000000
## 1.000000e+00 0.9297669 0.06356589
## 3.162278e+00 0.9297669 0.06356589
## 1.000000e+01 0.9282284 0.06136463
## 3.162278e+01 0.9267133 0.07044190
## 1.000000e+02 0.9267133 0.07044190
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1.
pred <- predict(svm_sac, te)
confusionMatrix(te$type, pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Condo Multi_Family Residential
## Condo 1 0 14
## Multi_Family 0 0 3
## Residential 1 0 258
##
## Overall Statistics
##
## Accuracy : 0.935
## 95% CI : (0.8992, 0.961)
## No Information Rate : 0.9928
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0891
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Condo Class: Multi_Family Class: Residential
## Sensitivity 0.50000 NA 0.93818
## Specificity 0.94909 0.98917 0.50000
## Pos Pred Value 0.06667 NA 0.99614
## Neg Pred Value 0.99618 NA 0.05556
## Prevalence 0.00722 0.00000 0.99278
## Detection Rate 0.00361 0.00000 0.93141
## Detection Prevalence 0.05415 0.01083 0.93502
## Balanced Accuracy 0.72455 NA 0.71909
pp <- preProcess(tr %>% select(-type), method = c("center","scale","pca"), thresh = 0.95)
tr_pca <- bind_cols(predict(pp, tr %>% select(-type)), type = tr$type)
te_pca <- bind_cols(predict(pp, te %>% select(-type)), type = te$type)
svm_sac_pca <- train(
type ~ ., data = tr_pca, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
tuneGrid = gridC
)
svm_sac_pca
## Support Vector Machines with Linear Kernel
##
## 655 samples
## 5 predictor
## 3 classes: 'Condo', 'Multi_Family', 'Residential'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 589, 589, 590, 589, 589, 591, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.9267570 0.00000000
## 3.162278e-05 0.9267570 0.00000000
## 1.000000e-04 0.9267570 0.00000000
## 3.162278e-04 0.9267570 0.00000000
## 1.000000e-03 0.9267570 0.00000000
## 3.162278e-03 0.9267570 0.00000000
## 1.000000e-02 0.9267570 0.00000000
## 3.162278e-02 0.9267570 0.00000000
## 1.000000e-01 0.9267570 0.00000000
## 3.162278e-01 0.9267570 0.00000000
## 1.000000e+00 0.9282721 0.04651163
## 3.162278e+00 0.9267337 0.04414942
## 1.000000e+01 0.9282488 0.07593237
## 3.162278e+01 0.9297640 0.10771531
## 1.000000e+02 0.9297640 0.10771531
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 31.62278.
pred_pca <- predict(svm_sac_pca, te_pca)
confusionMatrix(te_pca$type, pred_pca)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Condo Multi_Family Residential
## Condo 4 0 11
## Multi_Family 0 0 3
## Residential 2 0 257
##
## Overall Statistics
##
## Accuracy : 0.9422
## 95% CI : (0.9079, 0.9666)
## No Information Rate : 0.9783
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.3129
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Condo Class: Multi_Family Class: Residential
## Sensitivity 0.66667 NA 0.9483
## Specificity 0.95941 0.98917 0.6667
## Pos Pred Value 0.26667 NA 0.9923
## Neg Pred Value 0.99237 NA 0.2222
## Prevalence 0.02166 0.00000 0.9783
## Detection Rate 0.01444 0.00000 0.9278
## Detection Prevalence 0.05415 0.01083 0.9350
## Balanced Accuracy 0.81304 NA 0.8075
tbl <- table(sac$type)
major <- names(tbl)[which.max(tbl)]
minor <- setdiff(names(tbl), major)
sac_major <- sac %>% filter(type == major)
sac_minors <- sac %>% filter(type %in% minor)
target_n <- nrow(sac_minors)
sac_major_s <- sac_major %>% slice_sample(n = target_n)
sac_bal <- bind_rows(sac_minors, sac_major_s)
sac_bal %>%
select(where(is.numeric)) %>%
pivot_longer(everything(), names_to = "var", values_to = "value") %>%
ggplot(aes(value)) + geom_histogram(bins = 30) +
facet_wrap(~ var, scales = "free") +
labs(title = "Balanced Sample — Numeric Distributions")

idx <- createDataPartition(sac_bal$type, p = 0.7, list = FALSE)
trb <- sac_bal[idx, ]
teb <- sac_bal[-idx, ]
svm_bal <- train(
type ~ ., data = trb, method = "svmLinear",
trControl = trainControl(method = "cv", number = 10),
preProcess = c("center","scale"),
tuneGrid = gridC
)
svm_bal
## Support Vector Machines with Linear Kernel
##
## 95 samples
## 6 predictor
## 3 classes: 'Condo', 'Multi_Family', 'Residential'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 85, 85, 85, 87, 85, 86, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 1.000000e-05 0.4944444 0.0000000
## 3.162278e-05 0.4944444 0.0000000
## 1.000000e-04 0.4944444 0.0000000
## 3.162278e-04 0.4944444 0.0000000
## 1.000000e-03 0.4944444 0.0000000
## 3.162278e-03 0.4944444 0.0000000
## 1.000000e-02 0.7050000 0.4592201
## 3.162278e-02 0.7061111 0.4662880
## 1.000000e-01 0.7386111 0.5217754
## 3.162278e-01 0.7411111 0.5409873
## 1.000000e+00 0.7397222 0.5381498
## 3.162278e+00 0.7708333 0.5916639
## 1.000000e+01 0.7483333 0.5501365
## 3.162278e+01 0.7372222 0.5284916
## 1.000000e+02 0.7372222 0.5284916
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 3.162278.
pred_bal <- predict(svm_bal, teb)
confusionMatrix(teb$type, pred_bal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Condo Multi_Family Residential
## Condo 15 0 0
## Multi_Family 0 3 0
## Residential 6 0 13
##
## Overall Statistics
##
## Accuracy : 0.8378
## 95% CI : (0.6799, 0.9381)
## No Information Rate : 0.5676
## P-Value [Acc > NIR] : 0.0004694
##
## Kappa : 0.7218
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Condo Class: Multi_Family Class: Residential
## Sensitivity 0.7143 1.00000 1.0000
## Specificity 1.0000 1.00000 0.7500
## Pos Pred Value 1.0000 1.00000 0.6842
## Neg Pred Value 0.7273 1.00000 1.0000
## Prevalence 0.5676 0.08108 0.3514
## Detection Rate 0.4054 0.08108 0.3514
## Detection Prevalence 0.4054 0.08108 0.5135
## Balanced Accuracy 0.8571 1.00000 0.8750
mycars <- mtcars
mycars$folds <- 0
flds <- createFolds(1:nrow(mycars), k = 5, list = TRUE)
for (i in 1:5) { mycars$folds[flds[[i]]] <- i }
mycars %>%
mutate(folds = factor(folds)) %>%
ggplot(aes(x = folds, fill = factor(gear))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Distribution of 'gear' across 5 folds", x = "Fold", y = "Percent", fill = "gear")
