knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 8, fig.height = 5)
library(tidyverse)
library(caret)
library(e1071)
theme_set(theme_minimal())
set.seed(123)

candidates <- c(
  params$bank_path,
  "data/BankData.csv",
  "BankData.csv",
  file.path(Sys.getenv("HOME"), "Downloads", "BankData.csv")
)
existing <- candidates[file.exists(candidates)]
if (length(existing) == 0) {
  stop("BankData.csv not found. Set params$bank_path or place the file next to this .Rmd.\nTried:\n  - ",
       paste(candidates, collapse = "\n  - "))
}
BankData <- read.csv(existing[1], stringsAsFactors = FALSE)
stopifnot(is.data.frame(BankData), nrow(BankData) > 0)

bank <- BankData %>%
  mutate(
    # map t/true-style flags to 1; anything else (including NA) becomes 0
    across(starts_with("bool"), ~ ifelse(. %in% c("t","T","true","TRUE",1), 1L, 0L)),
    approval = factor(approval)
  )
numeric_cols <- bank %>% select(where(is.numeric)) %>% names()
cat_cols     <- c("approval", names(select(BankData, starts_with("bool"))))
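As a quick sanity check (assuming the raw flags were "t"/"f"-style strings), every recoded boolean column should now be strictly 0/1:

bank %>%
  summarise(across(starts_with("bool"), ~ all(. %in% c(0L, 1L))))  # expect all TRUE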

bank %>%
  pivot_longer(all_of(numeric_cols), names_to = "var", values_to = "value") %>%
  ggplot(aes(value)) + geom_histogram(bins = 30) +
  facet_wrap(~ var, scales = "free") + labs(title = "Numeric — Histograms")
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_bin()`).

bank %>%
  pivot_longer(all_of(numeric_cols), names_to = "var", values_to = "value") %>%
  ggplot(aes(value)) + geom_density() +
  facet_wrap(~ var, scales = "free") + labs(title = "Numeric — Densities")
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_density()`).

BankData %>%
  pivot_longer(all_of(cat_cols), names_to = "var", values_to = "value") %>%
  mutate(value = factor(value)) %>%
  ggplot(aes(value)) + geom_bar() +
  facet_wrap(~ var, scales = "free_y") +
  labs(title = "Categorical/Booleans — Bars", x = NULL, y = "Count")

bank <- bank %>%
  mutate(
    # z-score standardization
    cont1_z = as.numeric(scale(cont1)),
    # min-max normalization to [0, 1]
    credit.score_mm = (credit.score - min(credit.score, na.rm = TRUE)) /
                      (max(credit.score, na.rm = TRUE) - min(credit.score, na.rm = TRUE)),
    # decimal scaling: divide by the smallest power of 10 that caps |x| at 1
    cont6_dec = cont6 / (10 ^ ceiling(log10(max(abs(cont6), na.rm = TRUE))))
  )
summary(select(bank, cont1, cont1_z, credit.score, credit.score_mm, cont6, cont6_dec))
##      cont1          cont1_z         credit.score   credit.score_mm 
##  Min.   :13.75   Min.   :-1.4901   Min.   :583.7   Min.   :0.0000  
##  1st Qu.:22.60   1st Qu.:-0.7498   1st Qu.:666.7   1st Qu.:0.3737  
##  Median :28.46   Median :-0.2599   Median :697.3   Median :0.5110  
##  Mean   :31.57   Mean   : 0.0000   Mean   :696.4   Mean   :0.5070  
##  3rd Qu.:38.23   3rd Qu.: 0.5571   3rd Qu.:726.4   3rd Qu.:0.6418  
##  Max.   :80.25   Max.   : 4.0711   Max.   :806.0   Max.   :1.0000  
##  NA's   :12      NA's   :12                                        
##      cont6            cont6_dec       
##  Min.   :     0.0   Min.   :0.000000  
##  1st Qu.:     0.0   1st Qu.:0.000000  
##  Median :     5.0   Median :0.000050  
##  Mean   :  1017.4   Mean   :0.010174  
##  3rd Qu.:   395.5   3rd Qu.:0.003955  
##  Max.   :100000.0   Max.   :1.000000  
## 
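The same min-max rescaling can be applied to all numeric columns at once; a minimal sketch with caret's preProcess, whose "range" method rescales each column to [0, 1] (pp_range and bank_range are illustrative names):

pp_range <- preProcess(bank[numeric_cols], method = "range")  # learn per-column ranges
bank_range <- predict(pp_range, bank[numeric_cols])           # apply them
summary(bank_range$credit.score)  # should agree with credit.score_mm above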
bank %>%
  select(cont1, cont1_z, credit.score, credit.score_mm, cont6, cont6_dec) %>%
  pivot_longer(everything(), names_to = "var", values_to = "value") %>%
  ggplot(aes(value)) + geom_histogram(bins = 30) +
  facet_wrap(~ var, scales = "free") + labs(title = "Original vs Normalized")
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

# equal-depth (equal-frequency) binning: ntile() puts ~equal counts in each bin
bank <- bank %>%
  mutate(credit_bin = factor(ntile(credit.score, 3), labels = c("Low","Mid","High")))
table(bank$credit_bin)
## 
##  Low  Mid High 
##  230  230  230
ggplot(bank, aes(credit.score, fill = credit_bin)) +
  geom_histogram(bins = 30, alpha = 0.85) +
  labs(title = "Credit Score Bins (Equal Depth)")

# bin smoothing: replace each score with the median of its bin
bin_stats <- bank %>% group_by(credit_bin) %>%
  summarise(credit_smooth_val = median(credit.score, na.rm = TRUE), .groups = "drop")

bank <- bank %>% left_join(bin_stats, by = "credit_bin") %>%
  rename(credit.score_smoothed = credit_smooth_val)

bank %>%
  select(credit.score, credit.score_smoothed, credit_bin) %>%
  pivot_longer(-credit_bin, names_to = "kind", values_to = "value") %>%
  ggplot(aes(value, fill = kind)) + geom_density(alpha = 0.6) +
  facet_wrap(~ credit_bin, scales = "free_x") +
  labs(title = "Smoothing by Bin Medians")

# rebuild from the raw data with the same boolean recode, then drop
# incomplete rows so the SVM sees complete cases only
bd <- BankData %>%
  mutate(
    across(starts_with("bool"), ~ ifelse(. %in% c("t","T","true","TRUE",1), 1L, 0L)),
    approval = factor(approval)
  ) %>% drop_na()

ctrl10 <- trainControl(method = "cv", number = 10)
svm_cv <- train(
  approval ~ ., data = bd, method = "svmLinear",
  trControl = ctrl10, preProcess = c("center","scale")
)
svm_cv
## Support Vector Machines with Linear Kernel 
## 
## 666 samples
##  12 predictor
##   2 classes: '-', '+' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 599, 599, 599, 600, 599, 600, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8633424  0.7286267
## 
## Tuning parameter 'C' was held constant at a value of 1
gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_grid <- train(
  approval ~ ., data = bd, method = "svmLinear",
  trControl = ctrl10, preProcess = c("center","scale"),
  tuneGrid = gridC
)
svm_grid
## Support Vector Machines with Linear Kernel 
## 
## 666 samples
##  12 predictor
##   2 classes: '-', '+' 
## 
## Pre-processing: centered (12), scaled (12) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 601, 600, 599, 599, 599, 599, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa     
##   1.000000e-05  0.5510427  0.00000000
##   3.162278e-05  0.5510427  0.00000000
##   1.000000e-04  0.5510427  0.00000000
##   3.162278e-04  0.5706506  0.04748434
##   1.000000e-03  0.8379623  0.67022353
##   3.162278e-03  0.8650569  0.73168190
##   1.000000e-02  0.8650343  0.73176280
##   3.162278e-02  0.8635191  0.72869969
##   1.000000e-01  0.8635191  0.72869969
##   3.162278e-01  0.8635191  0.72869969
##   1.000000e+00  0.8635191  0.72869969
##   3.162278e+00  0.8635191  0.72869969
##   1.000000e+01  0.8635191  0.72869969
##   3.162278e+01  0.8635191  0.72869969
##   1.000000e+02  0.8635191  0.72869969
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.003162278.
svm_grid$bestTune
max(svm_grid$results$Accuracy)
## [1] 0.8650569
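caret ships a ggplot method for train objects, which makes the accuracy profile over C easy to inspect (the log x scale matches the grid):

ggplot(svm_grid) + scale_x_log10() +
  labs(title = "Linear SVM: CV accuracy across C")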
data(starwars, package = "dplyr")
sw_raw <- starwars %>%
  select(-films, -vehicles, -starships, -name) %>% drop_na()

pred_cat <- sw_raw %>% select(where(is.character)) %>% names()
pred_cat <- setdiff(pred_cat, "gender")  # gender is the outcome, not a predictor
# full-rank dummy coding for the remaining character columns
# (note: sex stays in as a dummy-coded predictor and tracks gender closely)
dv <- dummyVars(~ ., data = sw_raw[, pred_cat, drop = FALSE], fullRank = TRUE)
X_cat <- as.data.frame(predict(dv, sw_raw[, pred_cat, drop = FALSE]))
X_num <- sw_raw %>% select(-all_of(pred_cat), -gender)
sw_ml <- bind_cols(X_num, X_cat, gender = factor(sw_raw$gender))
head(sw_ml)
svm_sw <- train(
  gender ~ ., data = sw_ml, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center","scale")
)
## Warning (repeated for each resampling fold): preProcess reports variables
## with zero variances (e.g. skin_colorred, homeworldDathomir, speciesZabrak,
## homeworldKamino, `homeworldHaruun Kal`), and kernlab warns that it cannot
## scale the constant columns.
## Warning in nominalTrainWorkflow(...): There were missing values in
## resampled performance measures.
svm_sw
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
## 60 predictors
##  2 classes: 'feminine', 'masculine' 
## 
## Pre-processing: centered (60), scaled (60) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26, 25, 26, 26, 27, 27, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.9416667  0.7857143
## 
## Tuning parameter 'C' was held constant at a value of 1
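The warnings above come from dummy columns that are constant within some folds (29 rows vs. 60 predictors). A sketch that adds caret's near-zero-variance filter so those columns are dropped before centering and scaling; accuracy may shift slightly (svm_sw_nzv is an illustrative name):

svm_sw_nzv <- train(
  gender ~ ., data = sw_ml, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("nzv", "center", "scale")  # "nzv" drops (near-)constant columns first
)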
X <- sw_ml %>% select(-gender)
pca <- prcomp(X, scale. = TRUE)
cumvar <- cumsum(pca$sdev^2 / sum(pca$sdev^2))
plot(cumvar, type = "b", ylab = "Cumulative explained variance", xlab = "PCs",
     main = "Choose PCs (~95%)")
abline(h = 0.95, lty = 2)

pp <- preProcess(X, method = c("center","scale","pca"), thresh = 0.95)
pp$numComp
## [1] 21
X_pca <- predict(pp, X)
sw_pca <- bind_cols(X_pca, gender = sw_ml$gender)
head(sw_pca)
idx <- createDataPartition(sw_pca$gender, p = 0.7, list = FALSE)
sw_train <- sw_pca[idx, ]
sw_test  <- sw_pca[-idx, ]

gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_pca_cvgrid <- train(
  gender ~ ., data = sw_train, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = gridC
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
svm_pca_cvgrid$bestTune
pred_test <- predict(svm_pca_cvgrid, sw_test)
# confusionMatrix() expects predictions first (data) and the truth second (reference)
confusionMatrix(data = pred_test, reference = sw_test$gender)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  feminine masculine
##   feminine         1         0
##   masculine        0         6
##                                      
##                Accuracy : 1          
##                  95% CI : (0.5904, 1)
##     No Information Rate : 0.8571     
##     P-Value [Acc > NIR] : 0.3399     
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1429     
##          Detection Rate : 0.1429     
##    Detection Prevalence : 0.1429     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : feminine   
## 
svm_pca_cv <- train(
  gender ~ ., data = sw_pca, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = gridC
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
svm_pca_cv
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
## 21 predictors
##  2 classes: 'feminine', 'masculine' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 26, 26, 26, 25, 27, 26, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa    
##   1.000000e-05  0.8166667  0.0000000
##   3.162278e-05  0.8166667  0.0000000
##   1.000000e-04  0.8166667  0.0000000
##   3.162278e-04  0.8166667  0.0000000
##   1.000000e-03  0.8166667  0.0000000
##   3.162278e-03  0.8166667  0.0000000
##   1.000000e-02  0.8166667  0.0000000
##   3.162278e-02  0.8166667  0.0000000
##   1.000000e-01  0.9666667  0.8333333
##   3.162278e-01  0.9333333  0.7333333
##   1.000000e+00  0.9333333  0.7333333
##   3.162278e+00  0.9333333  0.7333333
##   1.000000e+01  0.9333333  0.7333333
##   3.162278e+01  0.9333333  0.7333333
##   1.000000e+02  0.9333333  0.7333333
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
svm_pca_loocv <- train(
  gender ~ ., data = sw_pca, method = "svmLinear",
  trControl = trainControl(method = "LOOCV"),
  tuneGrid = gridC
)
svm_pca_loocv
## Support Vector Machines with Linear Kernel 
## 
## 29 samples
## 21 predictors
##  2 classes: 'feminine', 'masculine' 
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 28, 28, 28, 28, 28, 28, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa    
##   1.000000e-05  0.7931034  0.0000000
##   3.162278e-05  0.7931034  0.0000000
##   1.000000e-04  0.7931034  0.0000000
##   3.162278e-04  0.7931034  0.0000000
##   1.000000e-03  0.7931034  0.0000000
##   3.162278e-03  0.7931034  0.0000000
##   1.000000e-02  0.7931034  0.0000000
##   3.162278e-02  0.7931034  0.0000000
##   1.000000e-01  0.9655172  0.8880309
##   3.162278e-01  0.9310345  0.7898551
##   1.000000e+00  0.9310345  0.7898551
##   3.162278e+00  0.9310345  0.7898551
##   1.000000e+01  0.9310345  0.7898551
##   3.162278e+01  0.9310345  0.7898551
##   1.000000e+02  0.9310345  0.7898551
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.1.
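A compact side-by-side of the two resampling estimates at their selected C (both schemes pick C = 0.1 here):

tibble(
  scheme   = c("10-fold CV", "LOOCV"),
  accuracy = c(max(svm_pca_cv$results$Accuracy),
               max(svm_pca_loocv$results$Accuracy))
)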
data(Sacramento, package = "caret")
sac <- Sacramento %>%
  select(-zip, -city) %>%
  drop_na()
table(sac$type)
prop.table(table(sac$type))
## 
##        Condo Multi_Family  Residential 
##           53           13          866
## 
##        Condo Multi_Family  Residential 
##   0.05686695   0.01394850   0.92918455
sac %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "value") %>%
  ggplot(aes(value)) + geom_histogram(bins = 30) +
  facet_wrap(~ var, scales = "free") +
  labs(title = "Sacramento — Numeric Distributions")

sac1 <- sac %>%
  mutate(
    price_log = log(price),
    sqft_log  = log(sqft)
  ) %>%
  select(-price, -sqft)
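Since e1071 is already loaded, its skewness() gives a quick before/after check on the log transform (values near 0 indicate symmetry):

c(price     = e1071::skewness(sac$price),
  price_log = e1071::skewness(sac1$price_log),
  sqft      = e1071::skewness(sac$sqft),
  sqft_log  = e1071::skewness(sac1$sqft_log))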

idx <- createDataPartition(sac1$type, p = 0.7, list = FALSE)
tr <- sac1[idx, ]
te <- sac1[-idx, ]

gridC <- expand.grid(C = 10 ^ seq(-5, 2, by = 0.5))
svm_sac <- train(
  type ~ ., data = tr, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center","scale"),
  tuneGrid = gridC
)
svm_sac
## Support Vector Machines with Linear Kernel 
## 
## 655 samples
##   6 predictor
##   3 classes: 'Condo', 'Multi_Family', 'Residential' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 589, 589, 589, 590, 590, 590, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa     
##   1.000000e-05  0.9267366  0.00000000
##   3.162278e-05  0.9267366  0.00000000
##   1.000000e-04  0.9267366  0.00000000
##   3.162278e-04  0.9267366  0.00000000
##   1.000000e-03  0.9267366  0.00000000
##   3.162278e-03  0.9267366  0.00000000
##   1.000000e-02  0.9267366  0.00000000
##   3.162278e-02  0.9267366  0.00000000
##   1.000000e-01  0.9267366  0.00000000
##   3.162278e-01  0.9267366  0.00000000
##   1.000000e+00  0.9297669  0.06356589
##   3.162278e+00  0.9297669  0.06356589
##   1.000000e+01  0.9282284  0.06136463
##   3.162278e+01  0.9267133  0.07044190
##   1.000000e+02  0.9267133  0.07044190
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 1.
pred <- predict(svm_sac, te)
confusionMatrix(data = pred, reference = te$type)  # predictions first, truth second
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Condo Multi_Family Residential
##   Condo            1            0          14
##   Multi_Family     0            0           3
##   Residential      1            0         258
## 
## Overall Statistics
##                                          
##                Accuracy : 0.935          
##                  95% CI : (0.8992, 0.961)
##     No Information Rate : 0.9928         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0891         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Condo Class: Multi_Family Class: Residential
## Sensitivity               0.50000                  NA            0.93818
## Specificity               0.94909             0.98917            0.50000
## Pos Pred Value            0.06667                  NA            0.99614
## Neg Pred Value            0.99618                  NA            0.05556
## Prevalence                0.00722             0.00000            0.99278
## Detection Rate            0.00361             0.00000            0.93141
## Detection Prevalence      0.05415             0.01083            0.93502
## Balanced Accuracy         0.72455                  NA            0.71909
# fit center/scale/PCA on the training split only, then apply to both splits
pp <- preProcess(tr %>% select(-type), method = c("center","scale","pca"), thresh = 0.95)
tr_pca <- bind_cols(predict(pp, tr %>% select(-type)), type = tr$type)
te_pca <- bind_cols(predict(pp, te %>% select(-type)), type = te$type)

svm_sac_pca <- train(
  type ~ ., data = tr_pca, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = gridC
)
svm_sac_pca
## Support Vector Machines with Linear Kernel 
## 
## 655 samples
##   5 predictor
##   3 classes: 'Condo', 'Multi_Family', 'Residential' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 589, 589, 590, 589, 589, 591, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa     
##   1.000000e-05  0.9267570  0.00000000
##   3.162278e-05  0.9267570  0.00000000
##   1.000000e-04  0.9267570  0.00000000
##   3.162278e-04  0.9267570  0.00000000
##   1.000000e-03  0.9267570  0.00000000
##   3.162278e-03  0.9267570  0.00000000
##   1.000000e-02  0.9267570  0.00000000
##   3.162278e-02  0.9267570  0.00000000
##   1.000000e-01  0.9267570  0.00000000
##   3.162278e-01  0.9267570  0.00000000
##   1.000000e+00  0.9282721  0.04651163
##   3.162278e+00  0.9267337  0.04414942
##   1.000000e+01  0.9282488  0.07593237
##   3.162278e+01  0.9297640  0.10771531
##   1.000000e+02  0.9297640  0.10771531
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 31.62278.
pred_pca <- predict(svm_sac_pca, te_pca)
confusionMatrix(data = pred_pca, reference = te_pca$type)  # predictions first, truth second
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Condo Multi_Family Residential
##   Condo            4            0          11
##   Multi_Family     0            0           3
##   Residential      2            0         257
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9422          
##                  95% CI : (0.9079, 0.9666)
##     No Information Rate : 0.9783          
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.3129          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Condo Class: Multi_Family Class: Residential
## Sensitivity               0.66667                  NA             0.9483
## Specificity               0.95941             0.98917             0.6667
## Pos Pred Value            0.26667                  NA             0.9923
## Neg Pred Value            0.99237                  NA             0.2222
## Prevalence                0.02166             0.00000             0.9783
## Detection Rate            0.01444             0.00000             0.9278
## Detection Prevalence      0.05415             0.01083             0.9350
## Balanced Accuracy         0.81304                  NA             0.8075
tbl <- table(sac$type)
major <- names(tbl)[which.max(tbl)]
minor <- setdiff(names(tbl), major)

sac_major  <- sac %>% filter(type == major)
sac_minors <- sac %>% filter(type %in% minor)

# downsample the majority class to the combined size of the two minority classes
target_n <- nrow(sac_minors)
sac_major_s <- sac_major %>% slice_sample(n = target_n)

sac_bal <- bind_rows(sac_minors, sac_major_s)
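caret's downSample() is a one-call alternative; note it equalizes every class to the size of the smallest class (13 each here), which is stricter than the manual scheme above (sac_ds is an illustrative name):

sac_ds <- downSample(x = sac %>% select(-type), y = sac$type, yname = "type")
table(sac_ds$type)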

sac_bal %>%
  select(where(is.numeric)) %>%
  pivot_longer(everything(), names_to = "var", values_to = "value") %>%
  ggplot(aes(value)) + geom_histogram(bins = 30) +
  facet_wrap(~ var, scales = "free") +
  labs(title = "Balanced Sample — Numeric Distributions")

idx <- createDataPartition(sac_bal$type, p = 0.7, list = FALSE)
trb <- sac_bal[idx, ]
teb <- sac_bal[-idx, ]

svm_bal <- train(
  type ~ ., data = trb, method = "svmLinear",
  trControl = trainControl(method = "cv", number = 10),
  preProcess = c("center","scale"),
  tuneGrid = gridC
)
svm_bal
## Support Vector Machines with Linear Kernel 
## 
## 95 samples
##  6 predictor
##  3 classes: 'Condo', 'Multi_Family', 'Residential' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 85, 85, 85, 87, 85, 86, ... 
## Resampling results across tuning parameters:
## 
##   C             Accuracy   Kappa    
##   1.000000e-05  0.4944444  0.0000000
##   3.162278e-05  0.4944444  0.0000000
##   1.000000e-04  0.4944444  0.0000000
##   3.162278e-04  0.4944444  0.0000000
##   1.000000e-03  0.4944444  0.0000000
##   3.162278e-03  0.4944444  0.0000000
##   1.000000e-02  0.7050000  0.4592201
##   3.162278e-02  0.7061111  0.4662880
##   1.000000e-01  0.7386111  0.5217754
##   3.162278e-01  0.7411111  0.5409873
##   1.000000e+00  0.7397222  0.5381498
##   3.162278e+00  0.7708333  0.5916639
##   1.000000e+01  0.7483333  0.5501365
##   3.162278e+01  0.7372222  0.5284916
##   1.000000e+02  0.7372222  0.5284916
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 3.162278.
pred_bal <- predict(svm_bal, teb)
confusionMatrix(data = pred_bal, reference = teb$type)  # predictions first, truth second
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Condo Multi_Family Residential
##   Condo           15            0           0
##   Multi_Family     0            3           0
##   Residential      6            0          13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8378          
##                  95% CI : (0.6799, 0.9381)
##     No Information Rate : 0.5676          
##     P-Value [Acc > NIR] : 0.0004694       
##                                           
##                   Kappa : 0.7218          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Condo Class: Multi_Family Class: Residential
## Sensitivity                0.7143             1.00000             1.0000
## Specificity                1.0000             1.00000             0.7500
## Pos Pred Value             1.0000             1.00000             0.6842
## Neg Pred Value             0.7273             1.00000             1.0000
## Prevalence                 0.5676             0.08108             0.3514
## Detection Rate             0.4054             0.08108             0.3514
## Detection Prevalence       0.4054             0.08108             0.5135
## Balanced Accuracy          0.8571             1.00000             0.8750
mycars <- mtcars
mycars$folds <- 0
# assign each row to one of 5 folds; createFolds() over the bare row index
# is effectively unstratified with respect to gear
flds <- createFolds(1:nrow(mycars), k = 5, list = TRUE)
for (i in 1:5) mycars$folds[flds[[i]]] <- i
mycars %>%
  mutate(folds = factor(folds)) %>%
  ggplot(aes(x = folds, fill = factor(gear))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Distribution of 'gear' across 5 folds", x = "Fold", y = "Percent", fill = "gear")