Khan Gene Data

SVM transforms our data using a technique known as the kernel trick, and then finds an optimal boundary between the possible outputs based on these transformations.
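
For instance, swapping the kernel changes the implicit feature space while the rest of the modelling code stays the same. A radial basis function kernel could be specified in parsnip like this (a minimal sketch, not used in this lab; the object name and the rbf_sigma value are illustrative, and library(tidymodels) is assumed to be loaded as done below):

svm_rbf_spec <- svm_rbf(rbf_sigma = 0.1) %>%  # nonlinear (radial basis) kernel
  set_engine("kernlab") %>%
  set_mode("classification")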

In this lab, we will explore how to use SVM models. We will start by using the Khan data set from the ISLR package.

library(ISLR)
library(tidymodels)
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom        0.7.6      ✓ recipes      0.1.16
## ✓ dials        0.0.9      ✓ rsample      0.0.9 
## ✓ dplyr        1.0.6      ✓ tibble       3.1.2 
## ✓ ggplot2      3.3.3      ✓ tidyr        1.1.3 
## ✓ infer        0.5.4      ✓ tune         0.1.5 
## ✓ modeldata    0.1.0      ✓ workflows    0.2.2 
## ✓ parsnip      0.1.6      ✓ workflowsets 0.0.2 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
Khan_train <- data.frame(x = Khan$xtrain, y = as.factor(Khan$ytrain))
Khan_test <- data.frame(x = Khan$xtest, y = as.factor(Khan$ytest))

There are 63 observations and 2309 variables in Khan_train.

dim(Khan_train)
## [1]   63 2309
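
The outcome y contains the four tumor classes. To see how they are distributed across the 63 training observations, a quick count could be added (a small check, output not shown):

Khan_train %>% count(y)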

We will fit a linear SVM on the training data set and then predict the test data set. What do we expect the performance to be on the training and testing data sets?

svm_linear_spec <- svm_linear() %>%
  set_engine("kernlab") %>%
  set_mode("classification")
Khan_fit <- fit(svm_linear_spec, y ~., data = Khan_train)
##  Setting default kernel parameters
Khan_fit
## parsnip model object
## 
## Fit time:  2.3s 
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 58 
## 
## Objective Function Value : -0.0015 -0.0014 -0.0013 -0.003 -0.0045 -0.0031 
## Training error : 0 
## Probability model included.

On the training set, the model makes no misclassifications, consistent with the training error of 0 reported above.

augment(Khan_fit, new_data = Khan_train) %>%
  conf_mat(truth = y, estimate = .pred_class) %>%
  autoplot(type = "heatmap")

On the test set, a few observations of class 3 are misclassified, but overall the SVM predictions are good.

augment(Khan_fit, new_data = Khan_test) %>%
  conf_mat(truth = y, estimate = .pred_class) %>%
  autoplot(type = "heatmap")
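
For a single-number summary of the test-set performance, the accuracy could also be computed directly (a small addition, output not shown):

augment(Khan_fit, new_data = Khan_test) %>%
  accuracy(truth = y, estimate = .pred_class)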

Sales of Child Car Seats Data

Next, we will switch to the Carseats data set and try to predict whether a store is located in the US or not. We will tune some of the arguments to get a good fit.

There are no missing values in any of the variables.

# Carseats
# No missing data
purrr::map_dbl(Carseats, ~sum(is.na(.x)))
##       Sales   CompPrice      Income Advertising  Population       Price 
##           0           0           0           0           0           0 
##   ShelveLoc         Age   Education       Urban          US 
##           0           0           0           0           0
set.seed(1234)
Carseats_split <- initial_split(Carseats)
Carseats_train <- training(Carseats_split)
Carseats_test <- testing(Carseats_split)
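
Before modelling, it can be worth checking how balanced the outcome is within the training split; a quick count would do (a small check, not part of the original output):

Carseats_train %>% count(US)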

Recipe

rec_spec <- recipe(US ~., data = Carseats_train) %>%
  step_novel(all_nominal(), -all_outcomes()) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_zv(all_predictors()) %>% #remove zero variance
  step_normalize(all_predictors()) 
rec_spec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor         10
## 
## Operations:
## 
## Novel factor level assignment for all_nominal(), -all_outcomes()
## Dummy variables from all_nominal(), -all_outcomes()
## Zero variance filter on all_predictors()
## Centering and scaling for all_predictors()
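
To inspect what the preprocessed predictors look like, the recipe could be prepped and baked on the training data (a sketch, output not shown):

rec_spec %>%
  prep() %>%
  bake(new_data = Carseats_train) %>%
  glimpse()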

Polynomial Support Vector Machine Specification

svm_poly_spec <- svm_poly(degree = tune()) %>%
  set_mode("classification") %>%
  set_engine("kernlab")
svm_poly_spec
## Polynomial Support Vector Machine Specification (classification)
## 
## Main Arguments:
##   degree = tune()
## 
## Computational engine: kernlab

Tune

We search for the optimal polynomial degree among the values 1, 2, and 3.

param_grid <- tibble(degree = c(1, 2, 3))
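
Equivalently, the grid could be built with dials, assuming the default degree() parameter object (a sketch; it produces the same three values):

grid_regular(degree(range = c(1, 3)), levels = 3)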

K-fold

We randomly split the training data into 5 folds for cross-validation.

Carseats_folds <- vfold_cv(Carseats_train, v = 5)
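
If the outcome were noticeably imbalanced, stratified folds could be used instead (an alternative sketch, not what is done above):

vfold_cv(Carseats_train, v = 5, strata = US)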

Workflow

Carseats_wf <- workflow() %>%
  add_recipe(rec_spec) %>%
  add_model(svm_poly_spec)
Carseats_wf 
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: svm_poly()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_novel()
## • step_dummy()
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Polynomial Support Vector Machine Specification (classification)
## 
## Main Arguments:
##   degree = tune()
## 
## Computational engine: kernlab
tune_res <- tune_grid(
  Carseats_wf, 
  resamples = Carseats_folds,
  grid = param_grid,
  control = control_grid(verbose = TRUE, save_pred = TRUE)
)
autoplot(tune_res)

In the grid search, degree 1 is the best value based on both accuracy and ROC AUC.

collect_metrics(tune_res)
## # A tibble: 6 x 7
##   degree .metric  .estimator  mean     n std_err .config             
##    <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1      1 accuracy binary     0.877     5  0.0187 Preprocessor1_Model1
## 2      1 roc_auc  binary     0.934     5  0.0107 Preprocessor1_Model1
## 3      2 accuracy binary     0.76      5  0.0172 Preprocessor1_Model2
## 4      2 roc_auc  binary     0.850     5  0.0204 Preprocessor1_Model2
## 5      3 accuracy binary     0.767     5  0.0269 Preprocessor1_Model3
## 6      3 roc_auc  binary     0.820     5  0.0168 Preprocessor1_Model3
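
The same information can be summarised with show_best(), which ranks the candidate degrees by a chosen metric (output not shown):

show_best(tune_res, metric = "roc_auc")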

Confusion Matrix

For degree 1 in the first fold, 21 observations are correctly predicted as No and 36 are correctly predicted as Yes.

collect_predictions(tune_res) %>%
  filter(id == "Fold1", degree == 1) %>%
  conf_mat(truth = US, estimate = .pred_class) %>%
  autoplot(type = "heatmap")

Folds 1 and 5 perform well in terms of sensitivity.

collect_predictions(tune_res) %>%
  filter(degree == 1) %>%
  group_by(id) %>%
  roc_curve(truth = US, .pred_No) %>%
  autoplot()
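
To put numbers on the per-fold performance, the area under each fold's ROC curve could be computed as well (a sketch, output not shown):

collect_predictions(tune_res) %>%
  filter(degree == 1) %>%
  group_by(id) %>%
  roc_auc(truth = US, .pred_No)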

Fit the Model

We select the degree of 1 and fit the SVM model on the full training set.

best_degree <- select_best(tune_res, "roc_auc")
best_degree 
## # A tibble: 1 x 2
##   degree .config             
##    <dbl> <chr>               
## 1      1 Preprocessor1_Model1
final_Carseats_wf <- finalize_workflow(Carseats_wf, best_degree)
final_fit <- fit(final_Carseats_wf, Carseats_train)
final_fit 
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: svm_poly()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
## 
## • step_novel()
## • step_dummy()
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Polynomial kernel function. 
##  Hyperparameters : degree =  1  scale =  1  offset =  1 
## 
## Number of Support Vectors : 99 
## 
## Objective Function Value : -85.1425 
## Training error : 0.126667 
## Probability model included.
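
As an aside, the final fit and the test-set evaluation could also be done in one step with last_fit(), which fits the finalized workflow on the training portion of the split and evaluates on the test portion (an alternative sketch, not what is done below):

final_res <- last_fit(final_Carseats_wf, Carseats_split)
collect_metrics(final_res)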

On the training set, the accuracy is 0.87, which is not too shabby.

augment(final_fit, new_data = Carseats_train) %>%
  accuracy(truth = US, estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.873

On the testing set, the model performs even better, with an accuracy of 93 percent.

augment(final_fit, new_data = Carseats_test) %>%
  accuracy(truth = US, estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary          0.93
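
Beyond accuracy, the test-set confusion matrix gives a fuller picture of where the remaining errors fall (output not shown):

augment(final_fit, new_data = Carseats_test) %>%
  conf_mat(truth = US, estimate = .pred_class) %>%
  autoplot(type = "heatmap")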
