Using SVM techniques to classify whether a Pokemon is legendary.
# Install required packages in one-swoop
# install.packages(c("dplyr", "ggplot2", "tidyr", "reshape2", "caret", "skimr", "psych", "e1071", "data.table", "Matrix", "keras"))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(e1071) # the R equivalent to libsvm (Library of SVM)
## Warning: package 'e1071' was built under R version 4.4.3
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(Matrix)
## Warning: package 'Matrix' was built under R version 4.4.3
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(keras)
## Warning: package 'keras' was built under R version 4.4.3
# Download data from our dropbox shared folder and preliminary check
pokemon <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
dim(pokemon)
## [1] 801 41
names(pokemon)
## [1] "abilities" "against_bug" "against_dark"
## [4] "against_dragon" "against_electric" "against_fairy"
## [7] "against_fight" "against_fire" "against_flying"
## [10] "against_ghost" "against_grass" "against_ground"
## [13] "against_ice" "against_normal" "against_poison"
## [16] "against_psychic" "against_rock" "against_steel"
## [19] "against_water" "attack" "base_egg_steps"
## [22] "base_happiness" "base_total" "capture_rate"
## [25] "classfication" "defense" "experience_growth"
## [28] "height_m" "hp" "japanese_name"
## [31] "name" "percentage_male" "pokedex_number"
## [34] "sp_attack" "sp_defense" "speed"
## [37] "type1" "type2" "weight_kg"
## [40] "generation" "is_legendary"
str(pokemon)
## 'data.frame': 801 obs. of 41 variables:
## $ abilities : chr "['Overgrow', 'Chlorophyll']" "['Overgrow', 'Chlorophyll']" "['Overgrow', 'Chlorophyll']" "['Blaze', 'Solar Power']" ...
## $ against_bug : num 1 1 1 0.5 0.5 0.25 1 1 1 1 ...
## $ against_dark : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_dragon : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_electric : num 0.5 0.5 0.5 1 1 2 2 2 2 1 ...
## $ against_fairy : num 0.5 0.5 0.5 0.5 0.5 0.5 1 1 1 1 ...
## $ against_fight : num 0.5 0.5 0.5 1 1 0.5 1 1 1 0.5 ...
## $ against_fire : num 2 2 2 0.5 0.5 0.5 0.5 0.5 0.5 2 ...
## $ against_flying : num 2 2 2 1 1 1 1 1 1 2 ...
## $ against_ghost : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_grass : num 0.25 0.25 0.25 0.5 0.5 0.25 2 2 2 0.5 ...
## $ against_ground : num 1 1 1 2 2 0 1 1 1 0.5 ...
## $ against_ice : num 2 2 2 0.5 0.5 1 0.5 0.5 0.5 1 ...
## $ against_normal : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_poison : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_psychic : num 2 2 2 1 1 1 1 1 1 1 ...
## $ against_rock : num 1 1 1 2 2 4 1 1 1 2 ...
## $ against_steel : num 1 1 1 0.5 0.5 0.5 0.5 0.5 0.5 1 ...
## $ against_water : num 0.5 0.5 0.5 2 2 2 0.5 0.5 0.5 1 ...
## $ attack : int 49 62 100 52 64 104 48 63 103 30 ...
## $ base_egg_steps : int 5120 5120 5120 5120 5120 5120 5120 5120 5120 3840 ...
## $ base_happiness : int 70 70 70 70 70 70 70 70 70 70 ...
## $ base_total : int 318 405 625 309 405 634 314 405 630 195 ...
## $ capture_rate : chr "45" "45" "45" "45" ...
## $ classfication : chr "Seed Pokémon" "Seed Pokémon" "Seed Pokémon" "Lizard Pokémon" ...
## $ defense : int 49 63 123 43 58 78 65 80 120 35 ...
## $ experience_growth: int 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1000000 ...
## $ height_m : num 0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.3 ...
## $ hp : int 45 60 80 39 58 78 44 59 79 45 ...
## $ japanese_name : chr "Fushigidaneフシギダネ" "Fushigisouフシギソウ" "Fushigibanaフシギバナ" "Hitokageヒトカゲ" ...
## $ name : chr "Bulbasaur" "Ivysaur" "Venusaur" "Charmander" ...
## $ percentage_male : num 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 50 ...
## $ pokedex_number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sp_attack : int 65 80 122 60 80 159 50 65 135 20 ...
## $ sp_defense : int 65 80 120 50 65 115 64 80 115 20 ...
## $ speed : int 45 60 80 65 80 100 43 58 78 45 ...
## $ type1 : chr "grass" "grass" "grass" "fire" ...
## $ type2 : chr "poison" "poison" "poison" "" ...
## $ weight_kg : num 6.9 13 100 8.5 19 90.5 9 22.5 85.5 2.9 ...
## $ generation : int 1 1 1 1 1 1 1 1 1 1 ...
## $ is_legendary : int 0 0 0 0 0 0 0 0 0 0 ...
# Data preparation
pokemon <- as_tibble(pokemon) # Convert the data to tibble format for ease of data wrangling
colnames(pokemon)[25] <- "classification" # Rename this column because the original author has a typo :p
# Our first task is to identify legendary pokemons
classify_legendary = select(pokemon, is_legendary, hp, weight_kg, height_m, speed,
attack, defense, sp_attack, sp_defense, generation, capture_rate,
experience_growth, percentage_male, base_happiness, base_egg_steps) # Select relevant "features" from the data (dataframe is entered as the first argument)
# Convert three of the feature columns to appropriate class(es)
classify_legendary$is_legendary <- as.factor(classify_legendary$is_legendary)
classify_legendary$generation <- as.factor(classify_legendary$generation)
classify_legendary$capture_rate <- as.numeric(classify_legendary$capture_rate)
## Warning: NAs introduced by coercion
# You may also use lapply() to expedite this step
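# For example, a minimal sketch of that lapply() shortcut (same columns and classes as the conversions just applied):
# classify_legendary[c("is_legendary", "generation")] <- lapply(classify_legendary[c("is_legendary", "generation")], as.factor)
# classify_legendary["capture_rate"] <- lapply(classify_legendary["capture_rate"], as.numeric)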
head(classify_legendary)
## # A tibble: 6 × 15
## is_legendary hp weight_kg height_m speed attack defense sp_attack
## <fct> <int> <dbl> <dbl> <int> <int> <int> <int>
## 1 0 45 6.9 0.7 45 49 49 65
## 2 0 60 13 1 60 62 63 80
## 3 0 80 100 2 80 100 123 122
## 4 0 39 8.5 0.6 65 52 43 60
## 5 0 58 19 1.1 80 64 58 80
## 6 0 78 90.5 1.7 100 104 78 159
## # ℹ 7 more variables: sp_defense <int>, generation <fct>, capture_rate <dbl>,
## # experience_growth <int>, percentage_male <dbl>, base_happiness <int>,
## # base_egg_steps <int>
# Replace NA cells with 0 because NA values exist for Pokemons that do not have a discernible height and weight :/
classify_legendary$weight_kg[is.na(classify_legendary$weight_kg)] <- 0
classify_legendary$height_m[is.na(classify_legendary$height_m)] <- 0
colSums(is.na(classify_legendary)) # Check whether NA values still exist: is.na() returns a logical matrix and colSums() counts the TRUEs, i.e., the number of NA rows per variable
## is_legendary hp weight_kg height_m
## 0 0 0 0
## speed attack defense sp_attack
## 0 0 0 0
## sp_defense generation capture_rate experience_growth
## 0 0 1 0
## percentage_male base_happiness base_egg_steps
## 98 0 0
# Remove rows that contain NA in any of the variables
# "percentage_male" and "capture_rate" still contain missing values
classify_legendary <- classify_legendary[complete.cases(classify_legendary), ]
# Check again
colSums(is.na(classify_legendary))
## is_legendary hp weight_kg height_m
## 0 0 0 0
## speed attack defense sp_attack
## 0 0 0 0
## sp_defense generation capture_rate experience_growth
## 0 0 0 0
## percentage_male base_happiness base_egg_steps
## 0 0 0
## Assign the outcome variable to Y (we will add it back later)
y <- classify_legendary$is_legendary
# One-hot-encode the columns, using dummyVars() function from caret package
# Convert is_legendary to numeric class
classify_legendary$is_legendary <- as.numeric(classify_legendary$is_legendary)
# dummyVars() creates a full list of dummy variables for the formula provided
dummies_model <- dummyVars(is_legendary ~ ., data = classify_legendary)
# Use predict() to apply this encoding to the dataframe you want to convert
data_mat <- predict(dummies_model, newdata = classify_legendary) # One-hot-encode all categorical variables
classify_legendary_ohe <- data.frame(data_mat) # Convert data_mat into dataframe class
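# Optional peek (a sketch): the factor "generation" should now be spread into one indicator column per level
head(classify_legendary_ohe[, grep("^generation", names(classify_legendary_ohe))])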
## Pre-process the data: "range" scales numeric variables to the interval between zero and one
pre_process_normalize <- preProcess(classify_legendary, method = "range")
pre_process_normalize_ohe <- preProcess(classify_legendary_ohe, method = "range")
# Use predict() in the same way to apply the preProcess() transformation to the dataframes we want to convert
classify_legendary <- predict(pre_process_normalize, newdata = classify_legendary)
classify_legendary_ohe <- predict(pre_process_normalize_ohe, newdata = classify_legendary_ohe)
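# Optional sanity check (a sketch): after "range" pre-processing every column of the OHE dataframe should lie between 0 and 1
sapply(classify_legendary_ohe, range)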
## Add the dependent variable back in
classify_legendary$is_legendary <- y
classify_legendary_ohe$is_legendary <- y
## 75-25 train-test split on Y before testing the model
set.seed(123)
train_row_numbers <- createDataPartition(classify_legendary$is_legendary, p=0.75, list=FALSE)
train_classify_legendary <- classify_legendary[train_row_numbers, ]
test_classify_legendary <- classify_legendary[-train_row_numbers, ]
## Do the same for the one-hot-encoded dataframe
train_row_numbers_ohe <- createDataPartition(classify_legendary_ohe$is_legendary, p=0.75, list=FALSE)
train_classify_legendary_ohe <- classify_legendary_ohe[train_row_numbers_ohe, ]
test_classify_legendary_ohe <- classify_legendary_ohe[-train_row_numbers_ohe, ]
## SVM
# A quick introduction to some useful default settings inside svm()
svm.fit <- svm(is_legendary~.,
scale = TRUE, # the default; x and y variables are scaled to zero mean and unit variance
kernel = "polynomial", # other options are available
degree = 3, # used when kernel = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # the default; you can also supply your own value
cost = 1, # the cost parameter (C), i.e., the penalty for constraint violation
probability = FALSE, # whether the model should allow probability predictions
na.action = na.omit,
data = train_classify_legendary)
# If the above settings are not needed, you can just use the following
svm.fit <- svm(is_legendary~.,
data = train_classify_legendary)
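# To see what was fit (kernel, cost, and the number of support vectors), inspect the model object, e.g.:
summary(svm.fit)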
# You may also use train() function from the caret package
svm_caret.fit = train(is_legendary ~ .,
data = train_classify_legendary,
method = 'svmRadial') # Note the option used in the method argument: svm w/ radial basis function
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Apply the same formula on the OHE-processed data
svm_ohe.fit <- svm(is_legendary~.,
data = train_classify_legendary_ohe)
## Plotting
colors <- c("slategray1", "paleturquoise4") # Map to outcome (0, 1)
plot(svm.fit, train_classify_legendary, col = colors, attack ~ height_m)
# The formula (attack ~ height_m) specifies the two features to plot against each other for the outcome (is_legendary)
# Note that the training data contain only 6 legendary Pokemon (so only a few points will be colored turquoise); you can verify this by typing
table(train_classify_legendary$is_legendary)
##
## 0 1
## 522 6
## Get predicted values for the training data (note: since the model was trained on these same data, we are predicting observations it has already seen, so accuracy is expected to be very high)
predict_train_svm <- predict(svm.fit, train_classify_legendary)
## Generate confusion matrix
confmat_train_svm <- table(Predicted = predict_train_svm,
Actual = train_classify_legendary$is_legendary)
confmat_train_svm
## Actual
## Predicted 0 1
## 0 522 1
## 1 0 5
## Show train accuracy. Why do we select cells [1, 1] and [2, 2] from the confmat_train_svm matrix?
(confmat_train_svm[1, 1] + confmat_train_svm[2, 2]) / sum(confmat_train_svm) * 100
## [1] 99.81061
## Get predicted values for the testing data
predict_test_svm <- predict(svm.fit, test_classify_legendary)
## Generate confusion matrix table
confmat_test_svm <- table(Predicted = predict_test_svm, Actual = test_classify_legendary$is_legendary)
## Show test accuracy
(confmat_test_svm[1, 1] + confmat_test_svm[2, 2]) / sum(confmat_test_svm) * 100
## [1] 100
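# Because legendary Pokemon are rare, raw accuracy can be misleading (a model that always predicts
# "not legendary" would already score roughly 99% on these data). A hedged sketch using caret's
# confusionMatrix(), which additionally reports sensitivity and specificity for the positive class:
confusionMatrix(predict_test_svm, test_classify_legendary$is_legendary, positive = "1")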
Impute missing values using KNN
# Load the Pokemon data again, as we are going to apply a different kind of preprocessing to it
pokemon <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
pokemon <- as_tibble(pokemon)
colnames(pokemon)[25] <- "classification"
# Select required columns (variables/features)
classify_legendary = select(pokemon, is_legendary, hp, weight_kg, height_m, speed,
attack, defense, sp_attack, sp_defense, generation, capture_rate,
experience_growth, percentage_male, base_happiness, base_egg_steps) # select relevant "features"
# Convert three of the feature columns to appropriate classes
classify_legendary$is_legendary <- as.factor(classify_legendary$is_legendary)
classify_legendary$generation <- as.factor(classify_legendary$generation)
classify_legendary$capture_rate <- as.numeric(classify_legendary$capture_rate)
## Warning: NAs introduced by coercion
# Quickly check out the data
head(classify_legendary)
## # A tibble: 6 × 15
## is_legendary hp weight_kg height_m speed attack defense sp_attack
## <fct> <int> <dbl> <dbl> <int> <int> <int> <int>
## 1 0 45 6.9 0.7 45 49 49 65
## 2 0 60 13 1 60 62 63 80
## 3 0 80 100 2 80 100 123 122
## 4 0 39 8.5 0.6 65 52 43 60
## 5 0 58 19 1.1 80 64 58 80
## 6 0 78 90.5 1.7 100 104 78 159
## # ℹ 7 more variables: sp_defense <int>, generation <fct>, capture_rate <dbl>,
## # experience_growth <int>, percentage_male <dbl>, base_happiness <int>,
## # base_egg_steps <int>
# Replace NA cells with 0 (note the logical-subsetting pattern df$x[condition] <- value)
classify_legendary$weight_kg[is.na(classify_legendary$weight_kg)] <- 0
classify_legendary$height_m[is.na(classify_legendary$height_m)] <- 0
# Impute NA values with KNN (the default in caret's preProcess() is k = 5 neighbors)
# install.packages("RANN")
library(RANN)
## Warning: package 'RANN' was built under R version 4.4.3
# Use the same trick demonstrated earlier
knn_missing_data <- preProcess(as.data.frame(classify_legendary), method="knnImpute")
# For the knnImpute method, the first argument to preProcess() apparently needs to be a plain data.frame. preProcess() is otherwise mostly compatible with tibbles and other classes, but I found I had to coerce the data with as.data.frame() when applying knnImpute; some colleagues have reported no such problem.
classify_legendary_knn <- predict(knn_missing_data, newdata = classify_legendary)
# Alternatively, you can impute the entire dataset with a stand-alone KNN function such as knnImputation() from the DMwR package: knnImputation(data, k = 10, scale = TRUE, meth = "weighAvg", distData = NULL)
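# Optional check (a sketch): no NA values should remain after the KNN imputation
colSums(is.na(classify_legendary_knn))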
## Assign the outcome variable to y
y <- classify_legendary_knn$is_legendary
## Preprocess the data: "range" scales numeric variables to the interval between zero and one
knn_normalize <- preProcess(classify_legendary_knn, method = "range")
classify_legendary_knn <- predict(knn_normalize, newdata = classify_legendary_knn)
## Add the dependent variable back in
classify_legendary_knn$is_legendary <- y
## 75-25 train-test split on Y before testing the model
set.seed(123)
train_row_numbers <- createDataPartition(classify_legendary_knn$is_legendary, p=0.75, list=FALSE)
train_classify_legendary <- classify_legendary_knn[train_row_numbers, ]
test_classify_legendary <- classify_legendary_knn[-train_row_numbers, ]
## SVM using KNN-imputed data
svm_knn.fit <- svm(is_legendary~.,
data = train_classify_legendary)
## Get predicted values for the training data (an in-sample check, so accuracy will be optimistic)
predict_train_svm <- predict(svm_knn.fit, train_classify_legendary)
## Generate confusion matrix
confmat_train_svm <- table(Predicted = predict_train_svm,
Actual = train_classify_legendary$is_legendary)
confmat_train_svm
## Actual
## Predicted 0 1
## 0 547 5
## 1 2 48
## Show train accuracy
(confmat_train_svm[1, 1] + confmat_train_svm[2, 2]) / sum(confmat_train_svm) * 100
## [1] 98.83721
## Get predicted values for testing data
predict_test_svm <- predict(svm_knn.fit, test_classify_legendary)
## Generate confusion table
confmat_test_svm <- table(Predicted = predict_test_svm, Actual = test_classify_legendary$is_legendary)
## Show test accuracy
(confmat_test_svm[1, 1] + confmat_test_svm[2, 2]) / sum(confmat_test_svm) * 100
## [1] 97.98995
Classifying water, grass, and fire type Pokemons!
## Load required packages and data
# Again, load data from our dropbox shared folder and preliminary check
pokemon <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
names(pokemon)
## [1] "abilities" "against_bug" "against_dark"
## [4] "against_dragon" "against_electric" "against_fairy"
## [7] "against_fight" "against_fire" "against_flying"
## [10] "against_ghost" "against_grass" "against_ground"
## [13] "against_ice" "against_normal" "against_poison"
## [16] "against_psychic" "against_rock" "against_steel"
## [19] "against_water" "attack" "base_egg_steps"
## [22] "base_happiness" "base_total" "capture_rate"
## [25] "classfication" "defense" "experience_growth"
## [28] "height_m" "hp" "japanese_name"
## [31] "name" "percentage_male" "pokedex_number"
## [34] "sp_attack" "sp_defense" "speed"
## [37] "type1" "type2" "weight_kg"
## [40] "generation" "is_legendary"
names(pokemon)[25] <- "classification"
# Create a new outcome variable column "type" for the three main "Gosanke" types: if a Pokemon has grass, water, or fire as its primary or secondary type, it is coded accordingly
pokemon$type <- NA # First generate a new column of NA values
pokemon$type[pokemon$type1 == "grass" | pokemon$type2 == "grass"] <- "grass" # "|" stands for OR in Boolean logic
pokemon$type[pokemon$type1 == "water" | pokemon$type2 == "water"] <- "water"
pokemon$type[pokemon$type1 == "fire" | pokemon$type2 == "fire"] <- "fire"
pokemon$type <- as.factor(pokemon$type) # Convert to factor class (svm() takes either factor or numeric variable as its Y variable)
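# Quick check (optional sketch): how many Pokemon fall into each of the three types; all other Pokemon remain NA
table(pokemon$type, useNA = "ifany")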
# Apply the same class conversions to the reloaded data (the as.numeric() coercion again turns one malformed capture_rate entry into NA, which is dropped by complete.cases() below)
pokemon$generation <- as.factor(pokemon$generation)
pokemon$capture_rate <- as.numeric(pokemon$capture_rate)
# Remove rows containing NA in any of the variable columns and remove irrelevant and text features
pokemon <- pokemon[complete.cases(pokemon), -c(1, 30, 31, 33, 37, 38, 40, 41)]
# Take a look at the subsetted data
str(pokemon)
## 'data.frame': 263 obs. of 34 variables:
## $ against_bug : num 1 1 1 0.5 0.5 0.25 1 1 1 1 ...
## $ against_dark : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_dragon : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_electric : num 0.5 0.5 0.5 1 1 2 2 2 2 0.5 ...
## $ against_fairy : num 0.5 0.5 0.5 0.5 0.5 0.5 1 1 1 0.5 ...
## $ against_fight : num 0.5 0.5 0.5 1 1 0.5 1 1 1 0.5 ...
## $ against_fire : num 2 2 2 0.5 0.5 0.5 0.5 0.5 0.5 2 ...
## $ against_flying : num 2 2 2 1 1 1 1 1 1 2 ...
## $ against_ghost : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_grass : num 0.25 0.25 0.25 0.5 0.5 0.25 2 2 2 0.25 ...
## $ against_ground : num 1 1 1 2 2 0 1 1 1 1 ...
## $ against_ice : num 2 2 2 0.5 0.5 1 0.5 0.5 0.5 2 ...
## $ against_normal : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_poison : num 1 1 1 1 1 1 1 1 1 1 ...
## $ against_psychic : num 2 2 2 1 1 1 1 1 1 2 ...
## $ against_rock : num 1 1 1 2 2 4 1 1 1 1 ...
## $ against_steel : num 1 1 1 0.5 0.5 0.5 0.5 0.5 0.5 1 ...
## $ against_water : num 0.5 0.5 0.5 2 2 2 0.5 0.5 0.5 0.5 ...
## $ attack : int 49 62 100 52 64 104 48 63 103 50 ...
## $ base_egg_steps : int 5120 5120 5120 5120 5120 5120 5120 5120 5120 5120 ...
## $ base_happiness : int 70 70 70 70 70 70 70 70 70 70 ...
## $ base_total : int 318 405 625 309 405 634 314 405 630 320 ...
## $ capture_rate : chr "45" "45" "45" "45" ...
## $ classification : chr "Seed Pokémon" "Seed Pokémon" "Seed Pokémon" "Lizard Pokémon" ...
## $ defense : int 49 63 123 43 58 78 65 80 120 55 ...
## $ experience_growth: int 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 1059860 ...
## $ height_m : num 0.7 1 2 0.6 1.1 1.7 0.5 1 1.6 0.5 ...
## $ hp : int 45 60 80 39 58 78 44 59 79 45 ...
## $ percentage_male : num 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 88.1 50 ...
## $ sp_attack : int 65 80 122 60 80 159 50 65 135 75 ...
## $ sp_defense : int 65 80 120 50 65 115 64 80 115 65 ...
## $ speed : int 45 60 80 65 80 100 43 58 78 30 ...
## $ weight_kg : num 6.9 13 100 8.5 19 90.5 9 22.5 85.5 5.4 ...
## $ type : Factor w/ 3 levels "fire","grass",..: 2 2 2 1 1 1 3 3 3 2 ...
# Temporarily store the outcome variable in y (it will be added back after pre-processing)
y <- pokemon$type # Store type as y
## Preprocess the data: "range" scales numeric variables to the interval between zero and one
pokemon_normalize <- preProcess(pokemon, method = "range")
pokemon <- predict(pokemon_normalize, newdata = pokemon)
# Add the original y variable back in
pokemon$type <- y
## 75-25 train-test split on Y before testing the model
set.seed(123)
train_row_numbers <- createDataPartition(pokemon$type, p=0.75, list=FALSE)
training <- pokemon[train_row_numbers, ]
testing <- pokemon[-train_row_numbers, ]
## Fit a simple two-variable multi-class SVM model. Note how we set gamma (the kernel coefficient) and C (the cost parameter); you can play with different combinations of gamma and C values
multiclass_svm.fit <- svm(type ~ against_fire + against_water, data = training,
type = "C-classification", # C-classification handles categorical outcomes (multiclass is handled one-against-one)
kernel = "radial",
gamma = 0.5, cost = 10)
# You can use either C- or nu-classification for classification. C ranges from zero to infinity, while nu is always within [0, 1]; nu is an upper bound on the fraction of training errors and a lower bound on the fraction of support vectors.
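# For comparison, a hedged sketch of the nu-classification alternative on the same two features
# (the object name and the nu/gamma values here are purely illustrative):
nu_svm.fit <- svm(type ~ against_fire + against_water, data = training,
                  type = "nu-classification",
                  kernel = "radial",
                  nu = 0.1, gamma = 0.5)
summary(nu_svm.fit) # reports the kernel, nu, and the number of support vectors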
colors = c("lightcoral","palegreen1","skyblue2")
plot(multiclass_svm.fit, training, against_fire ~ against_water, xlab = "against water", ylab = "against fire", col = colors) # Enter your SVM model object as the first argument, the data as the second argument, followed by your two primary X variables
## Get predicted values
predict_test_svm <- predict(multiclass_svm.fit, testing)
## Generate confusion matrix
# Note that it's a multiclass (3-class) prediction task, so we will generate a 3x3 confusion matrix table
confmat_test_svm <- table(Predicted = predict_test_svm,
Actual = testing$type)
confmat_test_svm
## Actual
## Predicted fire grass water
## fire 12 0 0
## grass 0 21 0
## water 1 0 30
## Show test accuracy
(confmat_test_svm[1, 1] + confmat_test_svm[2, 2] + confmat_test_svm[3, 3]) / sum(confmat_test_svm) * 100
## [1] 98.4375
Grid search over unique combinations of \(\gamma\) and C for a polynomial-kernel SVM, with cross-validation (CV), to find the combination of hyperparameters that best classifies the data.
\(\gamma\) and C are the hyper-parameters that determine the performance of an SVM model. The \(\gamma\) parameter is used with the Gaussian radial basis function (RBF) kernel (and other non-linear kernels); it defines how far the influence of a single data point reaches by determining the \(\textbf{curvature}\) of the separating hyperplane (SHP). A small \(\gamma\) means each point's influence reaches far (a smoother boundary), while a large \(\gamma\) means its influence is local (a more tightly curved boundary).
The \(C\) parameter trades off mis-classification (i.e., errors) against the simplicity of the decision surface: a low \(C\) favors a smooth, simple surface, while a high \(C\) tries to classify every training example correctly.
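To make this trade-off concrete (a standard formulation, not specific to e1071): C-classification solves the soft-margin problem \(\min_{w, b, \xi} \; \tfrac{1}{2}\lVert w \rVert^{2} + C \sum_{i} \xi_{i}\) subject to \(y_{i}(w^{\top}x_{i} + b) \geq 1 - \xi_{i}\) and \(\xi_{i} \geq 0\). A small \(C\) tolerates more margin violations \(\xi_{i}\) and yields a smoother, simpler decision surface, while a large \(C\) penalizes every mis-classified training point heavily and can overfit.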
# The function fits a model for every combination of gamma and cost values, i.e., a grid of length(2^(-1:1)) * length(2^(2:4)) = 3 * 3 = 9 combinations, and searches for the optimal pair
set.seed(123)
gs_svm1.fit <- tune.svm(type ~ against_fire + against_water, data = pokemon,
type = "C-classification",
kernel = "polynomial",
gamma = 2^(-1:1),
cost = 2^(2:4),
tunecontrol = tune.control(cross = 5)) # tunecontrol is an argument of tune.control() function, here it says 5-fold cross-validation (CV)
# A glimpse of the elements inside the tune.svm output
names(gs_svm1.fit)
## [1] "best.parameters" "best.performance" "method" "nparcomb"
## [5] "train.ind" "sampling" "performances" "best.model"
# What's the best model?
# The optimal degree is a 3rd-degree polynomial and the optimal cost (C) is 4
# The number of support vectors (the points that define the margin around the SHP) is 15
gs_svm1.fit$best.model
##
## Call:
## best.svm(x = type ~ against_fire + against_water, data = pokemon,
## gamma = 2^(-1:1), cost = 2^(2:4), type = "C-classification",
## kernel = "polynomial", tunecontrol = tune.control(cross = 5))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 4
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 15
# The lowest classification error produced by the best-tuned model
min(gs_svm1.fit$performances$error)
## [1] 0.01531205
# Retrieve the best tuned (optimal) C and gamma parameters:
gs_svm1.fit$best.parameters$cost
## [1] 4
gs_svm1.fit$best.parameters$gamma
## [1] 0.5
# or you can show all the above information in one take
# Note how I subset the data inside the square bracket []
gs_svm1.fit$performances[(gs_svm1.fit$performances$error == min(gs_svm1.fit$performances$error)), ]
## gamma cost error dispersion
## 1 0.5 4 0.01531205 0.02506046
## 2 1.0 4 0.01531205 0.02506046
## 3 2.0 4 0.01531205 0.02506046
## 4 0.5 8 0.01531205 0.02506046
## 5 1.0 8 0.01531205 0.02506046
## 6 2.0 8 0.01531205 0.02506046
## 7 0.5 16 0.01531205 0.02506046
## 8 1.0 16 0.01531205 0.02506046
## 9 2.0 16 0.01531205 0.02506046
# Plot model performance for each unique combination of gamma and C as a heatmap
plot(gs_svm1.fit, xlab = expression(gamma), ylab = "C") # Enable Greek letter typesetting using expression()
# In this plot, the color gradient, which transitions from light to dark violet, indicates the performance of the SVM model for each combination of gamma (γ) and cost (C). Darker shades indicate better-performing (lower-error) models, while lighter shades represent less optimal ones. The values on the horizontal and vertical axes are the combinations of gamma and C tested during tuning. We look for the region with the lowest cross-validated classification error (labeled "performance"); here it appears to be in the upper-middle part of the graph.
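# Having tuned the hyperparameters, you can use the stored best model directly for prediction.
# A minimal sketch, assuming the objects created above (note that gs_svm1.fit was tuned on the full
# pokemon data, so predictions on the "testing" rows are not strictly out-of-sample):
best_poly_svm <- gs_svm1.fit$best.model
table(Predicted = predict(best_poly_svm, testing), Actual = testing$type)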
Note in particular that the polynomial kernel of degree \(d\) is defined as \(k(x, x^{\prime}) = (\gamma \cdot x^{\top}x^{\prime} + \textsf{coef0})^{d}\), where \(\gamma\) is the kernel coefficient (shared by the RBF, polynomial, and sigmoid kernels) and the constant term \(\textsf{coef0}\) controls the position of the decision boundary.
In e1071's \(\textsf{svm()}\) and \(\textsf{tune.svm()}\), the default value of \(\textsf{gamma}\) is \(\textsf{1 / n_features}\) (i.e., \(\textsf{1 / ncol(x)}\)). (In scikit-learn, by comparison, the default \(\textsf{gamma = "scale"}\) uses \(\textsf{1 / (n_features * X.var())}\), while \(\textsf{gamma = "auto"}\) uses \(\textsf{1 / n_features}\).) \(\textsf{coef0}\) is the constant term used in the above formula: a positive \(\textsf{coef0}\) shifts the decision boundary to the positive side, while a negative \(\textsf{coef0}\) shifts it to the negative side.
For faster hyperparameter tuning, you may specify a short list of exact \(\gamma\) and/or cost values instead of looping through a wide range of candidates. Let's fit a second-degree polynomial kernel SVM, with fixed sampling and 10-fold CV.
set.seed(123)
gs_svm2.fit <- tune.svm(type ~ against_fire + against_water,
data = pokemon,
type = "C-classification",
kernel = "polynomial",
degree = 2,
gamma = c(0.1, 1, 10),
cost = 2^(2:4),
coef0 = c(-1, 0.1, 1),
tunecontrol = tune.control(cross = 10),
sampling = "fix")
# coef0 is a parameter of kernels of type "polynomial" (and "sigmoid"): it is the constant term added to the dot product in K(x, x') = (gamma * x'x + coef0)^d before the expression is raised to the power of degree d
# tune.control(cross = 10) sets the number of cross-validation folds (10 is also the default)
# sampling = "fix" requests fixed sampling, i.e., a single fixed train/validation split; note that sampling is an argument of tune.control(), so to take effect it should be written as tune.control(sampling = "fix") rather than passed to tune.svm() directly
# Check out optimal parameters
# The optimal cost is 4. The optimal degree is 2. The optimal coef0 is 0.1.
gs_svm2.fit$best.model
##
## Call:
## best.svm(x = type ~ against_fire + against_water, data = pokemon,
## degree = 2, gamma = c(0.1, 1, 10), coef0 = c(-1, 0.1, 1), cost = 2^(2:4),
## type = "C-classification", kernel = "polynomial", tunecontrol = tune.control(cross = 10),
## sampling = "fix")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 4
## degree: 2
## coef.0: 0.1
##
## Number of Support Vectors: 38
# Combinations of hyperparameters that produce the lowest cross-validation error rate
gs_svm2.fit$performances[(gs_svm2.fit$performances$error == min(gs_svm2.fit$performances$error)), ]
## degree gamma coef0 cost error dispersion
## 4 2 0.1 0.1 4 0.01139601 0.02579882
## 7 2 0.1 1.0 4 0.01139601 0.02579882
## 8 2 1.0 1.0 4 0.01139601 0.02579882
## 13 2 0.1 0.1 8 0.01139601 0.02579882
## 16 2 0.1 1.0 8 0.01139601 0.02579882
## 17 2 1.0 1.0 8 0.01139601 0.02579882
## 22 2 0.1 0.1 16 0.01139601 0.02579882
## 25 2 0.1 1.0 16 0.01139601 0.02579882
## 26 2 1.0 1.0 16 0.01139601 0.02579882
# or you can just check the lowest error rate directly
gs_svm2.fit$best.performance
## [1] 0.01139601
# C
gs_svm2.fit$best.parameters$cost
## [1] 4
# Gamma
gs_svm2.fit$best.parameters$gamma
## [1] 0.1
# coef0
gs_svm2.fit$best.parameters$coef0
## [1] 0.1
How to write a cross-validation loop (if you must)
library(caret)
# The createFolds() function from the caret package comes in handy here
folds = createFolds(training$type, k = 10)
cv = lapply(folds, function(x) {
training_fold = training[-x, ]
testing_fold = training[x, ]
classifierO = svm(type ~ against_fire + against_water,
data = training_fold,
type = "C-classification",
kernel = "radial")
y_predO = predict(classifierO, newdata = testing_fold)
confmat = table(testing_fold$type, y_predO) # actual classes (rows) vs predicted classes (columns)
accuracy = (confmat[1,1] + confmat[2,2] + confmat[3,3]) / sum(confmat)*100
return(accuracy)
})
# If you don't want to see the accuracy of each of the 10 CV folds individually, just report their mean:
accuracy = mean(as.numeric(cv))
accuracy
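# Alternatively (a hedged sketch), caret can run the cross-validation for you via trainControl(),
# so you rarely need to write the loop by hand (method = "svmRadial" uses the same kernlab backend
# as the earlier caret example):
ctrl <- trainControl(method = "cv", number = 10)
svm_cv.fit <- train(type ~ against_fire + against_water,
                    data = training,
                    method = "svmRadial",
                    trControl = ctrl)
svm_cv.fit$results # cross-validated accuracy (and kappa) for each candidate value of C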