This is an optional assignment for Classification: Nonparametric Methods, Support Vector Machines, in STAT 508 Applied Data Mining and Statistical Learning at Pennsylvania State University. The course instructor is Dr. Lingzhou Xue, associate professor of statistics at Penn State.
This demonstration is replicable without any local datasets. The packages used are ggplot2, e1071, and dplyr. DataCamp instructor: Kailash Awati.
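If the packages are not already installed, a one-time setup step (my addition, not part of the original notes) is:
install.packages(c("ggplot2", "dplyr", "e1071")) # install once; the packages are loaded below with library()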
library(ggplot2)
library(dplyr)
df <- data.frame(sugar_content = scan(text = "10.9 10.9 10.6 10 8 8.2 8.6 10.9 10.7 8 7.7 7.8 8.4 11.5 11.2 8.9 8.7 7.4 10.9 10 11.4 10.8 8.5 8.2 10.6")) # generates a vector of 25 elements
# Plot sugar content along the x-axis
plot_df <- ggplot(data = df, aes(x = sugar_content, y = 0)) +
geom_point() +
geom_text(aes(label = sugar_content), size = 2.5, vjust = 2, hjust = 0.5)
# Display plot
plot_df
# The maximal margin separator lies at the midpoint between the two closest points of the two clusters (8.9 and 10).
mm_separator <- (8.9 + 10)/2
#create data frame containing the maximum margin separator
separator <- data.frame(sep = mm_separator)
#add separator to sugar content scatterplot
plot_sep <- plot_df + geom_point(data = separator, aes(x = mm_separator, y = 0), color = "blue", size = 4)
#display plot
plot_sep
runif(n, min = 0, max = 1) generates n uniformly distributed values between 0 and 1; min = 0 and max = 1 are the default bounds.
# preliminaries
# set required number of data points
n <- 200
# set seed to ensure reproducibility
set.seed(42)
# generate dataframe with two predictors x1 and x2 in (0,1)
df <- data.frame(x1 = runif(n),
x2 = runif(n))
# classify points as -1 or +1
df$y <- factor(ifelse(df$x1 - df$x2 > 0, -1, 1),
levels = c(-1, 1))
# create 2 dimensional scatter plot with x1 on the x-axis and x2 on the y-axis
# distinguish classes by color
# decision boundary is line x1 = x2
# build plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue")) +
geom_abline(slope = 1, intercept = 0)
# display the plot
p
### INTRODUCING A MARGIN
# remove points that have x1 and x2 values that differ by less than a specified value
# create a margin of 0.05
delta <- 0.05
# retain only those points that lie outside the margin
df1 <- df[abs(df$x1 - df$x2) > delta,]
nrow(df1)
## [1] 180
# replot dataset with margin
p <- ggplot(data = df1, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue")) +
geom_abline(slope = 1, intercept = 0)
p
p <- p +
geom_abline(slope = 1, intercept = delta, linetype = "dashed") +
geom_abline(slope = 1, intercept = -delta, linetype = "dashed")
p
The kernel is the type of decision boundary; it can take different shapes: lines, polynomials, and so on. e1071 is the package, and svm is a function with a number of parameters, including formula (specifying the dependent variable y), data, type (set to C-classification here), kernel (linear in this case), cost and gamma (parameters that are used to tune the model), and scale.
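For reference, a minimal schematic call that spells these arguments out on a toy dataset (illustrative values only; this is not one of the models fitted below):
library(e1071) # also loaded in the chunk that follows
set.seed(123)
toy <- data.frame(x1 = runif(20), x2 = runif(20))
toy$y <- factor(ifelse(toy$x1 > toy$x2, -1, 1))
svm(y ~ ., data = toy, type = "C-classification", kernel = "linear",
    cost = 1, gamma = 0.5, scale = FALSE)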
library(e1071)
# set seed for reproducibility
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
head(df)
## x1 x2 y train
## 1 0.9148060 0.8851177 -1 1
## 2 0.9370754 0.5171111 -1 1
## 3 0.2861395 0.8519310 1 1
## 4 0.8304476 0.4427963 -1 0
## 5 0.6417455 0.1578801 -1 1
## 6 0.5190959 0.4423246 -1 0
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
# The type of decision boundary is called the kernel; it must be specified up front
svm_model <- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
scale = F)
svm_model
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## kernel = "linear", scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 67
names(svm_model)
## [1] "call" "type" "kernel" "cost"
## [5] "degree" "gamma" "coef0" "nu"
## [9] "epsilon" "sparse" "scaled" "x.scale"
## [13] "y.scale" "nclasses" "levels" "tot.nSV"
## [17] "nSV" "labels" "SV" "index"
## [21] "rho" "compprob" "probA" "probB"
## [25] "sigma" "coefs" "na.action" "fitted"
## [29] "decision.values" "terms"
# index of support vectors in training dataset
svm_model$index
## [1] 1 9 11 12 13 22 27 29 33 44 45 46 56 65 66 67 68 73 75
## [20] 84 86 87 88 93 101 104 107 114 127 130 136 157 159 164 8 15 17 19
## [39] 20 30 34 35 37 43 47 51 59 60 61 64 74 76 78 82 85 92 100
## [58] 109 110 116 119 131 141 144 153 154 156
# Support vectors
head(svm_model$SV)
## x1 x2
## 1 0.9148060 0.88511769
## 12 0.7191123 0.64987584
## 14 0.2554288 0.06094975
## 15 0.4622928 0.45131085
## 16 0.9400145 0.83875503
## 27 0.3902035 0.30705440
# rho: the negative of the intercept of the decision function (before dividing by the weights)
svm_model$rho
## [1] 0.03429662
# evaluate the training and test set accuracy of the model
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9939394
# test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
# note: accuracy from a single train/test split can be misleading; averaging over many random splits is done later
# visualize training data, distinguishing classes by color
p <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue"))
df_sv <- trainset[svm_model$index, ] # this is a data frame of support vectors
p <- p + geom_point(data = df_sv,
aes(x = x1, y = x2),
color = "purple", # support vector in purple
size = 4, alpha = 0.5) # larger size of support vectors
# new plot w/ color-coded support vectors
p
# find slope and intercept of the boundary
# build the weight vector, `w`, from `coefs` and `SV` elements of `svm_model`
w <- t(svm_model$coefs) %*% svm_model$SV
w # weight vector
## x1 x2
## [1,] 3.756494 -3.790794
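For reference, the slope and intercept follow from the decision function: with a linear kernel the fitted decision function is \(f(x) = w \cdot x - \rho\), so the boundary \(w_{1} x_{1} + w_{2} x_{2} - \rho = 0\) can be solved for \(x_{2}\): \[x_{2} = -\frac{w_{1}}{w_{2}} x_{1} + \frac{\rho}{w_{2}},\] giving the slope \(-w_{1}/w_{2}\) and intercept \(\rho/w_{2}\) computed next. The margin lines satisfy \(w \cdot x - \rho = \pm 1\), which shifts the intercept by \(\pm 1/w_{2}\).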
# slope
slope_1 <- -w[1]/w[2]
slope_1
## [1] 0.9909518
# intercept
intercept_1 <- svm_model$rho/w[2]
p <- p + geom_abline(slope = slope_1,
intercept = intercept_1)
# margins parallel to decision boundary, offset by 1/w[2] on either side of it
p <- p +
geom_abline(slope = slope_1,
intercept = intercept_1 - 1/w[2],
linetype = "dashed") +
geom_abline(slope = slope_1,
intercept = intercept_1 + 1/w[2],
linetype = "dashed")
# display new plot
p
# soft margin classifiers
# real class boundaries are rarely perfectly linear and are usually unknown, so some misclassification has to be tolerated
plot(x = svm_model, data = trainset)
## TUNING LINEAR SVMs
# increasing the cost penalizes margin violations more heavily, which narrows the margin
svm_model <- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
cost = 100,
scale = FALSE) # increase the cost to 100
svm_model
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## kernel = "linear", cost = 100, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
##
## Number of Support Vectors: 18
# visualize again, first create support vector df
df_sv <- trainset[svm_model$index, ]
# then create weight, slope, and intercept
w <- t(svm_model$coefs) %*% svm_model$SV
slope_1 <- -w[1]/w[2]
intercept_1 <- svm_model$rho/w[2]
p <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue")) +
geom_point(data = df_sv, # add support vectors (from sv_df) in purple
aes(x = x1, y = x2),
color = "purple",
size = 4, alpha = 0.5) +
geom_abline(slope = slope_1,
intercept = intercept_1) +
geom_abline(slope = slope_1, # margin
intercept = intercept_1 - 1/w[2],
linetype = "dashed") +
geom_abline(slope = slope_1, # margin
intercept = intercept_1 + 1/w[2],
linetype = "dashed")
p # much narrower margins; the intercept is close to 0 and the slope close to 1, matching how the data were simulated
Sample code for averaging test accuracy over 100 random train/test splits:
accuracy <- NA
set.seed(1)
for (i in 1:100){
#assign 80% of the data to the training set
df[, "train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0)
trainColNum <- grep("train", names(df))
trainset <- df[df$train == 1, -trainColNum]
testset <- df[df$train == 0, -trainColNum]
#build model using training data
svm_model <- svm(y ~ ., data = trainset,
type = "C-classification", kernel = "linear")
#calculate accuracy on test data
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
mean(accuracy)
## [1] 0.9766239
sd(accuracy)
## [1] 0.02580719
## Nonlinear
Generate a dataset with 200 points and two predictors, x1 and x2, uniformly distributed between -1 and 1. Specify the min and max since they are not the defaults.
n <- 200
set.seed(42)
df <- data.frame(x1 = runif(n, min = -1, max = 1),
x2 = runif(n, min = -1, max = 1)) # specify the min and max since it's not the default
# create a circular decision boundary of radius 0.7 units
# the categorical variable y is -1 or +1 depending on whether a point lies within or outside the boundary
radius <- 0.7
radius_squared <- radius^2
#categorize data points depending on location wrt boundary
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
levels = c(-1, 1))
# build plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue"))
p
# adding a circular boundary
# create a function to generate a circle
circle <- function(x1_center, x2_center, r, npoint = 100) {
# npoint angles equally spaced from 0 to 2*pi (0 to 360 degrees)
theta <- seq(0, 2*pi, length.out = npoint)
x1_circ <- x1_center + r*cos(theta)
x2_circ <- x2_center + r*sin(theta)
data.frame(x1c = x1_circ, x2c = x2_circ) # define x1c and x2c
}
boundary <- circle(x1_center = 0, x2_center = 0, r = radius)
# add boundary to previous plot
p <- p + geom_path(data = boundary,
aes(x = x1c, y = x2c), # x1c and x2c previously defined
inherit.aes = FALSE)
p
The equation of the boundary is \(x_{1}^{2} + x_{2}^{2} = 0.49\).
Map \(x_{1}^{2}\) to a new variable \(X_{1}\) and \(x_{2}^{2}\) to \(X_{2}\).
The equation of the boundary in the \(X_{1}\)-\(X_{2}\) space becomes a line: \[X_{1} + X_{2} = 0.49\]
Rewriting the boundary as \(X_{2} = -X_{1} + 0.49\), the slope is \(-1\) and the y-intercept is \(0.49\).
df4 <- data.frame(x1sq = df$x1^2, x2sq = df$x2^2, Y = df$y)
# plot the X1 - X2 space
p <- ggplot(data = df4, aes(x = x1sq, y = x2sq, color = Y)) +
geom_point() +
scale_color_manual(values = c("red", "blue")) +
geom_abline(slope = -1, intercept = 0.49)
p # transformed space
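As a quick sanity check (my addition, not part of the original notes), a linear SVM fitted on the squared features should now separate the two classes almost perfectly:
# sketch: linear SVM in the transformed (x1^2, x2^2) space
svm_sq <- svm(Y ~ x1sq + x2sq, data = df4,
              type = "C-classification", kernel = "linear")
mean(predict(svm_sq, df4) == df4$Y) # expect accuracy close to 1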
Polynomial kernel: (gamma * (u . v) + coef0)^degree, where gamma and coef0 are tuning parameters, and u and v are vectors (data points) from the dataset.
The mathematical formulation of SVMs requires transformations with specific properties. Functions satisfying these properties are called kernel functions. Kernel functions are generalizations of vector dot products. The basic idea is to use a kernel that separates the data well.
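To make the idea of a generalized dot product concrete, here is a small numerical check (my addition, not from the original notes) for a degree-2 polynomial kernel with gamma = 1 and coef0 = 0: the kernel value equals an ordinary dot product of explicitly transformed features.
# K(u, v) = (u . v)^2 equals the dot product of phi(x) = (x1^2, x2^2, sqrt(2)*x1*x2)
u <- c(0.3, -0.8)
v <- c(0.5, 0.2)
(sum(u * v))^2 # kernel value
phi <- function(x) c(x[1]^2, x[2]^2, sqrt(2) * x[1] * x[2])
sum(phi(u) * phi(v)) # same value, computed via the explicit transformation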
df <- data.frame(x1 = runif(n, min = -1, max = 1),
x2 = runif(n, min = -1, max = 1)) # specify the min and max since it's not the default
# create a circular decision boundary of radius 0.7 units
# the categorical variable y is -1 or +1 depending on whether a point lies within or outside the boundary
radius <- 0.7
radius_squared <- radius^2
#categorize data points depending on location wrt boundary
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
levels = c(-1, 1))
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
###
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9428571
plot(svm_model, trainset)
Tuning: systematically find the optimal parameter values
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
                     type = "C-classification", kernel = "polynomial", degree = 2,
                     cost = 10^(-1:2), gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
# details are important: a typo such as "cost = 10^2(-1:2)" causes the error "attempt to apply non-function"
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 1
### With the optimal values, re-fit the SVM
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2,
                 cost = tune_out$best.parameters$cost,
                 gamma = tune_out$best.parameters$gamma,
                 coef0 = tune_out$best.parameters$coef0)
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9939394
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9714286
plot(svm_model, trainset)
Now x1 and x2 are distributed differently: x1 is normally distributed, while x2 is uniformly distributed (all outcomes between -1 and 1 are equally likely).
n <- 600
set.seed(42)
df <- data.frame(x1 = rnorm(n, mean = -0.5, sd = 1),
x2 = runif(n, min = -1, max = 1))
# Generate boundary, radius and centers
radius <- 0.7 # r
radius_squared <- radius^2
center_1 <- c(0.7, 0)
center_2 <- c(-0.7, 0) # the centers are 1.4 apart
# whether a point lies in either of the two circles or not
df$y <- factor(
ifelse(
(df$x1 - center_1[1])^2 + (df$x2 - center_1[2])^2 < radius_squared | # point is within circle 1
(df$x1 - center_2[1])^2 + (df$x2 - center_2[2])^2 < radius_squared, # or point is within circle 2
-1, 1), # inside either circle: -1; otherwise: +1
levels = c(-1, 1))
### plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
guides(color = "none") +
scale_color_manual(values = c("red", "blue"))
p
# add decision boundary
circle <- function(x1_center, x2_center, r, npoint = 100) {
theta <- seq(0, 2*pi, length.out = npoint)
x1_circ <- x1_center + r * cos(theta)
x2_circ <- x2_center + r * sin(theta)
data.frame(x1c = x1_circ, x2c = x2_circ)
}
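The circle() helper is redefined here but not drawn; a short sketch (my addition) of how the two class boundaries could be overlaid on the plot p:
# sketch: overlay both circular class boundaries on the scatterplot
boundary_1 <- circle(x1_center = center_1[1], x2_center = center_1[2], r = radius)
boundary_2 <- circle(x1_center = center_2[1], x2_center = center_2[2], r = radius)
p + geom_path(data = boundary_1, aes(x = x1c, y = x2c), inherit.aes = FALSE) +
geom_path(data = boundary_2, aes(x = x1c, y = x2c), inherit.aes = FALSE)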
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
The RBF kernel takes a heuristic approach, akin to the k-nearest neighbors algorithm: a point's influence should decrease with its distance from the point being classified. For a given point in the dataset, \(X_{1} = (a, b)\), a simple function with this property is exp(-gamma * r), where r is the distance between \(X_{1}\) and any other point \(X\).
The rbf function below is a decreasing function of the distance between two points in the dataset, mimicking the k-NN algorithm. (The radial kernel actually used by e1071 is exp(-gamma * |u - v|^2), a decreasing function of the squared distance.)
rbf <- function(r, gamma) exp(-gamma * r) # rbf(r, gamma) = exp(-gamma * r), a decreasing function of r
ggplot(data.frame(r = c(0, 10)), aes(r)) +
stat_function(fun = rbf, args = list(gamma = 0.2), aes(color = "0.2")) +
stat_function(fun = rbf, args = list(gamma = 0.4), aes(color = "0.4")) +
stat_function(fun = rbf, args = list(gamma = 0.6), aes(color = "0.6")) +
stat_function(fun = rbf, args = list(gamma = 0.8), aes(color = "0.8")) +
stat_function(fun = rbf, args = list(gamma = 1), aes(color = "1")) +
stat_function(fun = rbf, args = list(gamma = 2), aes(color = "2")) +
scale_color_manual("gamma", values = c("red", "orange", "yellow", "green", "blue", "violet")) +
ggtitle("Radial basis function (gamma = 0.2 to 2)")
## Build a RBF kernel model
svm_model <- svm(y ~ ., data = trainset,
type = "C-classification",
kernel = "radial")
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9256198
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9310345
plot(svm_model, trainset)
# refining the decision boundary
# tune parameters
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3], gamma = 5*10 ^ (-2:2),
cost = c(0.01, 0.1, 1, 10, 100),
type = "C-classification",
kernel = "radial")
tune_out$best.parameters$cost
## [1] 100
tune_out$best.parameters$gamma
## [1] 5
# refit model with best parameters
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "radial",
cost = tune_out$best.parameters$cost, gamma = tune_out$best.parameters$gamma)
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9979339
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9741379
plot(svm_model, trainset) # the tuned model now follows the two circular regions closely
Note: the RBF kernel is local in nature; each training point influences the decision boundary mainly in its own neighborhood.
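A small illustration of that locality (my addition), reusing the rbf() helper from above with the tuned gamma = 5 and treating r as the squared distance, as in e1071's radial kernel:
# influence of a training point at squared distances 0.01, 0.25, 1, and 4
rbf(r = c(0.01, 0.25, 1, 4), gamma = 5)
# the values drop from roughly 0.95 to essentially 0, so only nearby points shape the fit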