This is an optional assignment for Classification: Nonparametric Methods, Support Vector Machines, in STAT 508 Applied Data Mining and Statistical Learning at Pennsylvania State University. The course instructor is Dr. Lingzhou Xue, associate professor of statistics at Penn State.
This demonstration is replicable without any local datasets. The packages used are ggplot2, e1071, and dplyr. DataCamp instructor: Kailash Awati.
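If the packages are not already installed, a one-time setup step (my addition, not part of the original notes) is:
install.packages(c("ggplot2", "dplyr", "e1071")) # install once; the packages are loaded below with library()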
library(ggplot2)
library(dplyr)
df <- data.frame(sugar_content = scan(text = "10.9 10.9 10.6 10 8 8.2 8.6 10.9 10.7 8 7.7 7.8 8.4 11.5 11.2 8.9 8.7 7.4 10.9 10 11.4 10.8 8.5 8.2 10.6")) # generates a vector of 25 elements
# Plot sugar content along the x-axis
plot_df <- ggplot(data = df, aes(x = sugar_content, y = 0)) +
geom_point() +
geom_text(aes(label = sugar_content), size = 2.5, vjust = 2, hjust = 0.5)
# Display plot
plot_df
# The maximal margin separator lies at the midpoint between the two closest points of the two clusters (8.9 and 10).
mm_separator <- (8.9 + 10)/2
#create data frame containing the maximum margin separator
separator <- data.frame(sep = mm_separator)
#add separator to sugar content scatterplot
plot_sep <- plot_df + geom_point(data = separator, aes(x = mm_separator, y = 0), color = "blue", size = 4)
#display plot
plot_sep
runif(n, min = 0, max = 1) generates n uniformly distributed values between 0 and 1; min = 0 and max = 1 are the default bounds.
# preliminaries
# set required number of data points
n <- 200
# set seed to ensure reproducibility
set.seed(42)
# generate dataframe with two predictors x1 and x2 in (0,1)
df <- data.frame(x1 = runif(n),
x2 = runif(n))
# classify points as -1 or +1
df$y <- factor(ifelse(df$x1 - df$x2 > 0, -1, 1),
levels = c(-1, 1))
# create 2 dimensional scatter plot with x1 on the x-axis and x2 on the y-axis
# distinguish classes by color
# decision boundary is line x1 = x2
# build plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue")) +
geom_abline(slope = 1, intercept = 0)
# display the plot
p
### INTRODUCING A MARGIN
# remove points that have x1 and x2 values that differ by less than a specified value
# create a margin of 0.05
delta <- 0.05
# retain only those points that lie outside the margin
df1 <- df[abs(df$x1 - df$x2) > delta,]
nrow(df1)
## [1] 180
# replot dataset with margin
p <- ggplot(data = df1, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue")) +
geom_abline(slope = 1, intercept = 0)
p
p <- p +
geom_abline(slope = 1, intercept = delta, linetype = "dashed") +
geom_abline(slope = 1, intercept = -delta, linetype = "dashed")
p
The kernel is the type of decision boundary; it can take different shapes: lines, polynomials, and so on. e1071 is the package, and svm is a function with a number of parameters, including formula (specifying the dependent variable y), data, type (set to C-classification here), kernel (linear in this case), cost and gamma (parameters that are used to tune the model), and scale.
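For reference, a minimal schematic call that spells these arguments out on a toy dataset (illustrative values only; this is not one of the models fitted below):
library(e1071) # also loaded in the chunk that follows
set.seed(123)
toy <- data.frame(x1 = runif(20), x2 = runif(20))
toy$y <- factor(ifelse(toy$x1 > toy$x2, -1, 1))
svm(y ~ ., data = toy, type = "C-classification", kernel = "linear",
    cost = 1, gamma = 0.5, scale = FALSE)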
library(e1071)
# set seed for reproducibility
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
head(df)
## x1 x2 y train
## 1 0.9148060 0.8851177 -1 1
## 2 0.9370754 0.5171111 -1 1
## 3 0.2861395 0.8519310 1 1
## 4 0.8304476 0.4427963 -1 0
## 5 0.6417455 0.1578801 -1 1
## 6 0.5190959 0.4423246 -1 0
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
# The type of decision boundary is called the kernel; it must be specified up front
svm_model <- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
scale = F)
svm_model
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## kernel = "linear", scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 67
names(svm_model)
## [1] "call" "type" "kernel" "cost"
## [5] "degree" "gamma" "coef0" "nu"
## [9] "epsilon" "sparse" "scaled" "x.scale"
## [13] "y.scale" "nclasses" "levels" "tot.nSV"
## [17] "nSV" "labels" "SV" "index"
## [21] "rho" "compprob" "probA" "probB"
## [25] "sigma" "coefs" "na.action" "fitted"
## [29] "decision.values" "terms"
# index of support vectors in training dataset
svm_model$index
## [1] 1 9 11 12 13 22 27 29 33 44 45 46 56 65 66 67 68 73 75
## [20] 84 86 87 88 93 101 104 107 114 127 130 136 157 159 164 8 15 17 19
## [39] 20 30 34 35 37 43 47 51 59 60 61 64 74 76 78 82 85 92 100
## [58] 109 110 116 119 131 141 144 153 154 156
# Support vectors
head(svm_model$SV)
## x1 x2
## 1 0.9148060 0.88511769
## 12 0.7191123 0.64987584
## 14 0.2554288 0.06094975
## 15 0.4622928 0.45131085
## 16 0.9400145 0.83875503
## 27 0.3902035 0.30705440
# rho: the negative of the intercept of the decision function (before dividing by the weights)
svm_model$rho
## [1] 0.03429662
# evaluate the training and test set accuracy of the model
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9939394
# test accuracy
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 1
# note: accuracy from a single train/test split can be misleading; averaging over many random splits is done later
# visualize training data, distinguishing classes by color
p <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue"))
df_sv <- trainset[svm_model$index, ] # this is a data frame of support vectors
p <- p + geom_point(data = df_sv,
aes(x = x1, y = x2),
color = "purple", # support vector in purple
size = 4, alpha = 0.5) # larger size of support vectors
# new plot w/ color-coded support vectors
p
# find slope and intercept of the boundary
# build the weight vector, `w`, from `coefs` and `SV` elements of `svm_model`
w <- t(svm_model$coefs) %*% svm_model$SV
w # weight vector
## x1 x2
## [1,] 3.756494 -3.790794
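For reference, the slope and intercept follow from the decision function: with a linear kernel the fitted decision function is \(f(x) = w \cdot x - \rho\), so the boundary \(w_{1} x_{1} + w_{2} x_{2} - \rho = 0\) can be solved for \(x_{2}\): \[x_{2} = -\frac{w_{1}}{w_{2}} x_{1} + \frac{\rho}{w_{2}},\] giving the slope \(-w_{1}/w_{2}\) and intercept \(\rho/w_{2}\) computed next. The margin lines satisfy \(w \cdot x - \rho = \pm 1\), which shifts the intercept by \(\pm 1/w_{2}\).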
# slope
slope_1 <- -w[1]/w[2]
slope_1
## [1] 0.9909518
# intercept
intercept_1 <- svm_model$rho/w[2]
p <- p + geom_abline(slope = slope_1,
intercept = intercept_1)
# margins parallel to decision boundary, offset by 1/w[2] on either side of it
p <- p +
geom_abline(slope = slope_1,
intercept = intercept_1 - 1/w[2],
linetype = "dashed") +
geom_abline(slope = slope_1,
intercept = intercept_1 + 1/w[2],
linetype = "dashed")
# display new plot
p
# soft margin classifiers
# real class boundaries are rarely perfectly linear and are usually unknown, so some misclassification has to be tolerated
plot(x = svm_model, data = trainset)
## TUNING LINEAR SVMs
# increasing the cost penalizes margin violations more heavily, which narrows the margin
svm_model <- svm(y ~ .,
data = trainset,
type = "C-classification",
kernel = "linear",
cost = 100,
scale = FALSE) # increase the cost to 100
svm_model
##
## Call:
## svm(formula = y ~ ., data = trainset, type = "C-classification",
## kernel = "linear", cost = 100, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 100
##
## Number of Support Vectors: 18
# visualize again, first create support vector df
df_sv <- trainset[svm_model$index, ]
# then create weight, slope, and intercept
w <- t(svm_model$coefs) %*% svm_model$SV
slope_1 <- -w[1]/w[2]
intercept_1 <- svm_model$rho/w[2]
p <- ggplot(data = trainset, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("red", "blue")) +
geom_point(data = df_sv, # add support vectors (from sv_df) in purple
aes(x = x1, y = x2),
color = "purple",
size = 4, alpha = 0.5) +
geom_abline(slope = slope_1,
intercept = intercept_1) +
geom_abline(slope = slope_1, # margin
intercept = intercept_1 - 1/w[2],
linetype = "dashed") +
geom_abline(slope = slope_1, # margin
intercept = intercept_1 + 1/w[2],
linetype = "dashed")
p # much narrower margins; the intercept is close to 0 and the slope close to 1, matching how the data were simulated
Sample code for averaging test accuracy over 100 random train/test splits:
accuracy <- NA
set.seed(1)
for (i in 1:100){
#assign 80% of the data to the training set
df[, "train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0)
trainColNum <- grep("train", names(df))
trainset <- df[df$train == 1, -trainColNum]
testset <- df[df$train == 0, -trainColNum]
#build model using training data
svm_model <- svm(y ~ ., data = trainset,
type = "C-classification", kernel = "linear")
#calculate accuracy on test data
pred_test <- predict(svm_model, testset)
accuracy[i] <- mean(pred_test == testset$y)
}
mean(accuracy)
## [1] 0.9766239
sd(accuracy)
## [1] 0.02580719
## Nonlinear
Generate a dataset with 200 points and two predictors, x1 and x2, uniformly distributed between -1 and 1. Specify the min and max since they are not the defaults.
n <- 200
set.seed(42)
df <- data.frame(x1 = runif(n, min = -1, max = 1),
x2 = runif(n, min = -1, max = 1)) # specify the min and max since it's not the default
# create a circular decision boundary of radius 0.7 units
# the categorical variable y is -1 or +1 depending on whether a point lies within or outside the boundary
radius <- 0.7
radius_squared <- radius^2
#categorize data points depending on location wrt boundary
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
levels = c(-1, 1))
# build plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
scale_color_manual(values = c("-1" = "red", "1" = "blue"))
p
# adding a circular boundary
# create a function to generate a circle
circle <- function(x1_center, x2_center, r, npoint = 100) {
# npoint angles equally spaced from 0 to 2*pi (0 to 360 degrees)
theta <- seq(0, 2*pi, length.out = npoint)
x1_circ <- x1_center + r*cos(theta)
x2_circ <- x2_center + r*sin(theta)
data.frame(x1c = x1_circ, x2c = x2_circ) # define x1c and x2c
}
boundary <- circle(x1_center = 0, x2_center = 0, r = radius)
# add boundary to previous plot
p <- p + geom_path(data = boundary,
aes(x = x1c, y = x2c), # x1c and x2c previously defined
inherit.aes = FALSE)
p
The equation of the boundary is \(x_{1}^{2} + x_{2}^{2} = 0.49\).
Map \(x_{1}^{2}\) to a new variable \(X_{1}\) and \(x_{2}^{2}\) to \(X_{2}\).
The equation of the boundary in the \(X_{1}\)-\(X_{2}\) space becomes a line: \[X_{1} + X_{2} = 0.49\]
Rewriting the boundary as \(X_{2} = -X_{1} + 0.49\), the slope is \(-1\) and the y-intercept is \(0.49\).
df4 <- data.frame(x1sq = df$x1^2, x2sq = df$x2^2, Y = df$y)
# plot the X1 - X2 space
p <- ggplot(data = df4, aes(x = x1sq, y = x2sq, color = Y)) +
geom_point() +
scale_color_manual(values = c("red", "blue")) +
geom_abline(slope = -1, intercept = 0.49)
p # transformed space
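As a quick sanity check (my addition, not part of the original notes), a linear SVM fitted on the squared features should now separate the two classes almost perfectly:
# sketch: linear SVM in the transformed (x1^2, x2^2) space
svm_sq <- svm(Y ~ x1sq + x2sq, data = df4,
              type = "C-classification", kernel = "linear")
mean(predict(svm_sq, df4) == df4$Y) # expect accuracy close to 1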
Polynomial kernel: (gamma * (u . v) + coef0)^degree, where gamma and coef0 are tuning parameters, and u and v are vectors (data points) from the dataset.
The mathematical formulation of SVMs requires transformations with specific properties. Functions satisfying these properties are called kernel functions. Kernel functions are generalizations of vector dot products. The basic idea is to use a kernel that separates the data well.
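To make the idea of a generalized dot product concrete, here is a small numerical check (my addition, not from the original notes) for a degree-2 polynomial kernel with gamma = 1 and coef0 = 0: the kernel value equals an ordinary dot product of explicitly transformed features.
# K(u, v) = (u . v)^2 equals the dot product of phi(x) = (x1^2, x2^2, sqrt(2)*x1*x2)
u <- c(0.3, -0.8)
v <- c(0.5, 0.2)
(sum(u * v))^2 # kernel value
phi <- function(x) c(x[1]^2, x[2]^2, sqrt(2) * x[1] * x[2])
sum(phi(u) * phi(v)) # same value, computed via the explicit transformation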
df <- data.frame(x1 = runif(n, min = -1, max = 1),
x2 = runif(n, min = -1, max = 1)) # specify the min and max since it's not the default
# create a circular decision boundary of radius 0.7 units
# the categorical variable y is -1 or +1 depending on whether a point lies within or outside the boundary
radius <- 0.7
radius_squared <- radius^2
#categorize data points depending on location wrt boundary
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
levels = c(-1, 1))
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
###
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2)
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9428571
plot(svm_model, trainset)
Tuning: systematically find the optimal parameter values
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
                     type = "C-classification", kernel = "polynomial", degree = 2,
                     cost = 10^(-1:2), gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
# details are important: a typo such as "cost = 10^2(-1:2)" causes the error "attempt to apply non-function"
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 1
### With the optimal values, re-fit the SVM
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "polynomial", degree = 2,
                 cost = tune_out$best.parameters$cost,
                 gamma = tune_out$best.parameters$gamma,
                 coef0 = tune_out$best.parameters$coef0)
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9939394
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9714286
plot(svm_model, trainset)
Now x1 and x2 are distributed differently: x1 is normally distributed, while x2 is uniformly distributed (all outcomes between -1 and 1 are equally likely).
n <- 600
set.seed(42)
df <- data.frame(x1 = rnorm(n, mean = -0.5, sd = 1),
x2 = runif(n, min = -1, max = 1))
# Generate boundary, radius and centers
radius <- 0.7 # r
radius_squared <- radius^2
center_1 <- c(0.7, 0)
center_2 <- c(-0.7, 0) # the centers are 1.4 apart
# whether a point lies in either of the two circles or not
df$y <- factor(
ifelse(
(df$x1 - center_1[1])^2 + (df$x2 - center_1[2])^2 < radius_squared | # point is within circle 1
(df$x1 - center_2[1])^2 + (df$x2 - center_2[2])^2 < radius_squared, # or point is within circle 2
-1, 1), # inside either circle: -1; otherwise: +1
levels = c(-1, 1))
### plot
p <- ggplot(data = df, aes(x = x1, y = x2, color = y)) +
geom_point() +
guides(color = "none") +
scale_color_manual(values = c("red", "blue"))
p
# add decision boundary
circle <- function(x1_center, x2_center, r, npoint = 100) {
theta <- seq(0, 2*pi, length.out = npoint)
x1_circ <- x1_center + r * cos(theta)
x2_circ <- x2_center + r * sin(theta)
data.frame(x1c = x1_circ, x2c = x2_circ)
}
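The circle() helper is redefined here but not drawn; a short sketch (my addition) of how the two class boundaries could be overlaid on the plot p:
# sketch: overlay both circular class boundaries on the scatterplot
boundary_1 <- circle(x1_center = center_1[1], x2_center = center_1[2], r = radius)
boundary_2 <- circle(x1_center = center_2[1], x2_center = center_2[2], r = radius)
p + geom_path(data = boundary_1, aes(x = x1c, y = x2c), inherit.aes = FALSE) +
geom_path(data = boundary_2, aes(x = x1c, y = x2c), inherit.aes = FALSE)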
set.seed(1)
# assign rows to training/test sets randomly in 80/20 proportion
df[,"train"] <- ifelse(runif(nrow(df)) < 0.8, 1, 0) # another way to randomly select
# separate training and test sets
trainset <- df[df$train == 1, ]
trainset <- trainset[, !names(trainset) %in% c("train")]
testset <- df[df$train == 0, ]
testset <- testset[, !names(testset) %in% c("train")]
The RBF kernel takes a heuristic approach, akin to the k-nearest neighbors algorithm: a point's influence should decrease with its distance from the point being classified. For a given point in the dataset, \(X_{1} = (a, b)\), a simple function with this property is exp(-gamma * r), where r is the distance between \(X_{1}\) and any other point \(X\).
The rbf function below is a decreasing function of the distance between two points in the dataset, mimicking the k-NN algorithm. (The radial kernel actually used by e1071 is exp(-gamma * |u - v|^2), a decreasing function of the squared distance.)
rbf <- function(r, gamma) exp(-gamma * r) # rbf(r, gamma) = exp(-gamma * r), a decreasing function of r
ggplot(data.frame(r = c(0, 10)), aes(r)) +
stat_function(fun = rbf, args = list(gamma = 0.2), aes(color = "0.2")) +
stat_function(fun = rbf, args = list(gamma = 0.4), aes(color = "0.4")) +
stat_function(fun = rbf, args = list(gamma = 0.6), aes(color = "0.6")) +
stat_function(fun = rbf, args = list(gamma = 0.8), aes(color = "0.8")) +
stat_function(fun = rbf, args = list(gamma = 1), aes(color = "1")) +
stat_function(fun = rbf, args = list(gamma = 2), aes(color = "2")) +
scale_color_manual("gamma", values = c("red", "orange", "yellow", "green", "blue", "violet")) +
ggtitle("Radial basis function (gamma = 0.2 to 2)")
## Build a RBF kernel model
svm_model <- svm(y ~ ., data = trainset,
type = "C-classification",
kernel = "radial")
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9256198
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9310345
plot(svm_model, trainset)
# refining the decision boundary
# tune parameters
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3], gamma = 5*10 ^ (-2:2),
cost = c(0.01, 0.1, 1, 10, 100),
type = "C-classification",
kernel = "radial")
tune_out$best.parameters$cost
## [1] 100
tune_out$best.parameters$gamma
## [1] 5
# refit model with best parameters
svm_model <- svm(y ~ ., data = trainset, type = "C-classification", kernel = "radial",
cost = tune_out$best.parameters$cost, gamma = tune_out$best.parameters$gamma)
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9979339
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9741379
plot(svm_model, trainset) # the tuned model now follows the two circular regions closely
Note: the RBF kernel is local in nature; each training point influences the decision boundary mainly in its own neighborhood.
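A small illustration of that locality (my addition), reusing the rbf() helper from above with the tuned gamma = 5 and treating r as the squared distance, as in e1071's radial kernel:
# influence of a training point at squared distances 0.01, 0.25, 1, and 4
rbf(r = c(0.01, 0.25, 1, 4), gamma = 5)
# the values drop from roughly 0.95 to essentially 0, so only nearby points shape the fit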