Machine learning intro in R: Support Vector Regression

Generate some random data

# Simulate a 75-step random walk: `x` is the step index and `y` is the
# running sum of standard-normal increments.
x <- seq_len(75)
y <- cumsum(rnorm(n = length(x)))
#' Scatter plot of `y` against `x` with a dashed connecting line and a grid.
#'
#' @param x Values for the horizontal axis.
#' @param y Values for the vertical axis.
#' @return Whatever `grid()` returns; called for its plotting side effect.
makePlot <- function(x, y) {
  # Black diamond markers for the individual observations.
  plot(x, y, col = "black", pch = 5, lwd = 1)
  # Dashed line joining consecutive observations.
  lines(x, y, lty = 2, lwd = 2)
  grid()
}
# Draw the raw series and label the figure.
makePlot(x, y)
title(main = "original data")

Linear regression

# Assemble predictor and response into a data frame named `Data`.
# The columns are passed directly instead of through cbind(), which would
# first coerce both vectors into a single numeric matrix.
Data <- data.frame(x = x, y = y)

# Fit a linear regression of y on x.
linregress_model <- lm(y ~ x, data = Data)

# Fitted value of the linear model at every observed x.
predictYlinregress <- predict(linregress_model, newdata = Data)

# Show the original data with the fitted regression line overlaid in red.
makePlot(x, y)
title("original data + linear regression")
abline(linregress_model, col = "red")

Root mean squared error function

#' Root mean squared error.
#'
#' @param errval Numeric vector of errors (residuals).
#' @return The square root of the mean of the squared errors.
rmse <- function(errval) {
  squared_errors <- errval^2
  sqrt(mean(squared_errors))
}

Error in linear regression model

# Residuals of the linear fit, i.e. Data$y - predictYlinregress.
errval <- residuals(linregress_model)
linregress_RMSE <- rmse(errval)
# The label names the linear regression model (it previously read
# "logregress", which does not match the model being reported).
print(paste('linregress RMSE = ',
            linregress_RMSE))
## [1] "linregress RMSE =  1.4337480655808"

Support vector regression

#install.packages("e1071")
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
# Fit a support vector regression of y on x, leaving every svm() option at
# its default.
svm_model <- svm(y ~ x, data = Data)

# Model predictions at every observed x.
predictYsvm <- predict(svm_model, newdata = Data)

# Compare visually: linear fit in red, SVR fit as blue crosses joined by a
# blue line.
makePlot(x, y)
title("original data + linear regression + svr")
abline(linregress_model, col = "red")
points(Data$x, predictYsvm, col = "blue", pch = 4)
lines(Data$x, predictYsvm, col = "blue")

#### Error in svr

# Residuals and RMSE of the untuned SVR fit.
errval <- Data$y - predictYsvm
svr_RMSE <- rmse(errval)
print(paste("svr RMSE = ", svr_RMSE))
## [1] "svr RMSE =  1.03189964199952"

Tune SVM regression model

Identify 'best' parameters

# Coarse grid search over epsilon (0 to 1 in steps of 0.1) and cost
# (2^0.5 to 2^8, with the exponent in steps of 0.5).
# This might take a few seconds; use a coarser grid if it takes too long.
tuneResult1 <- tune(
  svm, y ~ x,
  data = Data,
  ranges = list(
    epsilon = seq(0, 1, by = 0.1),
    cost = 2^seq(0.5, 8, by = 0.5)
  )
)

# Map the tuning results over the parameter grid.
plot(tuneResult1)

#### Finer grid

# Refine the search on a finer grid centred on the best model found by the
# coarse search.
coarse_best <- tuneResult1$best.model
tuneResult <- tune(
  svm, y ~ x,
  data = Data,
  ranges = list(
    # epsilon must not be negative. The coarse grid includes 0 and 0.1, so
    # the lower bound is clamped at 0 to keep the fine grid valid when the
    # coarse optimum lies below 0.15.
    epsilon = seq(max(0, coarse_best$epsilon - 0.15),
                  coarse_best$epsilon + 0.15,
                  by = 0.01),
    # Six evenly spaced costs from half to double the coarse optimum.
    cost = seq(coarse_best$cost / 2,
               coarse_best$cost * 2,
               length.out = 6)
  )
)

# Map the fine-grid tuning results.
plot(tuneResult)

Optimized/tuned result parameters:

# Print the tuning summary for `tuneResult` (sampling method, best
# parameters and best performance).
print(tuneResult)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##     0.26  512
## 
## - best performance: 0.8322872

Predict with the tuned model and visualize the results

# Best model found by the tuning run. Despite its name, `tunedVals` holds
# the fitted model object, which is then used to predict at every observed x.
tunedVals <- tuneResult$best.model
predictYsvm2 <- predict(tunedVals, newdata = Data)

# Compare all fits on one plot: linear fit in red, untuned SVR in blue,
# tuned SVR in green.
makePlot(x, y)
title("original data + linear regression + svr + tuned svm")
abline(linregress_model, col = "red")
points(Data$x, predictYsvm, col = "blue", pch = 4)
lines(Data$x, predictYsvm, col = "blue")
points(Data$x, predictYsvm2, col = "green", pch = 5)
lines(Data$x, predictYsvm2, col = "green")
# Give the legend the colour, line type and symbol of each series; with
# labels alone it draws no keys and the curves cannot be told apart.
legend("bottomleft",
       legend = c("Data", "Linear regress", "SVM regress", "tuned SVM regress"),
       col = c("black", "red", "blue", "green"),
       lty = c(2, 1, 1, 1),
       lwd = c(2, 1, 1, 1),
       pch = c(5, NA, 4, 5))

Note: The example shown here is for illustrative purposes only; in most cases, a regression fit like the tuned model in the previous plot would be severely overfitted.

Compare root mean squared error for different models

# Residuals and RMSE of the tuned SVR fit.
errval2 <- Data$y - predictYsvm2
svr_RMSE2 <- rmse(errval2)

# One-row table comparing the RMSE of the three models.
vals <- matrix(
  c(linregress_RMSE, svr_RMSE, svr_RMSE2),
  nrow = 1,
  dimnames = list(
    "RMSE of model",
    c("Lin regress  ", "SVM model  ", "Tuned SVM model ")
  )
)
as.table(vals)
##               Lin regress   SVM model   Tuned SVM model 
## RMSE of model     1.4337481   1.0318996        0.8043095