# ---- Session reset and memory diagnostics ----
# Show what is in the workspace, remove every object, then run the garbage
# collector so the memory figures printed below start from a clean state.
# NOTE(review): rm(list = ls()) and the hard-coded setwd() further down tie
# this script to one interactive session on one machine; both are kept here
# unchanged so the script behaves exactly as before.
ls()
## character(0)
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 358620 19.2 592000 31.7 460000 24.6
## Vcells 549686 4.2 1023718 7.9 839610 6.5
# NOTE(review): memory.size()/memory.limit() are Windows-specific helpers --
# confirm they are still wanted on the R version in use.
memory.size()
## [1] 30.12
memory.limit()
## [1] 4027

# ---- Packages (attached in the same relative order as before) ----
library(kernlab)
library(data.table)
library(e1071)

# ---- Data ----
# "training.txt" is resolved relative to the working directory set here.
setwd("C:\\Kamal\\Learning and Entertainment\\Data Science and Analytics\\Business Analytics\\Assignment 9 SVM\\track2")
# Only the first 1000 rows are read.
dtTraining <- fread("training.txt", nrows = 1000)
str(dtTraining)
## Classes 'data.table' and 'data.frame': 1000 obs. of 12 variables:
## $ V1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V2 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ V3 : num 4.30e+18 4.86e+18 9.70e+18 1.37e+19 3.28e+18 ...
## $ V4 : int 7686695 21560664 21748480 3517124 20758093 21375650 4427028 4428493 20945590 21406020 ...
## $ V5 : int 385 37484 36759 23778 34535 36832 28647 28647 35083 36943 ...
## $ V6 : int 3 2 3 3 1 2 3 2 2 2 ...
## $ V7 : int 3 2 3 1 1 1 1 2 1 2 ...
## $ V8 : int 1601 2255103 4532751 1601 4532751 4688625 4532751 13171922 35143 4688625 ...
## $ V9 : int 5521 317 60721 2155 77819 202465 720719 1493 28111 202465 ...
## $ V10: int 7709 48989 685038 1207 266618 457316 3402221 11658 151695 1172072 ...
## $ V11: int 576 44771 29681 1422 222223 429545 2663964 5668 128782 973354 ...
## $ V12: int 490234 490234 490234 490234 490234 490234 490234 490234 490234 490234 ...
## - attr(*, ".internal.selfref")=<externalptr>
# NOTE(review): attach() places the columns of dtTraining on the search path.
# Other statements in this script refer to V6/V7 by bare name, so the call is
# kept; new code should prefer dtTraining$V6 / dtTraining$V7.
attach(dtTraining)
## regression mode
# V6 is an integer column (not a factor), so svm() fits a regression model:
# the printed SVM-Type below is eps-regression.
# Formula interface:
model <- svm(V6 ~ V7, data = dtTraining)
# Traditional x/y interface; this fit replaces the one above.
y <- dtTraining$V6
x <- subset(dtTraining, select = V7)
model <- svm(x, y)
print(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 1
## epsilon: 0.1
##
##
## Number of Support Vectors: 738
# Print the summary of the fitted SVM (call, parameters, support-vector count).
summary(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 1
## epsilon: 0.1
##
##
## Number of Support Vectors: 738
# Predict on the same rows that were used to fit the model.
pred <- predict(model, newdata = x)
# Cross-tabulate the predictions against the observed response.
# NOTE(review): the model is a regression, so pred holds continuous values;
# this table is a rough sanity check rather than a classification accuracy.
table(pred = pred, y = y)
## y
## pred 1 2 3
## 1.92749199275473 289 233 74
## 2.07226625095115 0 244 79
## 2.92797550534338 0 0 81
# Predict again, this time asking for the decision values, which predict()
# attaches to its result as the "decision.values" attribute.
# (No class probabilities are requested in this call.)
predictedY <- predict(model, x, decision.values = TRUE)
decisionValues <- attr(predictedY, "decision.values")
# Show the decision values of the first four rows.
decisionValues[1:4, ]
## 1 2 3 4
## 1.36249278 0.17640251 1.36249278 -0.02426768
# Visualise the training rows in two dimensions with classical MDS.
# Colour encodes the response V6; support vectors are drawn as "+" and all
# other rows as "o".
#
# Fixes relative to the earlier version:
#  * the row index was hard-coded as 1:150 (left over from the iris example);
#    it now follows the actual number of rows in dtTraining, so the marker
#    vector has one entry per plotted point;
#  * dtTraining[,5] does not return a plain vector for a data.table, so the
#    colour vector is now taken explicitly from the response column.
isSupportVector <- seq_len(nrow(dtTraining)) %in% model$index
plot(cmdscale(dist(dtTraining)),
     col = as.integer(dtTraining$V6),
     pch = c("o", "+")[isSupportVector + 1])
## Verification of the predictions
#' Root mean squared error.
#'
#' @param error Numeric vector of residuals (observed minus predicted).
#' @return A single number: the square root of the mean of the squared
#'   residuals.
rmse <- function(error) {
  meanSquared <- mean(error^2)
  sqrt(meanSquared)
}
# Residuals of the fit: observed response minus predicted response.
# The model was trained with V6 as the response and V7 as the only predictor,
# so the error has to be measured against V6. (It was previously computed
# against the predictor V7, which is not a prediction error.)
error <- dtTraining$V6 - predictedY
predictionRMSE <- rmse(error)
predictionRMSE
## NOTE: the output recorded earlier ([1] 0.7175032) was computed against V7;
## re-run the script to obtain the RMSE against V6.
library(e1071)
# Refit the same regression (response V6, predictor V7) through the formula
# interface and predict on the training rows.
model <- svm(V6 ~ V7, data = dtTraining)
predictedY <- predict(model, dtTraining)
# Overlay the predictions as red crosses on the currently open plot.
# Columns are referenced explicitly instead of relying on attach().
points(dtTraining$V7, predictedY, col = "red", pch = 4)

# Error computation: observed response (V6) minus prediction. The predictor
# V7 was used here before, which does not measure prediction error.
error <- dtTraining$V6 - predictedY
svrPredictionRMSE <- rmse(error)
svrPredictionRMSE
## NOTE: the output recorded earlier ([1] 0.7175032) was computed against V7;
## re-run the script to obtain the RMSE against V6.
# Coarse grid search with tune(): epsilon from 0 to 1 in steps of 0.1,
# cost over 2^2 ... 2^9.
coarseGrid <- list(
  epsilon = seq(from = 0, to = 1, by = 0.1),
  cost = 2^(2:9)
)
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining, ranges = coarseGrid)
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.4 4
##
## - best performance: 0.3555533
plot(tuneResult)

# Second pass: narrow epsilon to 0.30 ... 0.50 in steps of 0.01, around the
# best value of the coarse search; the cost grid is unchanged.
refinedGrid <- list(
  epsilon = seq(from = 0.3, to = 0.5, by = 0.01),
  cost = 2^(2:9)
)
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining, ranges = refinedGrid)
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.41 64
##
## - best performance: 0.3556127
plot(tuneResult)

# Third pass: epsilon restricted to 0.38 ... 0.42 in steps of 0.01; the cost
# grid is unchanged.
finalGrid <- list(
  epsilon = seq(from = 0.38, to = 0.42, by = 0.01),
  cost = 2^(2:9)
)
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining, ranges = finalGrid)
print(tuneResult)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## epsilon cost
## 0.41 64
##
## - best performance: 0.3555838
plot(tuneResult)

# Take the best model found by the most recent grid search and score it on
# the training rows.
tunedModel <- tuneResult$best.model
tunedModelY <- predict(tunedModel, newdata = dtTraining)
# Residuals against the observed response V6.
error <- dtTraining$V6 - tunedModelY
# The value below can differ from run to run, because tune() shuffles the
# data randomly.
tunedModelRMSE <- rmse(error)
tunedModelRMSE
## [1] 0.5962719