# start from a clean workspace and reclaim memory
ls()
## character(0)
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 358620 19.2     592000 31.7   460000 24.6
## Vcells 549686  4.2    1023718  7.9   839610  6.5
# memory.size() and memory.limit() report current usage and the allocation
# cap; both are Windows-only utilities
memory.size()
## [1] 30.12
memory.limit()
## [1] 4027
library(kernlab)
setwd("C:\\Kamal\\Learning and Entertainment\\Data Science and Analytics\\Business Analytics\\Assignment 9 SVM\\track2")

library(data.table)
# read only the first 1,000 rows of the raw training file for a quick pass
dtTraining <- fread("training.txt", nrows = 1000)
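
# (Assumption: if this is the KDD Cup 2012 Track 2 click log, the 12
# anonymous columns would be Click, Impression, DisplayURL, AdID,
# AdvertiserID, Depth, Position, QueryID, KeywordID, TitleID,
# DescriptionID and UserID.) A sketch of attaching those names, left
# commented out because the rest of the script addresses columns as V1..V12:
# setnames(dtTraining, c("Click", "Impression", "DisplayURL", "AdID",
#                        "AdvertiserID", "Depth", "Position", "QueryID",
#                        "KeywordID", "TitleID", "DescriptionID", "UserID"))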

library(e1071)
str(dtTraining)
## Classes 'data.table' and 'data.frame':   1000 obs. of  12 variables:
##  $ V1 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V2 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ V3 : num  4.30e+18 4.86e+18 9.70e+18 1.37e+19 3.28e+18 ...
##  $ V4 : int  7686695 21560664 21748480 3517124 20758093 21375650 4427028 4428493 20945590 21406020 ...
##  $ V5 : int  385 37484 36759 23778 34535 36832 28647 28647 35083 36943 ...
##  $ V6 : int  3 2 3 3 1 2 3 2 2 2 ...
##  $ V7 : int  3 2 3 1 1 1 1 2 1 2 ...
##  $ V8 : int  1601 2255103 4532751 1601 4532751 4688625 4532751 13171922 35143 4688625 ...
##  $ V9 : int  5521 317 60721 2155 77819 202465 720719 1493 28111 202465 ...
##  $ V10: int  7709 48989 685038 1207 266618 457316 3402221 11658 151695 1172072 ...
##  $ V11: int  576 44771 29681 1422 222223 429545 2663964 5668 128782 973354 ...
##  $ V12: int  490234 490234 490234 490234 490234 490234 490234 490234 490234 490234 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# attach() puts the columns on the search path so they can be referenced
# directly as V6, V7, ... below (with() or $ is generally the safer idiom)
attach(dtTraining)

## classification vs. regression mode
# svm() defaults to C-classification only when the response is a factor;
# V6 is an integer here, so it silently falls back to eps-regression,
# as the print(model) output below confirms:

model <- svm(V6 ~ V7, data = dtTraining)
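
# A minimal sketch of forcing classification mode by coercing the response
# to a factor (classModel is illustrative and not used below):
classModel <- svm(factor(V6) ~ V7, data = dtTraining)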

# alternatively the traditional interface:

x <- subset(dtTraining, select = V7)
y <- V6

model <- svm(x, y)


print(model)
## 
## Call:
## svm.default(x = x, y = y)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  738
summary(model)
## 
## Call:
## svm.default(x = x, y = y)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  1 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  738
# test on the training data itself
pred <- predict(model, x)


# Cross-tabulate predictions against y; the model is a regression, so the
# predictions are continuous and the row labels below are fractional:
table(pred, y)
##                   y
## pred                 1   2   3
##   1.92749199275473 289 233  74
##   2.07226625095115   0 244  79
##   2.92797550534338   0   0  81
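
# A hedged sketch: round the continuous predictions to the nearest integer
# class first to get a conventional confusion table (the dimnames are
# illustrative):
table(pred = round(pred), y = y)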
# compute decision values (class probabilities would additionally require
# fitting the model with probability = TRUE):
predictedY <- predict(model, x, decision.values = TRUE)
attr(predictedY, "decision.values")[1:4,]
##           1           2           3           4 
##  1.36249278  0.17640251  1.36249278 -0.02426768
# visualize with classical MDS: response V6 by color, support vectors as crosses
plot(cmdscale(dist(dtTraining)),
     col = as.integer(dtTraining[[6]]),
     pch = c("o", "+")[(1:nrow(dtTraining) %in% model$index) + 1])



## Verification of the predictions

# root-mean-square error of a vector of residuals
rmse <- function(error) {
  sqrt(mean(error^2))
}
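
# quick sanity check of the helper on a known input:
rmse(c(3, -4))  # sqrt((9 + 16) / 2) = sqrt(12.5), about 3.54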

error <- dtTraining$V6 - predictedY  # residuals: actual response minus prediction
predictionRMSE <- rmse(error) 

predictionRMSE
## [1] 0.7175032

model <- svm(V6 ~ V7, data = dtTraining)

predictedY <- predict(model, dtTraining)

# plot the raw data first, then overlay the fitted values in red
plot(V7, V6)
points(V7, predictedY, col = "red", pch = 4)

# error computation against the response V6
error <- V6 - predictedY
svrPredictionRMSE <- rmse(error)

svrPredictionRMSE
## [1] 0.7175032
# grid search over epsilon and cost: 11 x 8 = 88 parameter combinations,
# each scored by 10-fold cross-validation
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining,
                   ranges = list(epsilon = seq(0, 1, 0.1), cost = 2^(2:9)))
print(tuneResult)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.4    4
## 
## - best performance: 0.3555533
plot(tuneResult)
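
# Beyond the best point, the tune object also records the whole grid; a
# short sketch of inspecting it (error is the mean cross-validated MSE,
# dispersion its spread across folds):
head(tuneResult$performances)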

# retry with a finer epsilon grid around the previous optimum
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining,
                   ranges = list(epsilon = seq(0.3, 0.5, 0.01), cost = 2^(2:9)))

print(tuneResult)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##     0.41   64
## 
## - best performance: 0.3556127
plot(tuneResult)

# narrow the grid once more around epsilon = 0.41
tuneResult <- tune(svm, V6 ~ V7, data = dtTraining,
                   ranges = list(epsilon = seq(0.38, 0.42, 0.01), cost = 2^(2:9)))

print(tuneResult)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##     0.41   64
## 
## - best performance: 0.3555838
plot(tuneResult)

# extract the best model found by the grid search and score it

tunedModel <- tuneResult$best.model
tunedModelY <- predict(tunedModel, dtTraining) 

error <- V6 - tunedModelY  

# this value can be different on your computer
# because the tune method  randomly shuffles the data
tunedModelRMSE <- rmse(error)  

tunedModelRMSE
## [1] 0.5962719
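
# A follow-up sketch: overlay the tuned fit on the raw scatter to compare it
# visually with the red points from the untuned model above
plot(V7, V6, pch = 16, col = "grey")
points(V7, tunedModelY, col = "blue", pch = 4)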