# Load the built-in iris data set and inspect its structure
data <- iris
str(data)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Scatter plot of petal width against petal length, colored by species
library(ggplot2)
# ggplot() + geom_point() draws the same plot as the former qplot() call;
# qplot() is deprecated in current ggplot2 releases
ggplot(data, aes(x = Petal.Length, y = Petal.Width, colour = Species)) +
  geom_point()
We can see three classes with different colors.
# SVM model with the default kernel ("radial", see the summary below)
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
# The model is fitted on the full data set, without a train/test partition,
# for illustration purposes only
model <- svm(Species ~ ., data = data)
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = data)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Classification plot in the Petal.Length / Petal.Width plane; the two sepal
# variables are held fixed at the values given in `slice`
plot(model, data = data,
     Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Prediction and confusion matrix, evaluated on the same data the model was
# fitted on
p <- predict(model, data)
# Take the true labels from `data` (not `iris`) so predictions and labels
# always come from the same data frame
tab <- table(Predicted = p, Actual = data$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
# Accuracy: share of observations on the diagonal of the confusion matrix
sum(diag(tab)) / sum(tab)
## [1] 0.9733333
# Misclassification error: 1 - accuracy
1 - sum(diag(tab)) / sum(tab)
## [1] 0.02666667
By default, the above model was built using the “radial” kernel function. Its misclassification error on the entire data set is about 2.7%.
# SVM model using the "linear" kernel function
model <- svm(Species ~ ., data = data, kernel = "linear")
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = data, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 29
##
## ( 2 15 12 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Classification plot in the Petal.Length / Petal.Width plane; the two sepal
# variables are held fixed at the values given in `slice`
plot(model, data = data,
     Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Prediction and confusion matrix, evaluated on the same data the model was
# fitted on
p <- predict(model, data)
# Take the true labels from `data` (not `iris`) so predictions and labels
# always come from the same data frame
tab <- table(Predicted = p, Actual = data$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 46 1
## virginica 0 4 49
# Accuracy: share of observations on the diagonal of the confusion matrix
sum(diag(tab)) / sum(tab)
## [1] 0.9666667
# Misclassification error: 1 - accuracy
1 - sum(diag(tab)) / sum(tab)
## [1] 0.03333333
With the “linear” kernel function, the misclassification error on the entire data set is about 3.3%. This is slightly larger than the error of the radial model above.
# SVM model using the "polynomial" kernel function
model <- svm(Species ~ ., data = data, kernel = "polynomial")
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = data, kernel = "polynomial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## gamma: 0.25
## coef.0: 0
##
## Number of Support Vectors: 54
##
## ( 6 26 22 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Classification plot in the Petal.Length / Petal.Width plane; the two sepal
# variables are held fixed at the values given in `slice`
plot(model, data = data,
     Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Prediction and confusion matrix, evaluated on the same data the model was
# fitted on
p <- predict(model, data)
# Take the true labels from `data` (not `iris`) so predictions and labels
# always come from the same data frame
tab <- table(Predicted = p, Actual = data$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 7
## virginica 0 0 43
# Accuracy: share of observations on the diagonal of the confusion matrix
sum(diag(tab)) / sum(tab)
## [1] 0.9533333
# Misclassification error: 1 - accuracy
1 - sum(diag(tab)) / sum(tab)
## [1] 0.04666667
With the “polynomial” kernel function, the misclassification error on the entire data set is even higher, at about 4.7%, than for the two models above.
# SVM model using the "sigmoid" kernel function
model <- svm(Species ~ ., data = data, kernel = "sigmoid")
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = data, kernel = "sigmoid")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## gamma: 0.25
## coef.0: 0
##
## Number of Support Vectors: 54
##
## ( 6 26 22 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Classification plot in the Petal.Length / Petal.Width plane; the two sepal
# variables are held fixed at the values given in `slice`
plot(model, data = data,
     Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Prediction and confusion matrix, evaluated on the same data the model was
# fitted on
p <- predict(model, data)
# Take the true labels from `data` (not `iris`) so predictions and labels
# always come from the same data frame
tab <- table(Predicted = p, Actual = data$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 49 0 0
## versicolor 1 41 7
## virginica 0 9 43
# Accuracy: share of observations on the diagonal of the confusion matrix
sum(diag(tab)) / sum(tab)
## [1] 0.8866667
# Misclassification error: 1 - accuracy
1 - sum(diag(tab)) / sum(tab)
## [1] 0.1133333
Misclassification error is 11.3% using ‘sigmoid’ kernel function. This error is significantly larger than all the above models.
Overall, the ‘radial’ kernel function gave the lowest misclassification error of the four kernels.
# Hyperparameter tuning; svm() is called with its default "radial" kernel
set.seed(123)
# Only `cost` is tuned. `epsilon` is the parameter of the insensitive-loss
# function used by regression SVMs and has no effect on C-classification, so
# a grid over it would only repeat identical fits for every cost value.
# NOTE(review): with a single tuned parameter, plot(tmodel) shows error
# against cost instead of a two-parameter surface.
tmodel <- tune(svm, Species ~ ., data = data,
               ranges = list(cost = 2^(2:7)))
plot(tmodel)
# Choose the best model found by the grid search
mymodel <- tmodel$best.model
summary(mymodel)
##
## Call:
## best.tune(method = svm, train.x = Species ~ ., data = data, ranges = list(cost = 2^(2:7)))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 8
## gamma: 0.25
##
## Number of Support Vectors: 35
##
## ( 6 15 14 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Classification plot in the Petal.Length / Petal.Width plane; the two sepal
# variables are held fixed at the values given in `slice`
plot(mymodel, data = data,
     Petal.Width ~ Petal.Length,
     slice = list(Sepal.Width = 3, Sepal.Length = 4))
# Prediction and confusion matrix, evaluated on the same data the model was
# fitted on
p <- predict(mymodel, data)
# Take the true labels from `data` (not `iris`) so predictions and labels
# always come from the same data frame
tab <- table(Predicted = p, Actual = data$Species)
tab
## Actual
## Predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 0
## virginica 0 2 50
# Accuracy: share of observations on the diagonal of the confusion matrix
sum(diag(tab)) / sum(tab)
## [1] 0.9866667
# Misclassification error: 1 - accuracy
1 - sum(diag(tab)) / sum(tab)
## [1] 0.01333333
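Finally, after tuning the model, the misclassification error has roughly halved, to about 1.3% (2 of 150 observations) from about 2.7% for the first model, which was built with the same “radial” kernel function. Note that both figures are measured on the data the models were fitted on, so they are optimistic estimates of the error on new data.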
Thanks!!!