Use the ksvm method to perform SVM classification on a data set of your choice (typical examples are the Ionosphere data set or the Breast Cancer Wisconsin (Diagnostic) data set).
The dataset used is the Ionosphere data set. This radar data was collected by a system in Goose Bay, Labrador. It contains 351 observations and 34 variables, the last variable being a class: good or bad. “Good” radar returns are those showing evidence of some type of structure in the ionosphere. “Bad” returns are those that do not; their signals pass through the ionosphere.
In this work, SVMs with linear, polynomial and RBF kernels were used, with the following parameters:
| Kernel type | Cost | Epsilon | Kernel type | Cost | Epsilon | Kernel type | Cost | Epsilon |
|---|---|---|---|---|---|---|---|---|
| Linear | 1 | 0.001 | Polynomial | 1 | 0.001 | RBF | 1 | 0.001 |
| Linear | 1 | 0.01 | Polynomial | 1 | 0.01 | RBF | 1 | 0.01 |
| Linear | 1 | 0.1 | Polynomial | 1 | 0.1 | RBF | 1 | 0.1 |
| Linear | 10 | 0.001 | Polynomial | 10 | 0.001 | RBF | 10 | 0.001 |
| Linear | 10 | 0.01 | Polynomial | 10 | 0.01 | RBF | 10 | 0.01 |
| Linear | 10 | 0.1 | Polynomial | 10 | 0.1 | RBF | 10 | 0.1 |
In order to compare the performance of the mentioned methods, 10-fold cross-validation was repeated 5 times. The 5 average accuracies for each method are presented as boxplots. It is important to see the variance of the accuracy: whether it varies a little or a lot when the experiment is repeated.
It can be seen that the RBF kernel performs considerably better. With these plots it is also possible to tune the parameters of the model.
The best parameters for this data are:
Linear kernel SVM with C=10 and e=0.001
Polynomial kernel SVM with C=1 and e=0.001
RBF kernel SVM with C=10 and e=0.001 (gives out the highest accuracy)
The RBF kernel with these parameters is further used to build another model on 80% of the dataset and validate it on the 20%.
## Warning: package 'kernlab' was built under R version 3.2.5
## [1] "Confusion matrix"
## prediction
## t_df_test -1 1
## -1 20 1
## 1 0 50
## [1] "Accuracy"
## [1] 0.9859155
K-fold cross-validation is useful for evaluating the models and tuning the parameters. Parameter evaluation could have been automated to cover more than just 2 values of C and 3 values of epsilon, but the execution time would rise; even now we are building the model \(18 \times 10 \times 5\) times.
The variance of the accuracy has to be taken into account, because the methods that give small variance on the accuracy are preferred over those that have greater accuracy, but also greater variance. The accuracy of
## [1] 0.9859155
has been achieved after the tuning.
#install.packages("kernlab")
library(kernlab)
#################################
# Problem Set 3
# Classifying the ionosphere data
#################################
##################
# Reading the data
##################
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data"
ion_data <- read.table(url, sep = ",")
# looking at the data and removing V2 as it contains only 0
# head(ion_data)
# summary(ion_data)
ion_data <- ion_data[, -2]
# renaming the classes: V35 holds the label, "g" becomes 1 and "b" becomes -1
label_codes <- c(g = 1, b = -1)
ion_data$V36 <- unname(label_codes[as.character(ion_data$V35)])
# fail fast on an unexpected label instead of silently carrying NA classes
# into the classifier
stopifnot(!anyNA(ion_data$V36))
# this is the dataset we are going to work with:
# every feature column followed by the numeric Class column
feature_cols <- setdiff(names(ion_data), c("V35", "V36"))
dataset <- cbind(ion_data[feature_cols], Class = ion_data$V36)
#########################
# Training the classifier
#########################
# settings for the 10-fold cross validation
k <- 10                    # number of folds
N <- nrow(dataset)         # number of observations
valid.error <- numeric(k)  # one (initially zero) validation error per fold
# Runs one randomly partitioned k-fold cross validation of an SVM classifier
# and returns the average validation error (misclassification rate).
#
# Arguments:
#   parameter  kernel type, cost C and epsilon, in that order; typically one
#              row of the `parameters` data frame. The kernel type must be
#              "linear", "poly" or "RBF".
#   data       data frame holding the features and a `Class` column
#              (defaults to the global `dataset`).
#   n_folds    number of folds (defaults to the global `k`).
#
# NOTE(review): per the kernlab documentation `epsilon` is the parameter of
# the insensitive-loss function of the regression / eps-bsvm formulations, so
# it presumably has no effect on the classifier fitted here -- confirm.
# NOTE(review): "poly" uses polydot with kernlab's default kernel parameters
# (degree 1 according to the documentation) -- confirm this is intended.
train.svm <- function(parameter, data = dataset, n_folds = k) {
  # `[[` extracts plain scalars from a one-row data frame, a list or a vector
  which.kernel <- as.character(parameter[[1]])
  mycost <- as.numeric(parameter[[2]])
  epsilon <- as.numeric(parameter[[3]])
  # full kernlab kernel names; unknown kernel types are rejected up front
  kernel.name <- switch(which.kernel,
    linear = "vanilladot",
    poly = "polydot",
    RBF = "rbfdot",
    stop("unknown kernel type: ", which.kernel, call. = FALSE)
  )
  n_obs <- nrow(data)
  stopifnot(n_folds >= 2, n_obs >= n_folds)
  # each time we are sampling in a different way
  folds <- sample(rep(seq_len(n_folds), length.out = n_obs), n_obs,
                  replace = FALSE)
  fold.error <- numeric(n_folds)
  is.feature <- names(data) != "Class"
  # k-fold validation
  for (i in seq_len(n_folds)) {
    train <- data[folds != i, ] # for building the model (training)
    valid <- data[folds == i, ] # for prediction (validation)
    # no `cross` argument: validation is done by this loop, so ksvm's internal
    # cross validation would only add fitting time
    model <- ksvm(as.factor(Class) ~ ., data = train, kernel = kernel.name,
                  C = mycost, epsilon = epsilon)
    pred <- predict(model, valid[, is.feature, drop = FALSE])
    # compare as labels so the factor predictions and the numeric classes
    # are matched explicitly; this is the validation error for part 'i'
    fold.error[i] <- mean(as.character(pred) != as.character(valid$Class))
  }
  # return average validation error
  mean(fold.error)
}
############################################
# Building up the result df and parameter df
############################################
# the k-fold validation is repeated this many times per configuration
times <- 5
# the grid: 3 kernels x 2 costs x 3 epsilons = 18 configurations,
# kernel varying slowest and epsilon fastest
kernel_types <- c("linear", "poly", "RBF")
cost_values <- c(1, 10)
epsilon_values <- c(0.001, 0.01, 0.1)
epsilon_tags <- c("001", "01", "1")
n_per_kernel <- length(cost_values) * length(epsilon_values)
n_configs <- length(kernel_types) * n_per_kernel
# this is the dataframe of parameters used
parameters <- data.frame(
  name = rep(kernel_types, each = n_per_kernel),
  c = rep(rep(cost_values, each = length(epsilon_values)),
          times = length(kernel_types)),
  e = rep(epsilon_values, times = n_configs / length(epsilon_values)),
  stringsAsFactors = FALSE
)
# this is the dataframe of errors in all the methods used:
# a label column followed by one error column per repetition
VA.error <- as.data.frame(matrix(nrow = n_configs, ncol = times + 1))
colnames(VA.error)[1] <- "name"
VA.error$name <- paste0(
  parameters$name, "_C", parameters$c, "_e",
  rep(epsilon_tags, times = n_configs / length(epsilon_tags))
)
# for each row in parameters, repeat 10-fold cross validation 5 times and
# write the average errors into VA.error
# this is going to take 5-10 mins and repeat 18*5 times
for (j in seq_len(nrow(parameters))) {
  print(j)
  for (i in seq_len(times)) {
    print(i)
    VA.error[j, i + 1] <- train.svm(parameters[j, ])
  }
}
# accuracies are the complements of the errors (column 1 holds the labels)
accuracy <- VA.error
score_cols <- 2:ncol(accuracy)
accuracy[, score_cols] <- 1 - accuracy[, score_cols]
# one boxplot per kernel family, one box per parameter configuration
plot_groups <- list(
  list(rows = 1:6, title = "SVM with linear kernel"),
  list(rows = 7:12, title = "SVM with polynomial kernel"),
  list(rows = 13:18, title = "SVM with RBF kernel")
)
for (grp in plot_groups) {
  boxplot(t(accuracy[grp$rows, score_cols]), las = 2,
          names = accuracy[grp$rows, 1], main = grp$title)
}
# Final check: fit the SVM with RBF kernel (C = 1, epsilon = 0.01) on 80% of
# the data and evaluate it on the remaining 20%.
# NOTE(review): the accompanying write-up names C = 10, epsilon = 0.001 as the
# best RBF setting, while this code fits C = 1, epsilon = 0.01 -- confirm
# which one is intended.
smp_size <- floor(0.80 * nrow(dataset))
# set the seed to make the partition reproducible
# divide into training and testing parts
set.seed(123)
train_ind <- sample(seq_len(nrow(dataset)), size = smp_size)
df_train <- dataset[train_ind, ]
df_test <- dataset[-train_ind, ]
# the class is the last column, everything before it is a feature
class_col <- ncol(dataset)
x_df_train <- df_train[, -class_col]
t_df_train <- df_train[, class_col]
x_df_test <- df_test[, -class_col]
t_df_test <- df_test[, class_col]
# run the model and calculate the predictions
# no `cross` argument: the model is judged on the hold-out part below, so
# ksvm's internal cross validation would only add fitting time
model_final <- ksvm(as.factor(Class) ~ ., data = df_train, kernel = "rbfdot",
                    C = 1, epsilon = 0.01)
prediction <- predict(model_final, newdata = x_df_test)
# confusion matrix (rows: true class, columns: predicted class)
conf_mat <- table(t_df_test, prediction)
# hold-out accuracy: fraction of correctly classified test observations,
# computed from the labels so it does not depend on the table being square
mean(as.character(prediction) == as.character(t_df_test))