Dasha Ivanova
11/12/2019
Required libraries:
library(ggplot2)
library(e1071) # svm
library(kernlab) # ksvm
library(caret) # createDataPartition and confusionMatrix functions
We generate the data:
# Number of observations
n <- 400
set.seed(1)
# Build a data frame with two uniformly distributed variables x1, x2
df <- data.frame(x1 = runif(n, min = -1, max = 1),
                 x2 = runif(n, min = -1, max = 1))
# Set the radius that defines the class boundary
radius <- 0.8
radius_squared <- radius^2
# Define the categorical dependent variable "y"
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
               levels = c(-1, 1))
# Plot the data
ggplot(data=df, aes(x=x1, y=x2, color=y)) +
  geom_point()
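Since the labels are defined by the circle x1^2 + x2^2 = 0.8^2, a useful sanity check is to overlay that true boundary on the scatter plot. A minimal sketch using only ggplot2, tracing the circle with annotate():
# Overlay the true decision boundary (circle of radius 0.8)
theta <- seq(0, 2*pi, length.out = 200)
ggplot(data=df, aes(x=x1, y=x2, color=y)) +
  geom_point() +
  annotate("path", x = radius*cos(theta), y = radius*sin(theta))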
We split the dataset into:
80% -> training data
20% -> test data
# Randomly assign 80% of the data to the training set
df[,"train"] <- ifelse(runif(nrow(df))<0.8,1,0)
# Split into the two sets: train and test
trainset <- df[df$train==1,]
testset <- df[df$train==0,]
# Find the column holding the 0/1 flag that marks whether
# a record belongs to the training set or the test set
trainColNum <- grep("train",names(trainset))
# Drop that column
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
We fit the model on the training data, with a polynomial kernel of degree 2:
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
                 kernel = "polynomial", degree = 2)
# Measure training and test accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9655172
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.962963
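e1071 also provides a plot method for fitted svm objects, which shades the predicted region of each class and marks the support vectors. With only two predictors it can be called directly on the model and the data, a quick way to inspect the learned boundary:
# Shade the predicted class regions and highlight the support vectors
plot(svm_model, trainset)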
Find the optimal values of the cost, gamma and coef0 parameters:
# Tune model
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
                     type = "C-classification",
                     kernel = "polynomial", degree = 2, cost = 10^(-1:2),
                     gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
# Grids searched:
# cost = c(0.1,1,10,100)
# gamma = c(0.1, 1, 10)
# coef0 = c(0.1, 1, 10)
# List the optimal values
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 0.1
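By default tune.svm estimates the error of every parameter combination with 10-fold cross-validation; the full grid of results is stored in the performances component of the returned tune object, so the search can be inspected beyond just the best values:
# Cross-validation error for each parameter combination
head(tune_out$performances)
# Summary of the search, including the best performance
summary(tune_out)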
We fit the model with the optimal parameters:
# Tuned model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
                 kernel = "polynomial", degree = 2,
                 cost = tune_out$best.parameters$cost,
                 gamma = tune_out$best.parameters$gamma,
                 coef0 = tune_out$best.parameters$coef0)
# Accuracy of the tuned model on trainset and testset
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.962963
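Refitting by hand can also be skipped: the tune object stores best.model, the model already trained on the complete training data with the best parameter combination, so the following should give the same result:
# The best model found during tuning, ready to use
best_model <- tune_out$best.model
pred_test <- predict(best_model, testset)
mean(pred_test == testset$y)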
Non-linear SVM classification.
We plot the data:
SVM model with a radial kernel:
# Build the data frame from the labels y and the predictors x generated above
dat <- data.frame(y = factor(y), x)
# Randomly assign 80% of the data to the training set
dat[,"train"] <- ifelse(runif(nrow(dat))<0.8,1,0)
# Split into the two sets: train and test
trainset <- dat[dat$train==1,]
testset <- dat[dat$train==0,]
# Find the column holding the 0/1 flag that marks whether
# a record belongs to the training set or the test set
trainColNum <- grep("train",names(trainset))
# Drop that column
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
# Fit the model
fit <- svm(factor(y) ~ X2+X1, data = trainset,
           scale = FALSE, kernel = "radial",
           cost = 5)
# Measure training and test accuracy
pred_train <- predict(fit, trainset)
mean(pred_train == trainset$y)
## [1] 0.8378378
pred_test <- predict(fit, testset)
mean(pred_test == testset$y)
## [1] 0.8076923
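To visualise the boundary the radial kernel has learned, we can classify a fine grid of points and colour the predicted regions. A minimal sketch, assuming the predictor columns of dat are named X1 and X2 as in the model formula:
# Predict the class over a grid covering the range of the data
grid <- expand.grid(X1 = seq(min(dat$X1), max(dat$X1), length.out = 100),
                    X2 = seq(min(dat$X2), max(dat$X2), length.out = 100))
grid$pred <- predict(fit, grid)
ggplot() +
  geom_tile(data = grid, aes(x = X1, y = X2, fill = pred), alpha = 0.3) +
  geom_point(data = dat, aes(x = X1, y = X2, color = y))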
The same kind of classifier can be fitted with ksvm from kernlab, here with a polynomial kernel ("polydot") of degree 2:
ksvm_model <- ksvm(y ~ ., data = trainset,
                   type = "C-svc",
                   kernel = "polydot",
                   kpar=list(degree=2))
# Measure training and test accuracy
pred_train <- predict(ksvm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9968652
pred_test <- predict(ksvm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9382716
We now apply SVM to the spam dataset included in kernlab:
data(spam)
head(spam)
## make address all num3d our over remove internet order mail receive
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 0.00
## 2 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 0.21
## 3 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 5 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 6 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 0.00
## will people report addresses free business email you credit your font
## 1 0.64 0.00 0.00 0.00 0.32 0.00 1.29 1.93 0.00 0.96 0
## 2 0.79 0.65 0.21 0.14 0.14 0.07 0.28 3.47 0.00 1.59 0
## 3 0.45 0.12 0.00 1.75 0.06 0.06 1.03 1.36 0.32 0.51 0
## 4 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 5 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0
## num000 money hp hpl george num650 lab labs telnet num857 data num415
## 1 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 2 0.43 0.43 0 0 0 0 0 0 0 0 0 0
## 3 1.16 0.06 0 0 0 0 0 0 0 0 0 0
## 4 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 5 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 6 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## num85 technology num1999 parts pm direct cs meeting original project
## 1 0 0 0.00 0 0 0.00 0 0 0.00 0
## 2 0 0 0.07 0 0 0.00 0 0 0.00 0
## 3 0 0 0.00 0 0 0.06 0 0 0.12 0
## 4 0 0 0.00 0 0 0.00 0 0 0.00 0
## 5 0 0 0.00 0 0 0.00 0 0 0.00 0
## 6 0 0 0.00 0 0 0.00 0 0 0.00 0
## re edu table conference charSemicolon charRoundbracket
## 1 0.00 0.00 0 0 0.00 0.000
## 2 0.00 0.00 0 0 0.00 0.132
## 3 0.06 0.06 0 0 0.01 0.143
## 4 0.00 0.00 0 0 0.00 0.137
## 5 0.00 0.00 0 0 0.00 0.135
## 6 0.00 0.00 0 0 0.00 0.223
## charSquarebracket charExclamation charDollar charHash capitalAve
## 1 0 0.778 0.000 0.000 3.756
## 2 0 0.372 0.180 0.048 5.114
## 3 0 0.276 0.184 0.010 9.821
## 4 0 0.137 0.000 0.000 3.537
## 5 0 0.135 0.000 0.000 3.537
## 6 0 0.000 0.000 0.000 3.000
## capitalLong capitalTotal type
## 1 61 278 spam
## 2 101 1028 spam
## 3 485 2259 spam
## 4 40 191 spam
## 5 40 191 spam
## 6 15 54 spam
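For context, spam is a well-known corpus of 4601 emails, each described by 57 numeric features plus the type label (spam / nonspam); the dimensions and class balance can be checked directly:
# Size of the dataset and distribution of the two classes
dim(spam)
table(spam$type)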
# Create trainset (80%) and testset (20%)
# Take a random permutation of the row indices
index <- sample(1:dim(spam)[1])
# Store the first 80% of the rows in the training set
spamtrain <- spam[index[1:floor(dim(spam)[1]*0.8)],]
# Store the remaining 20% of the rows in the test set
# (starting at floor(...)+1 so the two sets never share a row)
spamtest <- spam[index[(floor(dim(spam)[1]*0.8)+1):dim(spam)[1]],]
## Another way to create the trainset and testset, with caret
# inTrain <- createDataPartition(y=spam$type, p=0.8, list=FALSE)
# training <- spam[inTrain,]
# testing <- spam[-inTrain,]
# Fit the SVM model
# kernel = Radial Basis kernel "Gaussian" (rbfdot)
# C = cost
# kpar = kernel parameters
filter <- ksvm(type~.,data = spamtrain,kernel="rbfdot",
               kpar=list(sigma=0.05),C=5)
filter
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 5
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.05
##
## Number of Support Vectors : 1346
##
## Objective Function Value : -1721.016
## Training error : 0.023913
# Predict whether each email in the test set is "spam" or "nonspam"
mailtype <- predict(filter,spamtest[,-58])
# Check the results
table(mailtype,spamtest[,58])
##
## mailtype nonspam spam
## nonspam 528 47
## spam 21 325
# Cross-tabulation of observed vs. predicted values, with associated statistics
confusionMatrix(mailtype, spamtest$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 528 47
## spam 21 325
##
## Accuracy : 0.9262
## 95% CI : (0.9073, 0.9422)
## No Information Rate : 0.5961
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8449
##
## Mcnemar's Test P-Value : 0.002432
##
## Sensitivity : 0.9617
## Specificity : 0.8737
## Pos Pred Value : 0.9183
## Neg Pred Value : 0.9393
## Prevalence : 0.5961
## Detection Rate : 0.5733
## Detection Prevalence : 0.6243
## Balanced Accuracy : 0.9177
##
## 'Positive' Class : nonspam
##
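As a quick check, the reported accuracy is simply the proportion of correct predictions on the diagonal of the table: (528 + 325) / (528 + 47 + 21 + 325) = 853 / 921 ≈ 0.9262.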
# Measure training and test accuracy
mail_train <- predict(filter, spamtrain[,-58])
mean(mail_train == spamtrain$type)
## [1] 0.976087
mail_test <- predict(filter, spamtest[,-58])
mean(mail_test == spamtest$type)
## [1] 0.9261672
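Instead of fixing sigma = 0.05 by hand, kernlab can also estimate a sensible value from the data. A sketch of the two usual options (sigest returns a range of candidate sigma values; kpar = "automatic" lets ksvm pick one itself):
# Estimate a reasonable range for the sigma hyperparameter
sigest(type~., data = spamtrain)
# Or let ksvm choose sigma automatically for the rbfdot kernel
filter_auto <- ksvm(type~., data = spamtrain, kernel = "rbfdot",
                    kpar = "automatic", C = 5)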