Dasha Ivanova
11/12/2019
Required libraries:
library(ggplot2)
library(e1071) # svm
library(kernlab) # ksvm
library(caret) # createDataPartition and confusionMatrix functions
We generate the data:
# Number of observations
n <- 400
set.seed(1)
# Build a data frame with two uniformly distributed variables x1, x2
df <- data.frame(x1 = runif(n, min = -1, max = 1),
                 x2 = runif(n, min = -1, max = 1))
# Set the radius that defines the class boundary
radius <- 0.8
radius_squared <- radius^2
# Define the categorical dependent variable "y"
df$y <- factor(ifelse(df$x1^2 + df$x2^2 < radius_squared, -1, 1),
               levels = c(-1, 1))
# Plot the data
ggplot(data=df, aes(x=x1, y=x2, color=y)) +
  geom_point()
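Since the labels are defined by the circle x1^2 + x2^2 = 0.8^2, a useful sanity check is to overlay that true boundary on the scatter plot. A minimal sketch using only ggplot2, tracing the circle with annotate():
# Overlay the true decision boundary (circle of radius 0.8)
theta <- seq(0, 2*pi, length.out = 200)
ggplot(data=df, aes(x=x1, y=x2, color=y)) +
  geom_point() +
  annotate("path", x = radius*cos(theta), y = radius*sin(theta))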
We split the dataset into:
80% -> training data
20% -> test data
# Randomly assign 80% of the data to the training set
df[,"train"] <- ifelse(runif(nrow(df))<0.8,1,0)
# Split into the two sets: train and test
trainset <- df[df$train==1,]
testset <- df[df$train==0,]
# Find the column holding the 0/1 flag that marks whether
# a record belongs to the training set or the test set
trainColNum <- grep("train",names(trainset))
# Drop that column
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
We fit the model on the training data, with a polynomial kernel of degree 2:
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
                 kernel = "polynomial", degree = 2)
# Measure training and test accuracy
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9655172
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.962963
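e1071 also provides a plot method for fitted svm objects, which shades the predicted region of each class and marks the support vectors. With only two predictors it can be called directly on the model and the data, a quick way to inspect the learned boundary:
# Shade the predicted class regions and highlight the support vectors
plot(svm_model, trainset)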
Find the optimal values of the cost, gamma and coef0 parameters:
# Tune model
tune_out <- tune.svm(x = trainset[, -3], y = trainset[, 3],
                     type = "C-classification",
                     kernel = "polynomial", degree = 2, cost = 10^(-1:2),
                     gamma = c(0.1, 1, 10), coef0 = c(0.1, 1, 10))
# Grids searched:
# cost = c(0.1,1,10,100)
# gamma = c(0.1, 1, 10)
# coef0 = c(0.1, 1, 10)
# List the optimal values
tune_out$best.parameters$cost
## [1] 0.1
tune_out$best.parameters$gamma
## [1] 10
tune_out$best.parameters$coef0
## [1] 0.1
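By default tune.svm estimates the error of every parameter combination with 10-fold cross-validation; the full grid of results is stored in the performances component of the returned tune object, so the search can be inspected beyond just the best values:
# Cross-validation error for each parameter combination
head(tune_out$performances)
# Summary of the search, including the best performance
summary(tune_out)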
We fit the model with the optimal parameters:
# Tuned model
svm_model <- svm(y ~ ., data = trainset, type = "C-classification",
                 kernel = "polynomial", degree = 2,
                 cost = tune_out$best.parameters$cost,
                 gamma = tune_out$best.parameters$gamma,
                 coef0 = tune_out$best.parameters$coef0)
# Accuracy of the tuned model on trainset and testset
pred_train <- predict(svm_model, trainset)
mean(pred_train == trainset$y)
## [1] 1
pred_test <- predict(svm_model, testset)
mean(pred_test == testset$y)
## [1] 0.962963
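Refitting by hand can also be skipped: the tune object stores best.model, the model already trained on the complete training data with the best parameter combination, so the following should give the same result:
# The best model found during tuning, ready to use
best_model <- tune_out$best.model
pred_test <- predict(best_model, testset)
mean(pred_test == testset$y)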
Non-linear SVM classification.
We plot the data:
SVM model with a radial kernel:
# Build the data frame from the labels y and the predictors x generated above
dat <- data.frame(y = factor(y), x)
# Randomly assign 80% of the data to the training set
dat[,"train"] <- ifelse(runif(nrow(dat))<0.8,1,0)
# Split into the two sets: train and test
trainset <- dat[dat$train==1,]
testset <- dat[dat$train==0,]
# Find the column holding the 0/1 flag that marks whether
# a record belongs to the training set or the test set
trainColNum <- grep("train",names(trainset))
# Drop that column
trainset <- trainset[,-trainColNum]
testset <- testset[,-trainColNum]
# Fit the model
fit <- svm(factor(y) ~ X2+X1, data = trainset,
           scale = FALSE, kernel = "radial",
           cost = 5)
# Measure training and test accuracy
pred_train <- predict(fit, trainset)
mean(pred_train == trainset$y)
## [1] 0.8378378
pred_test <- predict(fit, testset)
mean(pred_test == testset$y)
## [1] 0.8076923
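To visualise the boundary the radial kernel has learned, we can classify a fine grid of points and colour the predicted regions. A minimal sketch, assuming the predictor columns of dat are named X1 and X2 as in the model formula:
# Predict the class over a grid covering the range of the data
grid <- expand.grid(X1 = seq(min(dat$X1), max(dat$X1), length.out = 100),
                    X2 = seq(min(dat$X2), max(dat$X2), length.out = 100))
grid$pred <- predict(fit, grid)
ggplot() +
  geom_tile(data = grid, aes(x = X1, y = X2, fill = pred), alpha = 0.3) +
  geom_point(data = dat, aes(x = X1, y = X2, color = y))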
The same kind of classifier can be fitted with ksvm from kernlab, here with a polynomial kernel ("polydot") of degree 2:
ksvm_model <- ksvm(y ~ ., data = trainset,
                   type = "C-svc",
                   kernel = "polydot",
                   kpar=list(degree=2))
# Measure training and test accuracy
pred_train <- predict(ksvm_model, trainset)
mean(pred_train == trainset$y)
## [1] 0.9968652
pred_test <- predict(ksvm_model, testset)
mean(pred_test == testset$y)
## [1] 0.9382716
We now apply SVM to the spam dataset included in kernlab:
data(spam)
head(spam)
## make address all num3d our over remove internet order mail receive
## 1 0.00 0.64 0.64 0 0.32 0.00 0.00 0.00 0.00 0.00 0.00
## 2 0.21 0.28 0.50 0 0.14 0.28 0.21 0.07 0.00 0.94 0.21
## 3 0.06 0.00 0.71 0 1.23 0.19 0.19 0.12 0.64 0.25 0.38
## 4 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 5 0.00 0.00 0.00 0 0.63 0.00 0.31 0.63 0.31 0.63 0.31
## 6 0.00 0.00 0.00 0 1.85 0.00 0.00 1.85 0.00 0.00 0.00
## will people report addresses free business email you credit your font
## 1 0.64 0.00 0.00 0.00 0.32 0.00 1.29 1.93 0.00 0.96 0
## 2 0.79 0.65 0.21 0.14 0.14 0.07 0.28 3.47 0.00 1.59 0
## 3 0.45 0.12 0.00 1.75 0.06 0.06 1.03 1.36 0.32 0.51 0
## 4 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 5 0.31 0.31 0.00 0.00 0.31 0.00 0.00 3.18 0.00 0.31 0
## 6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0
## num000 money hp hpl george num650 lab labs telnet num857 data num415
## 1 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 2 0.43 0.43 0 0 0 0 0 0 0 0 0 0
## 3 1.16 0.06 0 0 0 0 0 0 0 0 0 0
## 4 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 5 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## 6 0.00 0.00 0 0 0 0 0 0 0 0 0 0
## num85 technology num1999 parts pm direct cs meeting original project
## 1 0 0 0.00 0 0 0.00 0 0 0.00 0
## 2 0 0 0.07 0 0 0.00 0 0 0.00 0
## 3 0 0 0.00 0 0 0.06 0 0 0.12 0
## 4 0 0 0.00 0 0 0.00 0 0 0.00 0
## 5 0 0 0.00 0 0 0.00 0 0 0.00 0
## 6 0 0 0.00 0 0 0.00 0 0 0.00 0
## re edu table conference charSemicolon charRoundbracket
## 1 0.00 0.00 0 0 0.00 0.000
## 2 0.00 0.00 0 0 0.00 0.132
## 3 0.06 0.06 0 0 0.01 0.143
## 4 0.00 0.00 0 0 0.00 0.137
## 5 0.00 0.00 0 0 0.00 0.135
## 6 0.00 0.00 0 0 0.00 0.223
## charSquarebracket charExclamation charDollar charHash capitalAve
## 1 0 0.778 0.000 0.000 3.756
## 2 0 0.372 0.180 0.048 5.114
## 3 0 0.276 0.184 0.010 9.821
## 4 0 0.137 0.000 0.000 3.537
## 5 0 0.135 0.000 0.000 3.537
## 6 0 0.000 0.000 0.000 3.000
## capitalLong capitalTotal type
## 1 61 278 spam
## 2 101 1028 spam
## 3 485 2259 spam
## 4 40 191 spam
## 5 40 191 spam
## 6 15 54 spam
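For context, spam is a well-known corpus of 4601 emails, each described by 57 numeric features plus the type label (spam / nonspam); the dimensions and class balance can be checked directly:
# Size of the dataset and distribution of the two classes
dim(spam)
table(spam$type)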
# Create trainset (80%) and testset (20%)
# Take a random permutation of the row indices
index <- sample(1:dim(spam)[1])
# Store the first 80% of the rows in the training set
spamtrain <- spam[index[1:floor(dim(spam)[1]*0.8)],]
# Store the remaining 20% of the rows in the test set
# (starting at floor(...)+1 so the two sets never share a row)
spamtest <- spam[index[(floor(dim(spam)[1]*0.8)+1):dim(spam)[1]],]
## Another way to create the trainset and testset, with caret
# inTrain <- createDataPartition(y=spam$type, p=0.8, list=FALSE)
# training <- spam[inTrain,]
# testing <- spam[-inTrain,]
# Fit the SVM model
# kernel = Radial Basis kernel "Gaussian" (rbfdot)
# C = cost
# kpar = kernel parameters
filter <- ksvm(type~.,data = spamtrain,kernel="rbfdot",
               kpar=list(sigma=0.05),C=5)
filter
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 5
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.05
##
## Number of Support Vectors : 1346
##
## Objective Function Value : -1721.016
## Training error : 0.023913
# Predict whether each email in the test set is "spam" or "nonspam"
mailtype <- predict(filter,spamtest[,-58])
# Check the results
table(mailtype,spamtest[,58])
##
## mailtype nonspam spam
## nonspam 528 47
## spam 21 325
# Cross-tabulation of observed vs. predicted values, with associated statistics
confusionMatrix(mailtype, spamtest$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 528 47
## spam 21 325
##
## Accuracy : 0.9262
## 95% CI : (0.9073, 0.9422)
## No Information Rate : 0.5961
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8449
##
## Mcnemar's Test P-Value : 0.002432
##
## Sensitivity : 0.9617
## Specificity : 0.8737
## Pos Pred Value : 0.9183
## Neg Pred Value : 0.9393
## Prevalence : 0.5961
## Detection Rate : 0.5733
## Detection Prevalence : 0.6243
## Balanced Accuracy : 0.9177
##
## 'Positive' Class : nonspam
##
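As a quick check, the reported accuracy is simply the proportion of correct predictions on the diagonal of the table: (528 + 325) / (528 + 47 + 21 + 325) = 853 / 921 ≈ 0.9262.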
# Measure training and test accuracy
mail_train <- predict(filter, spamtrain[,-58])
mean(mail_train == spamtrain$type)
## [1] 0.976087
mail_test <- predict(filter, spamtest[,-58])
mean(mail_test == spamtest$type)
## [1] 0.9261672
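Instead of fixing sigma = 0.05 by hand, kernlab can also estimate a sensible value from the data. A sketch of the two usual options (sigest returns a range of candidate sigma values; kpar = "automatic" lets ksvm pick one itself):
# Estimate a reasonable range for the sigma hyperparameter
sigest(type~., data = spamtrain)
# Or let ksvm choose sigma automatically for the rbfdot kernel
filter_auto <- ksvm(type~., data = spamtrain, kernel = "rbfdot",
                    kpar = "automatic", C = 5)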