library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(knitr) # Para ver tablas mas amigables en formato html markdown
## Warning: package 'knitr' was built under R version 3.6.3
library(ggplot2) # Gráficas
## Warning: package 'ggplot2' was built under R version 3.6.3
library(dplyr) # Varias operaciones
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret) # Para particionar datos. De entranamiento y de validación
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
#install.packages("e1071") # Para SVM
library(e1071)
datos <- OJ
str(datos)
## 'data.frame': 1070 obs. of 18 variables:
## $ Purchase : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
## $ WeekofPurchase: num 237 239 245 227 228 230 232 234 235 238 ...
## $ StoreID : num 1 1 1 1 7 7 7 7 7 7 ...
## $ PriceCH : num 1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceMM : num 1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
## $ DiscCH : num 0 0 0.17 0 0 0 0 0 0 0 ...
## $ DiscMM : num 0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
## $ SpecialCH : num 0 0 0 0 0 0 1 1 0 0 ...
## $ SpecialMM : num 0 1 0 0 0 1 1 0 0 0 ...
## $ LoyalCH : num 0.5 0.6 0.68 0.4 0.957 ...
## $ SalePriceMM : num 1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
## $ SalePriceCH : num 1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
## $ PriceDiff : num 0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
## $ Store7 : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
## $ PctDiscMM : num 0 0.151 0 0 0 ...
## $ PctDiscCH : num 0 0 0.0914 0 0 ...
## $ ListPriceDiff : num 0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
## $ STORE : num 1 1 1 1 0 0 0 0 0 0 ...
kable(summary(datos))
|
CH:653 |
Min. :227.0 |
Min. :1.00 |
Min. :1.690 |
Min. :1.690 |
Min. :0.00000 |
Min. :0.0000 |
Min. :0.0000 |
Min. :0.0000 |
Min. :0.000011 |
Min. :1.190 |
Min. :1.390 |
Min. :-0.6700 |
No :714 |
Min. :0.0000 |
Min. :0.00000 |
Min. :0.000 |
Min. :0.000 |
|
MM:417 |
1st Qu.:240.0 |
1st Qu.:2.00 |
1st Qu.:1.790 |
1st Qu.:1.990 |
1st Qu.:0.00000 |
1st Qu.:0.0000 |
1st Qu.:0.0000 |
1st Qu.:0.0000 |
1st Qu.:0.325257 |
1st Qu.:1.690 |
1st Qu.:1.750 |
1st Qu.: 0.0000 |
Yes:356 |
1st Qu.:0.0000 |
1st Qu.:0.00000 |
1st Qu.:0.140 |
1st Qu.:0.000 |
|
NA |
Median :257.0 |
Median :3.00 |
Median :1.860 |
Median :2.090 |
Median :0.00000 |
Median :0.0000 |
Median :0.0000 |
Median :0.0000 |
Median :0.600000 |
Median :2.090 |
Median :1.860 |
Median : 0.2300 |
NA |
Median :0.0000 |
Median :0.00000 |
Median :0.240 |
Median :2.000 |
|
NA |
Mean :254.4 |
Mean :3.96 |
Mean :1.867 |
Mean :2.085 |
Mean :0.05186 |
Mean :0.1234 |
Mean :0.1477 |
Mean :0.1617 |
Mean :0.565782 |
Mean :1.962 |
Mean :1.816 |
Mean : 0.1465 |
NA |
Mean :0.0593 |
Mean :0.02731 |
Mean :0.218 |
Mean :1.631 |
|
NA |
3rd Qu.:268.0 |
3rd Qu.:7.00 |
3rd Qu.:1.990 |
3rd Qu.:2.180 |
3rd Qu.:0.00000 |
3rd Qu.:0.2300 |
3rd Qu.:0.0000 |
3rd Qu.:0.0000 |
3rd Qu.:0.850873 |
3rd Qu.:2.130 |
3rd Qu.:1.890 |
3rd Qu.: 0.3200 |
NA |
3rd Qu.:0.1127 |
3rd Qu.:0.00000 |
3rd Qu.:0.300 |
3rd Qu.:3.000 |
|
NA |
Max. :278.0 |
Max. :7.00 |
Max. :2.090 |
Max. :2.290 |
Max. :0.50000 |
Max. :0.8000 |
Max. :1.0000 |
Max. :1.0000 |
Max. :0.999947 |
Max. :2.290 |
Max. :2.090 |
Max. : 0.6400 |
NA |
Max. :0.4020 |
Max. :0.25269 |
Max. :0.440 |
Max. :4.000 |
ggplot(data = OJ, aes(x = Purchase, y = ..count.., fill = Purchase)) +
geom_bar() +
labs(title = "Distribución de 'Purchase'") +
scale_fill_manual(values = c("darkgreen", "orangered2"),
labels = c("Citrus Hill", "Orange Juice")) +
theme_bw() + theme(plot.title = element_text(hjust = 0.5))

frecuencias <- prop.table(table(datos$Purchase)) %>%
round(digits = 2)
frecuencias
##
## CH MM
## 0.61 0.39
set.seed(123)
entrena <- createDataPartition(y = datos$Purchase, p = 0.8, list = FALSE, times = 1)
# Datos entrenamiento
datos.entrenamiento <- datos[entrena, ]
# Datos validación
datos.validacion <- datos[-entrena, ]
kable(head(datos))
| CH |
237 |
1 |
1.75 |
1.99 |
0.00 |
0.0 |
0 |
0 |
0.500000 |
1.99 |
1.75 |
0.24 |
No |
0.000000 |
0.000000 |
0.24 |
1 |
| CH |
239 |
1 |
1.75 |
1.99 |
0.00 |
0.3 |
0 |
1 |
0.600000 |
1.69 |
1.75 |
-0.06 |
No |
0.150754 |
0.000000 |
0.24 |
1 |
| CH |
245 |
1 |
1.86 |
2.09 |
0.17 |
0.0 |
0 |
0 |
0.680000 |
2.09 |
1.69 |
0.40 |
No |
0.000000 |
0.091398 |
0.23 |
1 |
| MM |
227 |
1 |
1.69 |
1.69 |
0.00 |
0.0 |
0 |
0 |
0.400000 |
1.69 |
1.69 |
0.00 |
No |
0.000000 |
0.000000 |
0.00 |
1 |
| CH |
228 |
7 |
1.69 |
1.69 |
0.00 |
0.0 |
0 |
0 |
0.956535 |
1.69 |
1.69 |
0.00 |
Yes |
0.000000 |
0.000000 |
0.00 |
0 |
| CH |
230 |
7 |
1.69 |
1.99 |
0.00 |
0.0 |
0 |
1 |
0.965228 |
1.99 |
1.69 |
0.30 |
Yes |
0.000000 |
0.000000 |
0.30 |
0 |
kable(tail(datos))
| 1065 |
CH |
251 |
7 |
1.86 |
2.09 |
0.1 |
0.00 |
0 |
0 |
0.484778 |
2.09 |
1.76 |
0.33 |
Yes |
0.000000 |
0.053763 |
0.23 |
0 |
| 1066 |
CH |
252 |
7 |
1.86 |
2.09 |
0.1 |
0.00 |
0 |
0 |
0.587822 |
2.09 |
1.76 |
0.33 |
Yes |
0.000000 |
0.053763 |
0.23 |
0 |
| 1067 |
CH |
256 |
7 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.670258 |
2.18 |
1.86 |
0.32 |
Yes |
0.000000 |
0.000000 |
0.32 |
0 |
| 1068 |
MM |
257 |
7 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.736206 |
2.18 |
1.86 |
0.32 |
Yes |
0.000000 |
0.000000 |
0.32 |
0 |
| 1069 |
CH |
261 |
7 |
1.86 |
2.13 |
0.0 |
0.24 |
0 |
0 |
0.588965 |
1.89 |
1.86 |
0.03 |
Yes |
0.112676 |
0.000000 |
0.27 |
0 |
| 1070 |
CH |
270 |
1 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.671172 |
2.18 |
1.86 |
0.32 |
No |
0.000000 |
0.000000 |
0.32 |
1 |
kable(head(datos.entrenamiento))
| 2 |
CH |
239 |
1 |
1.75 |
1.99 |
0 |
0.3 |
0 |
1 |
0.600000 |
1.69 |
1.75 |
-0.06 |
No |
0.150754 |
0 |
0.24 |
1 |
| 4 |
MM |
227 |
1 |
1.69 |
1.69 |
0 |
0.0 |
0 |
0 |
0.400000 |
1.69 |
1.69 |
0.00 |
No |
0.000000 |
0 |
0.00 |
1 |
| 5 |
CH |
228 |
7 |
1.69 |
1.69 |
0 |
0.0 |
0 |
0 |
0.956535 |
1.69 |
1.69 |
0.00 |
Yes |
0.000000 |
0 |
0.00 |
0 |
| 6 |
CH |
230 |
7 |
1.69 |
1.99 |
0 |
0.0 |
0 |
1 |
0.965228 |
1.99 |
1.69 |
0.30 |
Yes |
0.000000 |
0 |
0.30 |
0 |
| 7 |
CH |
232 |
7 |
1.69 |
1.99 |
0 |
0.4 |
1 |
1 |
0.972182 |
1.59 |
1.69 |
-0.10 |
Yes |
0.201005 |
0 |
0.30 |
0 |
| 8 |
CH |
234 |
7 |
1.75 |
1.99 |
0 |
0.4 |
1 |
0 |
0.977746 |
1.59 |
1.75 |
-0.16 |
Yes |
0.201005 |
0 |
0.24 |
0 |
kable(tail(datos.entrenamiento))
| 1065 |
CH |
251 |
7 |
1.86 |
2.09 |
0.1 |
0.00 |
0 |
0 |
0.484778 |
2.09 |
1.76 |
0.33 |
Yes |
0.000000 |
0.053763 |
0.23 |
0 |
| 1066 |
CH |
252 |
7 |
1.86 |
2.09 |
0.1 |
0.00 |
0 |
0 |
0.587822 |
2.09 |
1.76 |
0.33 |
Yes |
0.000000 |
0.053763 |
0.23 |
0 |
| 1067 |
CH |
256 |
7 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.670258 |
2.18 |
1.86 |
0.32 |
Yes |
0.000000 |
0.000000 |
0.32 |
0 |
| 1068 |
MM |
257 |
7 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.736206 |
2.18 |
1.86 |
0.32 |
Yes |
0.000000 |
0.000000 |
0.32 |
0 |
| 1069 |
CH |
261 |
7 |
1.86 |
2.13 |
0.0 |
0.24 |
0 |
0 |
0.588965 |
1.89 |
1.86 |
0.03 |
Yes |
0.112676 |
0.000000 |
0.27 |
0 |
| 1070 |
CH |
270 |
1 |
1.86 |
2.18 |
0.0 |
0.00 |
0 |
0 |
0.671172 |
2.18 |
1.86 |
0.32 |
No |
0.000000 |
0.000000 |
0.32 |
1 |
kable(head(datos.validacion))
| 1 |
CH |
237 |
1 |
1.75 |
1.99 |
0.00 |
0.0 |
0 |
0 |
0.500000 |
1.99 |
1.75 |
0.24 |
No |
0.000000 |
0.000000 |
0.24 |
1 |
| 3 |
CH |
245 |
1 |
1.86 |
2.09 |
0.17 |
0.0 |
0 |
0 |
0.680000 |
2.09 |
1.69 |
0.40 |
No |
0.000000 |
0.091398 |
0.23 |
1 |
| 10 |
CH |
238 |
7 |
1.75 |
1.99 |
0.00 |
0.4 |
0 |
0 |
0.985757 |
1.59 |
1.75 |
-0.16 |
Yes |
0.201005 |
0.000000 |
0.24 |
0 |
| 16 |
CH |
278 |
7 |
2.06 |
2.13 |
0.00 |
0.0 |
0 |
0 |
0.795200 |
2.13 |
2.06 |
0.07 |
Yes |
0.000000 |
0.000000 |
0.07 |
0 |
| 18 |
MM |
268 |
2 |
1.86 |
2.18 |
0.00 |
0.0 |
0 |
1 |
0.400000 |
2.18 |
1.86 |
0.32 |
No |
0.000000 |
0.000000 |
0.32 |
2 |
| 22 |
CH |
258 |
1 |
1.76 |
2.18 |
0.00 |
0.0 |
0 |
0 |
0.680000 |
2.18 |
1.76 |
0.42 |
No |
0.000000 |
0.000000 |
0.42 |
1 |
kable(tail(datos.validacion))
| 1040 |
CH |
237 |
7 |
1.75 |
1.99 |
0 |
0.4 |
0 |
0 |
0.320000 |
1.59 |
1.75 |
-0.16 |
Yes |
0.201005 |
0 |
0.24 |
0 |
| 1043 |
CH |
239 |
1 |
1.75 |
1.99 |
0 |
0.3 |
0 |
1 |
0.651840 |
1.69 |
1.75 |
-0.06 |
No |
0.150754 |
0 |
0.24 |
1 |
| 1053 |
CH |
237 |
7 |
1.75 |
1.99 |
0 |
0.4 |
0 |
0 |
0.740928 |
1.59 |
1.75 |
-0.16 |
Yes |
0.201005 |
0 |
0.24 |
0 |
| 1056 |
MM |
227 |
1 |
1.69 |
1.69 |
0 |
0.0 |
0 |
0 |
0.320000 |
1.69 |
1.69 |
0.00 |
No |
0.000000 |
0 |
0.00 |
1 |
| 1057 |
CH |
228 |
7 |
1.69 |
1.69 |
0 |
0.0 |
0 |
0 |
0.256000 |
1.69 |
1.69 |
0.00 |
Yes |
0.000000 |
0 |
0.00 |
0 |
| 1059 |
CH |
233 |
7 |
1.75 |
1.99 |
0 |
0.4 |
1 |
0 |
0.523840 |
1.59 |
1.75 |
-0.16 |
Yes |
0.201005 |
0 |
0.24 |
0 |
set.seed(325)
tuning <- tune(svm, Purchase ~ ., data = datos.entrenamiento,
kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)),
scale = TRUE)
summary(tuning)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 15
##
## - best performance: 0.1656772
##
## - Detailed performance results:
## cost error dispersion
## 1 0.001 0.3009850 0.05276636
## 2 0.010 0.1691382 0.04602764
## 3 0.100 0.1679754 0.04037149
## 4 1.000 0.1703010 0.04099747
## 5 5.000 0.1703010 0.03755497
## 6 10.000 0.1668263 0.03560699
## 7 15.000 0.1656772 0.03338668
## 8 20.000 0.1668399 0.03479758
names(tuning)
## [1] "best.parameters" "best.performance" "method" "nparcomb"
## [5] "train.ind" "sampling" "performances" "best.model"
ggplot(data = tuning$performances, aes(x = cost, y = error)) +
geom_line() +
geom_point() +
labs(title = "Error de validación ~ hiperparámetro C") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))

modelo <- tuning$best.model
summary(modelo)
##
## Call:
## best.tune(method = svm, train.x = Purchase ~ ., data = datos.entrenamiento,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)),
## kernel = "linear", scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 15
##
## Number of Support Vectors: 345
##
## ( 173 172 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
modelo.lin <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "linear", cost = 15, scale = TRUE)
summary(modelo.lin)
##
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "linear",
## cost = 15, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 15
##
## Number of Support Vectors: 345
##
## ( 173 172 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
prediccion <- predict(modelo, datos.validacion)
kable(head(prediccion, 10))
| 1 |
CH |
| 3 |
CH |
| 10 |
CH |
| 16 |
CH |
| 18 |
CH |
| 22 |
CH |
| 26 |
CH |
| 41 |
CH |
| 45 |
CH |
| 49 |
CH |
kable(tail(prediccion, 10))
| 1011 |
CH |
| 1014 |
CH |
| 1020 |
CH |
| 1039 |
MM |
| 1040 |
MM |
| 1043 |
CH |
| 1053 |
CH |
| 1056 |
MM |
| 1057 |
MM |
| 1059 |
CH |
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho = prediccion, real = datos.validacion$Purchase)
mat.confusion
## real
## predicho CH MM
## CH 108 17
## MM 22 66
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.pol <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial", cost = 15, scale = TRUE)
summary(modelo.pol)
##
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial",
## cost = 15, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 15
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 342
##
## ( 177 165 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
prediccion <- predict(modelo.pol, datos.validacion)
kable(head(prediccion, 10))
| 1 |
CH |
| 3 |
CH |
| 10 |
CH |
| 16 |
CH |
| 18 |
CH |
| 22 |
CH |
| 26 |
CH |
| 41 |
CH |
| 45 |
CH |
| 49 |
CH |
kable(tail(prediccion, 10))
| 1011 |
CH |
| 1014 |
CH |
| 1020 |
CH |
| 1039 |
MM |
| 1040 |
MM |
| 1043 |
CH |
| 1053 |
CH |
| 1056 |
MM |
| 1057 |
CH |
| 1059 |
MM |
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho = prediccion, real = datos.validacion$Purchase)
mat.confusion
## real
## predicho CH MM
## CH 108 23
## MM 22 60
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.rad <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "radial", cost = 15, scale = TRUE)
summary(modelo.rad)
##
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "radial",
## cost = 15, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 15
##
## Number of Support Vectors: 340
##
## ( 173 167 )
##
##
## Number of Classes: 2
##
## Levels:
## CH MM
prediccion <- predict(modelo.rad, datos.validacion)
kable(head(prediccion, 10))
| 1 |
MM |
| 3 |
CH |
| 10 |
CH |
| 16 |
CH |
| 18 |
CH |
| 22 |
CH |
| 26 |
CH |
| 41 |
CH |
| 45 |
CH |
| 49 |
CH |
kable(tail(prediccion, 10))
| 1011 |
CH |
| 1014 |
CH |
| 1020 |
CH |
| 1039 |
CH |
| 1040 |
MM |
| 1043 |
CH |
| 1053 |
CH |
| 1056 |
MM |
| 1057 |
CH |
| 1059 |
CH |
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho = prediccion, real = datos.validacion$Purchase)
mat.confusion
## real
## predicho CH MM
## CH 108 21
## MM 22 62
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

INTERPRETACION
Desde el inicio con la feafica de distribución de la variable de respuesta Purchase nos pudimos dar cueta que:
- El 61 % de las preferencias de los clientes es para el refresco “CM” CH = Citrus Hill
- El 39 % de las preferencias de los clientes es para el refresco “CM” MM = Minute Maid Orange Juice
Con la graficasion de los costos de turing se obtiene que:
- El valor de coste que obtiene el error de validación mas chico es 15.
al fenerar la matriz de confucion de la prediccion se observa qu se obtinen el 81.6901 %,78.8732 % y 79.8122 % en términos de exactitud. al generar la Gráfica de los valores reales Vs predichos con estos valores de exactitud las diferencias entre una y otra no son tan grandes pero si las hay.