library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.6.3
library(knitr) # Para ver tablas mas amigables en formato html markdown
## Warning: package 'knitr' was built under R version 3.6.3
library(ggplot2) # Gráficas
## Warning: package 'ggplot2' was built under R version 3.6.3
library(dplyr) # Varias operaciones
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret) # Para particionar datos. De entranamiento y de validación
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
#install.packages("e1071") # Para SVM
library(e1071)
datos <- OJ
str(datos)
## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...
kable(summary(datos))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
CH:653 Min. :227.0 Min. :1.00 Min. :1.690 Min. :1.690 Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000011 Min. :1.190 Min. :1.390 Min. :-0.6700 No :714 Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.000
MM:417 1st Qu.:240.0 1st Qu.:2.00 1st Qu.:1.790 1st Qu.:1.990 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.325257 1st Qu.:1.690 1st Qu.:1.750 1st Qu.: 0.0000 Yes:356 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.140 1st Qu.:0.000
NA Median :257.0 Median :3.00 Median :1.860 Median :2.090 Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000 Median :0.600000 Median :2.090 Median :1.860 Median : 0.2300 NA Median :0.0000 Median :0.00000 Median :0.240 Median :2.000
NA Mean :254.4 Mean :3.96 Mean :1.867 Mean :2.085 Mean :0.05186 Mean :0.1234 Mean :0.1477 Mean :0.1617 Mean :0.565782 Mean :1.962 Mean :1.816 Mean : 0.1465 NA Mean :0.0593 Mean :0.02731 Mean :0.218 Mean :1.631
NA 3rd Qu.:268.0 3rd Qu.:7.00 3rd Qu.:1.990 3rd Qu.:2.180 3rd Qu.:0.00000 3rd Qu.:0.2300 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.850873 3rd Qu.:2.130 3rd Qu.:1.890 3rd Qu.: 0.3200 NA 3rd Qu.:0.1127 3rd Qu.:0.00000 3rd Qu.:0.300 3rd Qu.:3.000
NA Max. :278.0 Max. :7.00 Max. :2.090 Max. :2.290 Max. :0.50000 Max. :0.8000 Max. :1.0000 Max. :1.0000 Max. :0.999947 Max. :2.290 Max. :2.090 Max. : 0.6400 NA Max. :0.4020 Max. :0.25269 Max. :0.440 Max. :4.000
ggplot(data = OJ, aes(x = Purchase, y = ..count.., fill = Purchase)) +
geom_bar() +
labs(title = "Distribución de 'Purchase'") +
scale_fill_manual(values = c("darkgreen", "orangered2"),
labels = c("Citrus Hill", "Orange Juice")) +
theme_bw() + theme(plot.title = element_text(hjust = 0.5))

frecuencias <- prop.table(table(datos$Purchase)) %>%
round(digits = 2)
frecuencias
## 
##   CH   MM 
## 0.61 0.39
set.seed(123)
entrena <- createDataPartition(y = datos$Purchase, p = 0.8, list = FALSE, times = 1)
# Datos entrenamiento
datos.entrenamiento <- datos[entrena, ]
# Datos validación
datos.validacion <- datos[-entrena, ]
kable(head(datos))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
CH 239 1 1.75 1.99 0.00 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0.000000 0.24 1
CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
MM 227 1 1.69 1.69 0.00 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0.000000 0.00 1
CH 228 7 1.69 1.69 0.00 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0.000000 0.00 0
CH 230 7 1.69 1.99 0.00 0.0 0 1 0.965228 1.99 1.69 0.30 Yes 0.000000 0.000000 0.30 0
kable(tail(datos))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1065 CH 251 7 1.86 2.09 0.1 0.00 0 0 0.484778 2.09 1.76 0.33 Yes 0.000000 0.053763 0.23 0
1066 CH 252 7 1.86 2.09 0.1 0.00 0 0 0.587822 2.09 1.76 0.33 Yes 0.000000 0.053763 0.23 0
1067 CH 256 7 1.86 2.18 0.0 0.00 0 0 0.670258 2.18 1.86 0.32 Yes 0.000000 0.000000 0.32 0
1068 MM 257 7 1.86 2.18 0.0 0.00 0 0 0.736206 2.18 1.86 0.32 Yes 0.000000 0.000000 0.32 0
1069 CH 261 7 1.86 2.13 0.0 0.24 0 0 0.588965 1.89 1.86 0.03 Yes 0.112676 0.000000 0.27 0
1070 CH 270 1 1.86 2.18 0.0 0.00 0 0 0.671172 2.18 1.86 0.32 No 0.000000 0.000000 0.32 1
kable(head(datos.entrenamiento))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
2 CH 239 1 1.75 1.99 0 0.3 0 1 0.600000 1.69 1.75 -0.06 No 0.150754 0 0.24 1
4 MM 227 1 1.69 1.69 0 0.0 0 0 0.400000 1.69 1.69 0.00 No 0.000000 0 0.00 1
5 CH 228 7 1.69 1.69 0 0.0 0 0 0.956535 1.69 1.69 0.00 Yes 0.000000 0 0.00 0
6 CH 230 7 1.69 1.99 0 0.0 0 1 0.965228 1.99 1.69 0.30 Yes 0.000000 0 0.30 0
7 CH 232 7 1.69 1.99 0 0.4 1 1 0.972182 1.59 1.69 -0.10 Yes 0.201005 0 0.30 0
8 CH 234 7 1.75 1.99 0 0.4 1 0 0.977746 1.59 1.75 -0.16 Yes 0.201005 0 0.24 0
kable(tail(datos.entrenamiento))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1065 CH 251 7 1.86 2.09 0.1 0.00 0 0 0.484778 2.09 1.76 0.33 Yes 0.000000 0.053763 0.23 0
1066 CH 252 7 1.86 2.09 0.1 0.00 0 0 0.587822 2.09 1.76 0.33 Yes 0.000000 0.053763 0.23 0
1067 CH 256 7 1.86 2.18 0.0 0.00 0 0 0.670258 2.18 1.86 0.32 Yes 0.000000 0.000000 0.32 0
1068 MM 257 7 1.86 2.18 0.0 0.00 0 0 0.736206 2.18 1.86 0.32 Yes 0.000000 0.000000 0.32 0
1069 CH 261 7 1.86 2.13 0.0 0.24 0 0 0.588965 1.89 1.86 0.03 Yes 0.112676 0.000000 0.27 0
1070 CH 270 1 1.86 2.18 0.0 0.00 0 0 0.671172 2.18 1.86 0.32 No 0.000000 0.000000 0.32 1
kable(head(datos.validacion))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1 CH 237 1 1.75 1.99 0.00 0.0 0 0 0.500000 1.99 1.75 0.24 No 0.000000 0.000000 0.24 1
3 CH 245 1 1.86 2.09 0.17 0.0 0 0 0.680000 2.09 1.69 0.40 No 0.000000 0.091398 0.23 1
10 CH 238 7 1.75 1.99 0.00 0.4 0 0 0.985757 1.59 1.75 -0.16 Yes 0.201005 0.000000 0.24 0
16 CH 278 7 2.06 2.13 0.00 0.0 0 0 0.795200 2.13 2.06 0.07 Yes 0.000000 0.000000 0.07 0
18 MM 268 2 1.86 2.18 0.00 0.0 0 1 0.400000 2.18 1.86 0.32 No 0.000000 0.000000 0.32 2
22 CH 258 1 1.76 2.18 0.00 0.0 0 0 0.680000 2.18 1.76 0.42 No 0.000000 0.000000 0.42 1
kable(tail(datos.validacion))
Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH SpecialMM LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM PctDiscCH ListPriceDiff STORE
1040 CH 237 7 1.75 1.99 0 0.4 0 0 0.320000 1.59 1.75 -0.16 Yes 0.201005 0 0.24 0
1043 CH 239 1 1.75 1.99 0 0.3 0 1 0.651840 1.69 1.75 -0.06 No 0.150754 0 0.24 1
1053 CH 237 7 1.75 1.99 0 0.4 0 0 0.740928 1.59 1.75 -0.16 Yes 0.201005 0 0.24 0
1056 MM 227 1 1.69 1.69 0 0.0 0 0 0.320000 1.69 1.69 0.00 No 0.000000 0 0.00 1
1057 CH 228 7 1.69 1.69 0 0.0 0 0 0.256000 1.69 1.69 0.00 Yes 0.000000 0 0.00 0
1059 CH 233 7 1.75 1.99 0 0.4 1 0 0.523840 1.59 1.75 -0.16 Yes 0.201005 0 0.24 0
set.seed(325)
tuning <- tune(svm, Purchase ~ ., data = datos.entrenamiento,
kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)),
scale = TRUE)
summary(tuning)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    15
## 
## - best performance: 0.1656772 
## 
## - Detailed performance results:
##     cost     error dispersion
## 1  0.001 0.3009850 0.05276636
## 2  0.010 0.1691382 0.04602764
## 3  0.100 0.1679754 0.04037149
## 4  1.000 0.1703010 0.04099747
## 5  5.000 0.1703010 0.03755497
## 6 10.000 0.1668263 0.03560699
## 7 15.000 0.1656772 0.03338668
## 8 20.000 0.1668399 0.03479758
names(tuning)
## [1] "best.parameters"  "best.performance" "method"           "nparcomb"        
## [5] "train.ind"        "sampling"         "performances"     "best.model"
ggplot(data = tuning$performances, aes(x = cost, y = error)) +
geom_line() +
geom_point() +
labs(title = "Error de validación ~ hiperparámetro C") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))

modelo <- tuning$best.model
summary(modelo)
## 
## Call:
## best.tune(method = svm, train.x = Purchase ~ ., data = datos.entrenamiento, 
##     ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)), 
##     kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  15 
## 
## Number of Support Vectors:  345
## 
##  ( 173 172 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
modelo.lin <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "linear", cost = 15, scale = TRUE)
summary(modelo.lin)
## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "linear", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  15 
## 
## Number of Support Vectors:  345
## 
##  ( 173 172 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
prediccion <- predict(modelo, datos.validacion)
kable(head(prediccion, 10))
x
1 CH
3 CH
10 CH
16 CH
18 CH
22 CH
26 CH
41 CH
45 CH
49 CH
kable(tail(prediccion, 10))
x
1011 CH
1014 CH
1020 CH
1039 MM
1040 MM
1043 CH
1053 CH
1056 MM
1057 MM
1059 CH
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion
##         real
## predicho  CH  MM
##       CH 108  17
##       MM  22  66
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.pol <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial", cost = 15, scale = TRUE)
summary(modelo.pol)
## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  15 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  342
## 
##  ( 177 165 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
prediccion <- predict(modelo.pol, datos.validacion)
kable(head(prediccion, 10))
x
1 CH
3 CH
10 CH
16 CH
18 CH
22 CH
26 CH
41 CH
45 CH
49 CH
kable(tail(prediccion, 10))
x
1011 CH
1014 CH
1020 CH
1039 MM
1040 MM
1043 CH
1053 CH
1056 MM
1057 CH
1059 MM
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion
##         real
## predicho  CH  MM
##       CH 108  23
##       MM  22  60
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.rad <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "radial", cost = 15, scale = TRUE)
summary(modelo.rad)
## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "radial", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  15 
## 
## Number of Support Vectors:  340
## 
##  ( 173 167 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM
prediccion <- predict(modelo.rad, datos.validacion)
kable(head(prediccion, 10))
x
1 MM
3 CH
10 CH
16 CH
18 CH
22 CH
26 CH
41 CH
45 CH
49 CH
kable(tail(prediccion, 10))
x
1011 CH
1014 CH
1020 CH
1039 CH
1040 MM
1043 CH
1053 CH
1056 MM
1057 CH
1059 CH
length(prediccion)
## [1] 213
nrow(datos.validacion)
## [1] 213
mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion
##         real
## predicho  CH  MM
##       CH 108  21
##       MM  22  62
ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

INTERPRETACION

Desde el inicio con la feafica de distribución de la variable de respuesta Purchase nos pudimos dar cueta que:

Con la graficasion de los costos de turing se obtiene que:

al fenerar la matriz de confucion de la prediccion se observa qu se obtinen el 81.6901 %,78.8732 % y 79.8122 % en términos de exactitud. al generar la Gráfica de los valores reales Vs predichos con estos valores de exactitud las diferencias entre una y otra no son tan grandes pero si las hay.