practica13

library(e1071)

## Warning: package 'e1071' was built under R version 3.6.3

library(ISLR)

## Warning: package 'ISLR' was built under R version 3.6.3

library(knitr) # Para ver tablas mas amigables en formato html markdown

## Warning: package 'knitr' was built under R version 3.6.3

library(ggplot2) # Gráficas

## Warning: package 'ggplot2' was built under R version 3.6.3

library(dplyr) # Varias operaciones

## Warning: package 'dplyr' was built under R version 3.6.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret) # Para particionar datos. De entranamiento y de validación

## Warning: package 'caret' was built under R version 3.6.3

## Loading required package: lattice

#install.packages("e1071") # Para SVM
library(e1071)
datos <- OJ
str(datos)

## 'data.frame':    1070 obs. of  18 variables:
##  $ Purchase      : Factor w/ 2 levels "CH","MM": 1 1 1 2 1 1 1 1 1 1 ...
##  $ WeekofPurchase: num  237 239 245 227 228 230 232 234 235 238 ...
##  $ StoreID       : num  1 1 1 1 7 7 7 7 7 7 ...
##  $ PriceCH       : num  1.75 1.75 1.86 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceMM       : num  1.99 1.99 2.09 1.69 1.69 1.99 1.99 1.99 1.99 1.99 ...
##  $ DiscCH        : num  0 0 0.17 0 0 0 0 0 0 0 ...
##  $ DiscMM        : num  0 0.3 0 0 0 0 0.4 0.4 0.4 0.4 ...
##  $ SpecialCH     : num  0 0 0 0 0 0 1 1 0 0 ...
##  $ SpecialMM     : num  0 1 0 0 0 1 1 0 0 0 ...
##  $ LoyalCH       : num  0.5 0.6 0.68 0.4 0.957 ...
##  $ SalePriceMM   : num  1.99 1.69 2.09 1.69 1.69 1.99 1.59 1.59 1.59 1.59 ...
##  $ SalePriceCH   : num  1.75 1.75 1.69 1.69 1.69 1.69 1.69 1.75 1.75 1.75 ...
##  $ PriceDiff     : num  0.24 -0.06 0.4 0 0 0.3 -0.1 -0.16 -0.16 -0.16 ...
##  $ Store7        : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 2 2 2 ...
##  $ PctDiscMM     : num  0 0.151 0 0 0 ...
##  $ PctDiscCH     : num  0 0 0.0914 0 0 ...
##  $ ListPriceDiff : num  0.24 0.24 0.23 0 0 0.3 0.3 0.24 0.24 0.24 ...
##  $ STORE         : num  1 1 1 1 0 0 0 0 0 0 ...

kable(summary(datos))

Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	SpecialCH	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
CH:653	Min. :227.0	Min. :1.00	Min. :1.690	Min. :1.690	Min. :0.00000	Min. :0.0000	Min. :0.0000	Min. :0.0000	Min. :0.000011	Min. :1.190	Min. :1.390	Min. :-0.6700	No :714	Min. :0.0000	Min. :0.00000	Min. :0.000	Min. :0.000
MM:417	1st Qu.:240.0	1st Qu.:2.00	1st Qu.:1.790	1st Qu.:1.990	1st Qu.:0.00000	1st Qu.:0.0000	1st Qu.:0.0000	1st Qu.:0.0000	1st Qu.:0.325257	1st Qu.:1.690	1st Qu.:1.750	1st Qu.: 0.0000	Yes:356	1st Qu.:0.0000	1st Qu.:0.00000	1st Qu.:0.140	1st Qu.:0.000
NA	Median :257.0	Median :3.00	Median :1.860	Median :2.090	Median :0.00000	Median :0.0000	Median :0.0000	Median :0.0000	Median :0.600000	Median :2.090	Median :1.860	Median : 0.2300	NA	Median :0.0000	Median :0.00000	Median :0.240	Median :2.000
NA	Mean :254.4	Mean :3.96	Mean :1.867	Mean :2.085	Mean :0.05186	Mean :0.1234	Mean :0.1477	Mean :0.1617	Mean :0.565782	Mean :1.962	Mean :1.816	Mean : 0.1465	NA	Mean :0.0593	Mean :0.02731	Mean :0.218	Mean :1.631
NA	3rd Qu.:268.0	3rd Qu.:7.00	3rd Qu.:1.990	3rd Qu.:2.180	3rd Qu.:0.00000	3rd Qu.:0.2300	3rd Qu.:0.0000	3rd Qu.:0.0000	3rd Qu.:0.850873	3rd Qu.:2.130	3rd Qu.:1.890	3rd Qu.: 0.3200	NA	3rd Qu.:0.1127	3rd Qu.:0.00000	3rd Qu.:0.300	3rd Qu.:3.000
NA	Max. :278.0	Max. :7.00	Max. :2.090	Max. :2.290	Max. :0.50000	Max. :0.8000	Max. :1.0000	Max. :1.0000	Max. :0.999947	Max. :2.290	Max. :2.090	Max. : 0.6400	NA	Max. :0.4020	Max. :0.25269	Max. :0.440	Max. :4.000

ggplot(data = OJ, aes(x = Purchase, y = ..count.., fill = Purchase)) +
geom_bar() +
labs(title = "Distribución de 'Purchase'") +
scale_fill_manual(values = c("darkgreen", "orangered2"),
labels = c("Citrus Hill", "Orange Juice")) +
theme_bw() + theme(plot.title = element_text(hjust = 0.5))

frecuencias <- prop.table(table(datos$Purchase)) %>%
round(digits = 2)
frecuencias

## 
##   CH   MM 
## 0.61 0.39

set.seed(123)
entrena <- createDataPartition(y = datos$Purchase, p = 0.8, list = FALSE, times = 1)
# Datos entrenamiento
datos.entrenamiento <- datos[entrena, ]
# Datos validación
datos.validacion <- datos[-entrena, ]
kable(head(datos))

Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
CH	237	1	1.75	1.99	0.00	0.0	0	0.500000	1.99	1.75	0.24	No	0.000000	0.000000	0.24	1
CH	239	1	1.75	1.99	0.00	0.3	1	0.600000	1.69	1.75	-0.06	No	0.150754	0.000000	0.24	1
CH	245	1	1.86	2.09	0.17	0.0	0	0.680000	2.09	1.69	0.40	No	0.000000	0.091398	0.23	1
MM	227	1	1.69	1.69	0.00	0.0	0	0.400000	1.69	1.69	0.00	No	0.000000	0.000000	0.00	1
CH	228	7	1.69	1.69	0.00	0.0	0	0.956535	1.69	1.69	0.00	Yes	0.000000	0.000000	0.00	0
CH	230	7	1.69	1.99	0.00	0.0	1	0.965228	1.99	1.69	0.30	Yes	0.000000	0.000000	0.30	0

kable(tail(datos))

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
1065	CH	251	7	1.86	2.09	0.1	0.00	0.484778	2.09	1.76	0.33	Yes	0.000000	0.053763	0.23	0
1066	CH	252	7	1.86	2.09	0.1	0.00	0.587822	2.09	1.76	0.33	Yes	0.000000	0.053763	0.23	0
1067	CH	256	7	1.86	2.18	0.0	0.00	0.670258	2.18	1.86	0.32	Yes	0.000000	0.000000	0.32	0
1068	MM	257	7	1.86	2.18	0.0	0.00	0.736206	2.18	1.86	0.32	Yes	0.000000	0.000000	0.32	0
1069	CH	261	7	1.86	2.13	0.0	0.24	0.588965	1.89	1.86	0.03	Yes	0.112676	0.000000	0.27	0
1070	CH	270	1	1.86	2.18	0.0	0.00	0.671172	2.18	1.86	0.32	No	0.000000	0.000000	0.32	1

kable(head(datos.entrenamiento))

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscMM	SpecialCH	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	ListPriceDiff	STORE
2	CH	239	1	1.75	1.99	0.3	0	1	0.600000	1.69	1.75	-0.06	No	0.150754	0.24	1
4	MM	227	1	1.69	1.69	0.0	0	0	0.400000	1.69	1.69	0.00	No	0.000000	0.00	1
5	CH	228	7	1.69	1.69	0.0	0	0	0.956535	1.69	1.69	0.00	Yes	0.000000	0.00	0
6	CH	230	7	1.69	1.99	0.0	0	1	0.965228	1.99	1.69	0.30	Yes	0.000000	0.30	0
7	CH	232	7	1.69	1.99	0.4	1	1	0.972182	1.59	1.69	-0.10	Yes	0.201005	0.30	0
8	CH	234	7	1.75	1.99	0.4	1	0	0.977746	1.59	1.75	-0.16	Yes	0.201005	0.24	0

kable(tail(datos.entrenamiento))

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
1065	CH	251	7	1.86	2.09	0.1	0.00	0.484778	2.09	1.76	0.33	Yes	0.000000	0.053763	0.23	0
1066	CH	252	7	1.86	2.09	0.1	0.00	0.587822	2.09	1.76	0.33	Yes	0.000000	0.053763	0.23	0
1067	CH	256	7	1.86	2.18	0.0	0.00	0.670258	2.18	1.86	0.32	Yes	0.000000	0.000000	0.32	0
1068	MM	257	7	1.86	2.18	0.0	0.00	0.736206	2.18	1.86	0.32	Yes	0.000000	0.000000	0.32	0
1069	CH	261	7	1.86	2.13	0.0	0.24	0.588965	1.89	1.86	0.03	Yes	0.112676	0.000000	0.27	0
1070	CH	270	1	1.86	2.18	0.0	0.00	0.671172	2.18	1.86	0.32	No	0.000000	0.000000	0.32	1

kable(head(datos.validacion))

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscCH	DiscMM	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	PctDiscCH	ListPriceDiff	STORE
1	CH	237	1	1.75	1.99	0.00	0.0	0	0.500000	1.99	1.75	0.24	No	0.000000	0.000000	0.24	1
3	CH	245	1	1.86	2.09	0.17	0.0	0	0.680000	2.09	1.69	0.40	No	0.000000	0.091398	0.23	1
10	CH	238	7	1.75	1.99	0.00	0.4	0	0.985757	1.59	1.75	-0.16	Yes	0.201005	0.000000	0.24	0
16	CH	278	7	2.06	2.13	0.00	0.0	0	0.795200	2.13	2.06	0.07	Yes	0.000000	0.000000	0.07	0
18	MM	268	2	1.86	2.18	0.00	0.0	1	0.400000	2.18	1.86	0.32	No	0.000000	0.000000	0.32	2
22	CH	258	1	1.76	2.18	0.00	0.0	0	0.680000	2.18	1.76	0.42	No	0.000000	0.000000	0.42	1

kable(tail(datos.validacion))

	Purchase	WeekofPurchase	StoreID	PriceCH	PriceMM	DiscMM	SpecialCH	SpecialMM	LoyalCH	SalePriceMM	SalePriceCH	PriceDiff	Store7	PctDiscMM	ListPriceDiff	STORE
1040	CH	237	7	1.75	1.99	0.4	0	0	0.320000	1.59	1.75	-0.16	Yes	0.201005	0.24	0
1043	CH	239	1	1.75	1.99	0.3	0	1	0.651840	1.69	1.75	-0.06	No	0.150754	0.24	1
1053	CH	237	7	1.75	1.99	0.4	0	0	0.740928	1.59	1.75	-0.16	Yes	0.201005	0.24	0
1056	MM	227	1	1.69	1.69	0.0	0	0	0.320000	1.69	1.69	0.00	No	0.000000	0.00	1
1057	CH	228	7	1.69	1.69	0.0	0	0	0.256000	1.69	1.69	0.00	Yes	0.000000	0.00	0
1059	CH	233	7	1.75	1.99	0.4	1	0	0.523840	1.59	1.75	-0.16	Yes	0.201005	0.24	0

set.seed(325)
tuning <- tune(svm, Purchase ~ ., data = datos.entrenamiento,
kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)),
scale = TRUE)
summary(tuning)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##    15
## 
## - best performance: 0.1656772 
## 
## - Detailed performance results:
##     cost     error dispersion
## 1  0.001 0.3009850 0.05276636
## 2  0.010 0.1691382 0.04602764
## 3  0.100 0.1679754 0.04037149
## 4  1.000 0.1703010 0.04099747
## 5  5.000 0.1703010 0.03755497
## 6 10.000 0.1668263 0.03560699
## 7 15.000 0.1656772 0.03338668
## 8 20.000 0.1668399 0.03479758

names(tuning)

## [1] "best.parameters"  "best.performance" "method"           "nparcomb"        
## [5] "train.ind"        "sampling"         "performances"     "best.model"

ggplot(data = tuning$performances, aes(x = cost, y = error)) +
geom_line() +
geom_point() +
labs(title = "Error de validación ~ hiperparámetro C") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))

modelo <- tuning$best.model
summary(modelo)

## 
## Call:
## best.tune(method = svm, train.x = Purchase ~ ., data = datos.entrenamiento, 
##     ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 15, 20)), 
##     kernel = "linear", scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  15 
## 
## Number of Support Vectors:  345
## 
##  ( 173 172 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

modelo.lin <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "linear", cost = 15, scale = TRUE)
summary(modelo.lin)

## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "linear", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  15 
## 
## Number of Support Vectors:  345
## 
##  ( 173 172 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

prediccion <- predict(modelo, datos.validacion)
kable(head(prediccion, 10))

	x
1	CH
3	CH
10	CH
16	CH
18	CH
22	CH
26	CH
41	CH
45	CH
49	CH

kable(tail(prediccion, 10))

	x
1011	CH
1014	CH
1020	CH
1039	MM
1040	MM
1043	CH
1053	CH
1056	MM
1057	MM
1059	CH

length(prediccion)

## [1] 213

nrow(datos.validacion)

## [1] 213

mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion

##         real
## predicho  CH  MM
##       CH 108  17
##       MM  22  66

ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.pol <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial", cost = 15, scale = TRUE)
summary(modelo.pol)

## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "polynomial", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  15 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  342
## 
##  ( 177 165 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

prediccion <- predict(modelo.pol, datos.validacion)
kable(head(prediccion, 10))

	x
1	CH
3	CH
10	CH
16	CH
18	CH
22	CH
26	CH
41	CH
45	CH
49	CH

kable(tail(prediccion, 10))

	x
1011	CH
1014	CH
1020	CH
1039	MM
1040	MM
1043	CH
1053	CH
1056	MM
1057	CH
1059	MM

length(prediccion)

## [1] 213

nrow(datos.validacion)

## [1] 213

mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion

##         real
## predicho  CH  MM
##       CH 108  23
##       MM  22  60

ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

modelo.rad <- svm(Purchase ~ ., data = datos.entrenamiento, kernel = "radial", cost = 15, scale = TRUE)
summary(modelo.rad)

## 
## Call:
## svm(formula = Purchase ~ ., data = datos.entrenamiento, kernel = "radial", 
##     cost = 15, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  15 
## 
## Number of Support Vectors:  340
## 
##  ( 173 167 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  CH MM

prediccion <- predict(modelo.rad, datos.validacion)
kable(head(prediccion, 10))

	x
1	MM
3	CH
10	CH
16	CH
18	CH
22	CH
26	CH
41	CH
45	CH
49	CH

kable(tail(prediccion, 10))

	x
1011	CH
1014	CH
1020	CH
1039	CH
1040	MM
1043	CH
1053	CH
1056	MM
1057	CH
1059	CH

length(prediccion)

## [1] 213

nrow(datos.validacion)

## [1] 213

mat.confusion <- table(predicho  = prediccion, real = datos.validacion$Purchase)
mat.confusion

##         real
## predicho  CH  MM
##       CH 108  21
##       MM  22  62

ggplot(datos.validacion, aes(x = SalePriceCH, y = PriceCH)) +
geom_point(aes(colour = factor(prediccion)))

ggplot(datos.validacion, aes(x = SalePriceMM, y = PriceMM)) +
geom_point(aes(colour = factor(prediccion)))

INTERPRETACION

Desde el inicio con la feafica de distribución de la variable de respuesta Purchase nos pudimos dar cueta que:

El 61 % de las preferencias de los clientes es para el refresco “CM” CH = Citrus Hill
El 39 % de las preferencias de los clientes es para el refresco “CM” MM = Minute Maid Orange Juice

Con la graficasion de los costos de turing se obtiene que:

El valor de coste que obtiene el error de validación mas chico es 15.

al fenerar la matriz de confucion de la prediccion se observa qu se obtinen el 81.6901 %,78.8732 % y 79.8122 % en términos de exactitud. al generar la Gráfica de los valores reales Vs predichos con estos valores de exactitud las diferencias entre una y otra no son tan grandes pero si las hay.

practica13

DavidGv

29/4/2020

INTERPRETACION