Contenido
- Repaso de Lasso - Ridge
- Elastic-Net: caso general de Lasso y Ridge
- Caso práctico - Web content mining
Leo el archivo y Cargo paquetes
# Load the web-content dataset from the current working directory.
d <- read.table(file = "web_content.txt")
# Size of the dataset as c(rows, columns); the data is a sparse matrix.
c(nrow(d), ncol(d))
## [1] 816 2078
# Prediction target: whether a web page belongs to the "sports" or the
# "shopping" category, based on its content (the text on the page).
# Count how many pages fall in each category.
table(d[["clase"]])
##
## shopping sports
## 497 319
# First 100 variable names.
colnames(d)[seq_len(100)]
## [1] "abajo" "abierta" "abierto" "abril" "abrió"
## [6] "abrir" "absoluta" "acá" "acaba" "academia"
## [11] "acceso" "accesorio" "accion" "acción" "acerca"
## [16] "acosta" "actitud" "actividad" "activo" "acto"
## [21] "actuacion" "actuación" "actual" "actualidad" "actualment"
## [26] "acuerdo" "adecuado" "adelant" "adema" "ademá"
## [31] "adentro" "adolfo" "adrogué" "adulto" "afa"
## [36] "aficionado" "afuera" "agosto" "agua" "aguilar"
## [41] "agustÃn" "ahÃ" "ahora" "air" "alberto"
## [46] "alcanzar" "alcorta" "alegr" "alegrÃa" "alejandro"
## [51] "alem" "alemania" "alex" "alfredo" "alguien"
## [56] "algún" "alguna" "alguno" "all" "allá"
## [61] "allend" "allÃ" "alma" "almagro" "almirant"
## [66] "alquil" "alrededor" "alta" "alternativa" "alto"
## [71] "altura" "alumno" "amant" "amarilla" "amateur"
## [76] "amba" "ambient" "ambo" "américa" "amigo"
## [81] "amistad" "amistoso" "amor" "amplia" "amplio"
## [86] "ana" "análisi" "ancho" "andes" "andré"
## [91] "año" "anotó" "anterior" "antigua" "antiguo"
## [96] "antonio" "aparec" "apart" "apena" "apertura"
# Top-left corner of the data: first 5 rows of the first 10 columns.
d[seq_len(5), seq_len(10)]
## abajo abierta abierto abril abrió abrir absoluta acá acaba
## 1 0.004236 0 0.000000 0 0 0.004698 0 0 0.005393
## 2 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## 3 0.000000 0 0.009674 0 0 0.000000 0 0 0.000000
## 4 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## 5 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## academia
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
### Instalar los paquetes que digan FALSE
# En paralelo no anda en windows
library(caret)
## Warning: package 'caret' was built under R version 3.1.1
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register a parallel backend with 4 cores so that calls made with
# parallel = TRUE can run in parallel.
registerDoParallel(cores = 4)
Divido en Training y Testing
# Fix the RNG seed so the random train/test partition below is identical on
# every run; without it the split cannot be reproduced.
set.seed(123)

# Feature columns: every variable except the response "clase".
column_names <- names(d)
predictors <- column_names[column_names != "clase"]

# Hold out 30% of the rows for testing. createDataPartition() samples within
# each class, so both sets keep roughly the original class proportions.
inTrainingSet <- createDataPartition(d$clase, p = 0.7, list = FALSE)
train <- d[inTrainingSet, ]
test <- d[-inTrainingSet, ]

# glmnet expects a matrix of predictors rather than a data frame.
x <- as.matrix(train[, predictors])
y <- train$clase
Regresión Lasso - Selección de variables
# Lasso (alpha = 1) logistic regression. Cross-validation scores every lambda
# on the path by AUC. The argument is spelled out as `type.measure` instead of
# relying on partial argument matching.
cvfit_lasso <- cv.glmnet(
  x, y,
  family = "binomial",
  alpha = 1,
  type.measure = "auc",
  parallel = TRUE
)
# Cross-validated AUC as a function of lambda.
plot(cvfit_lasso)

Resultados
# Predict the class of each test page at the lambda that gave the best
# cross-validated score.
pred <- predict(cvfit_lasso, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")

# predict(type = "class") returns a one-column character matrix, and recent
# caret releases require both arguments of confusionMatrix() to be factors
# with the same levels, so coerce both explicitly.
observed <- as.factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(observed))
confusionMatrix(pred, observed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction shopping sports
## shopping 132 3
## sports 17 92
##
## Accuracy : 0.918
## 95% CI : (0.876, 0.949)
## No Information Rate : 0.611
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.832
## Mcnemar's Test P-Value : 0.00365
##
## Sensitivity : 0.886
## Specificity : 0.968
## Pos Pred Value : 0.978
## Neg Pred Value : 0.844
## Prevalence : 0.611
## Detection Rate : 0.541
## Detection Prevalence : 0.553
## Balanced Accuracy : 0.927
##
## 'Positive' Class : shopping
##
# Ridge (alpha = 0), fitted with the multinomial family on standardized
# predictors. Unlike lasso, ridge keeps every variable in the model.
cvfit_ridge <- cv.glmnet(
  x, y,
  family = "multinomial",
  alpha = 0,
  standardize = TRUE,
  parallel = TRUE
)
plot(cvfit_ridge)

Resultados Ridge
# Predict the class of each test page at the lambda that gave the best
# cross-validated score. The former `type.measur = 'auc'` argument was dropped:
# it is a cross-validation option and has no meaning for predict().
pred <- predict(cvfit_ridge, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")

# predict(type = "class") returns a one-column character matrix, and recent
# caret releases require both arguments of confusionMatrix() to be factors
# with the same levels, so coerce both explicitly.
observed <- as.factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(observed))
confusionMatrix(pred, observed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction shopping sports
## shopping 137 0
## sports 12 95
##
## Accuracy : 0.951
## 95% CI : (0.916, 0.974)
## No Information Rate : 0.611
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.899
## Mcnemar's Test P-Value : 0.0015
##
## Sensitivity : 0.919
## Specificity : 1.000
## Pos Pred Value : 1.000
## Neg Pred Value : 0.888
## Prevalence : 0.611
## Detection Rate : 0.561
## Detection Prevalence : 0.561
## Balanced Accuracy : 0.960
##
## 'Positive' Class : shopping
##
Elastic-Net
# Elastic-net, multinomial family, predictors not standardized.
# glmnet defaults to alpha = 1 (pure lasso), so the mixing parameter must be
# set explicitly to get an elastic-net penalty. 0.5 weights the lasso and
# ridge penalties equally; NOTE(review): tune alpha if another mix is wanted.
cvfit_elastic <- cv.glmnet(
  x, y,
  family = "multinomial",
  alpha = 0.5,
  standardize = FALSE,
  parallel = TRUE
)
plot(cvfit_elastic)

Resultados Elastic-Net
# Predict the class of each test page at the lambda that gave the best
# cross-validated score.
pred <- predict(cvfit_elastic, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")

# predict(type = "class") returns a one-column character matrix, and recent
# caret releases require both arguments of confusionMatrix() to be factors
# with the same levels, so coerce both explicitly.
observed <- as.factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(observed))
confusionMatrix(pred, observed)
## Confusion Matrix and Statistics
##
## Reference
## Prediction shopping sports
## shopping 143 4
## sports 6 91
##
## Accuracy : 0.959
## 95% CI : (0.926, 0.98)
## No Information Rate : 0.611
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.914
## Mcnemar's Test P-Value : 0.752
##
## Sensitivity : 0.960
## Specificity : 0.958
## Pos Pred Value : 0.973
## Neg Pred Value : 0.938
## Prevalence : 0.611
## Detection Rate : 0.586
## Detection Prevalence : 0.602
## Balanced Accuracy : 0.959
##
## 'Positive' Class : shopping
##