Contenido

Leo el archivo y cargo los paquetes

# Read the data set from a whitespace-delimited text file.
# stringsAsFactors = TRUE pins the pre-R-4.0 default explicitly, so any text
# column (such as the `clase` label used below) is loaded as a factor
# regardless of the R version running the script.
d <- read.table("web_content.txt", stringsAsFactors = TRUE)

# Dimensions of the data set: rows, then columns (sparse matrix)
c(nrow(d), ncol(d))
## [1]  816 2078
# What are we trying to predict?
# Whether a web page belongs to the "sports" or the "shopping" category,
# based on its content (the text on the page).
# Count of pages per class:
table(d[["clase"]])
## 
## shopping   sports 
##      497      319
# Names of the first 100 variables (columns)
colnames(d)[seq_len(100)]
##   [1] "abajo"       "abierta"     "abierto"     "abril"       "abrió"      
##   [6] "abrir"       "absoluta"    "acá"         "acaba"       "academia"   
##  [11] "acceso"      "accesorio"   "accion"      "acción"      "acerca"     
##  [16] "acosta"      "actitud"     "actividad"   "activo"      "acto"       
##  [21] "actuacion"   "actuación"   "actual"      "actualidad"  "actualment" 
##  [26] "acuerdo"     "adecuado"    "adelant"     "adema"       "ademá"      
##  [31] "adentro"     "adolfo"      "adrogué"     "adulto"      "afa"        
##  [36] "aficionado"  "afuera"      "agosto"      "agua"        "aguilar"    
##  [41] "agustín"     "ahí"         "ahora"       "air"         "alberto"    
##  [46] "alcanzar"    "alcorta"     "alegr"       "alegría"     "alejandro"  
##  [51] "alem"        "alemania"    "alex"        "alfredo"     "alguien"    
##  [56] "algún"       "alguna"      "alguno"      "all"         "allá"       
##  [61] "allend"      "allí"        "alma"        "almagro"     "almirant"   
##  [66] "alquil"      "alrededor"   "alta"        "alternativa" "alto"       
##  [71] "altura"      "alumno"      "amant"       "amarilla"    "amateur"    
##  [76] "amba"        "ambient"     "ambo"        "américa"     "amigo"      
##  [81] "amistad"     "amistoso"    "amor"        "amplia"      "amplio"     
##  [86] "ana"         "análisi"     "ancho"       "andes"       "andré"      
##  [91] "año"         "anotó"       "anterior"    "antigua"     "antiguo"    
##  [96] "antonio"     "aparec"      "apart"       "apena"       "apertura"
# Top-left corner of the data: first 5 rows, first 10 columns
d[seq_len(5), seq_len(10)]
##      abajo abierta  abierto abril abrió    abrir absoluta acá    acaba
## 1 0.004236       0 0.000000     0     0 0.004698        0   0 0.005393
## 2 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
## 3 0.000000       0 0.009674     0     0 0.000000        0   0 0.000000
## 4 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
## 5 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
##   academia
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
### Instalar los paquetes que digan FALSE
# En paralelo no anda en windows
library(caret)
## Warning: package 'caret' was built under R version 3.1.1
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Register a parallel backend with 4 cores; the cv.glmnet() calls below
# request it via parallel = TRUE.
registerDoParallel(cores = 4)

Divido en Training y Testing

# Feature names: every column except the class label
predictors <- names(d)[names(d) != "clase"]

# Fix the RNG seed so the random train/test split below is the same on every
# run (the seed value itself is arbitrary). Without it the partition, and
# every metric computed from it, changes from run to run.
set.seed(1234)

# Put 70% of the rows in the training set; see ?createDataPartition
inTrainingSet <- createDataPartition(d$clase, p = 0.7, list = FALSE)

train <- d[inTrainingSet, ]
test <- d[-inTrainingSet, ]

# glmnet needs a matrix as input, hence as.matrix()
x <- as.matrix(train[, predictors])
y <- train$clase

Regresión Lasso - Selección de variables

# Lasso (alpha = 1) logistic regression: cross-validation scores each lambda
# by AUC. The argument name is written out in full (`type.measure`) instead
# of relying on R's partial argument matching.
cvfit_lasso <- cv.glmnet(x, y, family = "binomial", alpha = 1,
                         parallel = TRUE, type.measure = "auc")
# Cross-validation curve: AUC as a function of lambda
plot(cvfit_lasso)

plot of chunk unnamed-chunk-4

Resultados

# Predict the class of each test page at the lambda with the best
# cross-validated score.
pred <- predict(cvfit_lasso, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")
# predict(type = "class") returns a one-column character matrix, while
# confusionMatrix() documents factor inputs with matching levels. Coerce
# both sides to factors that share the same level set.
test_labels <- factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(test_labels))
confusionMatrix(pred, test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction shopping sports
##   shopping      132      3
##   sports         17     92
##                                         
##                Accuracy : 0.918         
##                  95% CI : (0.876, 0.949)
##     No Information Rate : 0.611         
##     P-Value [Acc > NIR] : < 2e-16       
##                                         
##                   Kappa : 0.832         
##  Mcnemar's Test P-Value : 0.00365       
##                                         
##             Sensitivity : 0.886         
##             Specificity : 0.968         
##          Pos Pred Value : 0.978         
##          Neg Pred Value : 0.844         
##              Prevalence : 0.611         
##          Detection Rate : 0.541         
##    Detection Prevalence : 0.553         
##       Balanced Accuracy : 0.927         
##                                         
##        'Positive' Class : shopping      
## 

# Ridge (alpha = 0) - keeps every variable in the model!
# NOTE: the model is fitted with family = "multinomial" (not "binomial"),
# on standardized predictors.

cvfit_ridge <- cv.glmnet(x, y, family = "multinomial",
                         alpha = 0, parallel = TRUE, standardize = TRUE)
# Cross-validation curve as a function of lambda
plot(cvfit_ridge)

plot of chunk unnamed-chunk-6

Resultados Ridge

# Predict the class of each test page at the lambda with the best
# cross-validated score. The stray `type.measur = 'auc'` argument was
# dropped: it is a cv.glmnet() option and has no meaning for predict().
pred <- predict(cvfit_ridge, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")
# predict(type = "class") returns a one-column character matrix, while
# confusionMatrix() documents factor inputs with matching levels. Coerce
# both sides to factors that share the same level set.
test_labels <- factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(test_labels))
confusionMatrix(pred, test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction shopping sports
##   shopping      137      0
##   sports         12     95
##                                         
##                Accuracy : 0.951         
##                  95% CI : (0.916, 0.974)
##     No Information Rate : 0.611         
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.899         
##  Mcnemar's Test P-Value : 0.0015        
##                                         
##             Sensitivity : 0.919         
##             Specificity : 1.000         
##          Pos Pred Value : 1.000         
##          Neg Pred Value : 0.888         
##              Prevalence : 0.611         
##          Detection Rate : 0.561         
##    Detection Prevalence : 0.561         
##       Balanced Accuracy : 0.960         
##                                         
##        'Positive' Class : shopping      
## 

Elastic-Net

# Elastic-net, multinomial family, predictors not standardized.
# alpha must be given explicitly: glmnet's default is alpha = 1, which is a
# pure lasso penalty. alpha = 0.5 mixes the lasso and ridge penalties evenly.
# NOTE(review): any results recorded from a run without `alpha` reflect a
# lasso fit and need to be regenerated.
cvfit_elastic <- cv.glmnet(x, y, family = "multinomial", alpha = 0.5,
                           parallel = TRUE, standardize = FALSE)

# Cross-validation curve as a function of lambda
plot(cvfit_elastic)

plot of chunk unnamed-chunk-8

Resultados Elastic-Net

# Predict the class of each test page at the lambda with the best
# cross-validated score.
pred <- predict(cvfit_elastic, newx = as.matrix(test[, predictors]),
                s = "lambda.min", type = "class")
# predict(type = "class") returns a one-column character matrix, while
# confusionMatrix() documents factor inputs with matching levels. Coerce
# both sides to factors that share the same level set.
test_labels <- factor(test$clase)
pred <- factor(as.vector(pred), levels = levels(test_labels))
confusionMatrix(pred, test_labels)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction shopping sports
##   shopping      143      4
##   sports          6     91
##                                        
##                Accuracy : 0.959        
##                  95% CI : (0.926, 0.98)
##     No Information Rate : 0.611        
##     P-Value [Acc > NIR] : <2e-16       
##                                        
##                   Kappa : 0.914        
##  Mcnemar's Test P-Value : 0.752        
##                                        
##             Sensitivity : 0.960        
##             Specificity : 0.958        
##          Pos Pred Value : 0.973        
##          Neg Pred Value : 0.938        
##              Prevalence : 0.611        
##          Detection Rate : 0.586        
##    Detection Prevalence : 0.602        
##       Balanced Accuracy : 0.959        
##                                        
##        'Positive' Class : shopping     
##