Ejemplo

Leo el archivo y cargo los paquetes

# Turn off scientific notation when printing numbers
options(scipen = 666)

# Fix the random seed.
set.seed(666)

# Read the data file
d <- read.table("web_content.txt")

## Dimensions (sparse matrix)
dim(d)
## [1]  816 2078
# What are we trying to predict?
# Whether a web page belongs to the "sports" or "shopping" category, based on
# its content (the text on the page)
table(d$clase)
## 
## shopping   sports 
##      497      319
## Variable names (a random sample of 100)
# NOTE(review): sample(2000, 100) only draws from the first 2000 column
# positions although dim(d) reports 2078 columns - confirm this is intended.
names(d)[sample(2000, 100)]
##   [1] "presenta"     "comunicación" "turismo"      "conferencia" 
##   [5] "espera"       "pinamar"      "tucumán"      "ingreso"     
##   [9] "adecuado"     "debut"        "preparación"  "adrogué"     
##  [13] "belgrano"     "carrera"      "constitución" "puro"        
##  [17] "amant"        "santo"        "iba"          "hermano"     
##  [21] "trébol"       "marcar"       "amba"         "cargo"       
##  [25] "pudo"         "defend"       "amplio"       "mas"         
##  [29] "leer"         "reino"        "hermosa"      "feliciano"   
##  [33] "próximo"      "instituto"    "método"       "importancia" 
##  [37] "demostró"     "segunda"      "precio"       "lesion"      
##  [41] "cuanto"       "jorg"         "busco"        "tomé"        
##  [45] "salió"        "peñarol"      "estonia"      "manual"      
##  [49] "institucion"  "tortuguita"   "aseguró"      "bahía"       
##  [53] "campana"      "perdido"      "usado"        "noticia"     
##  [57] "usa"          "roqu"         "banfield"     "amarilla"    
##  [61] "pleno"        "propia"       "dar"          "mínima"      
##  [65] "seguro"       "deberá"       "trabaja"      "salto"       
##  [69] "llevó"        "sauc"         "maestro"      "ducha"       
##  [73] "morón"        "reloj"        "aruba"        "posicion"    
##  [77] "letonia"      "antiguo"      "tanta"        "turno"       
##  [81] "peso"         "pista"        "dinamarca"    "quedan"      
##  [85] "dio"          "pequeña"      "suelo"        "chipr"       
##  [89] "nota"         "recuerda"     "trayectoria"  "ausencia"    
##  [93] "varia"        "libro"        "isidro"       "blanco"      
##  [97] "parec"        "gesel"        "mejía"        "pasada"
## First rows and columns of the explanatory variables
d[seq_len(5), seq_len(10)]
##      abajo abierta  abierto abril abrió    abrir absoluta acá    acaba
## 1 0.004236       0 0.000000     0     0 0.004698        0   0 0.005393
## 2 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
## 3 0.000000       0 0.009674     0     0 0.000000        0   0 0.000000
## 4 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
## 5 0.000000       0 0.000000     0     0 0.000000        0   0 0.000000
##   academia
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
### Instalar los paquetes que digan FALSE
# En paralelo no anda en windows
library(caret)
## Warning: package 'caret' was built under R version 3.1.1
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(corrplot)

# Register the backend used for parallel processing (4 cores)
registerDoParallel(cores = 4)

# cor <- cor(d[,names(d)[!names(d) %in% c('clase')]])
# corrplot(cor)

Divido en Training y Testing

# Predictor names: every column except the response "clase"
predictors <- names(d)[names(d) != "clase"]

# 50/50 split of the rows on the response; see ?createDataPartition
set.seed(666)
inTrainingSet <- createDataPartition(d$clase, p = 0.5, list = FALSE)

train <- d[inTrainingSet, ]
test <- d[-inTrainingSet, ]

# glmnet requires a matrix as input, hence as.matrix().
# With categorical predictors, model.matrix() would be needed instead.
x <- as.matrix(train[, predictors])
y <- train$clase

Ejemplo del path usando regresión lasso

## Example of the coefficient path using lasso regression
set.seed(666)
fit_lasso <- glmnet(
  x, y,
  family = "binomial",
  alpha = 1,
  standardize = TRUE
)

# Coefficients against deviance
plot(fit_lasso, xvar = "dev", label = TRUE)

plot of chunk unnamed-chunk-4

#' Non-zero coefficients of a fitted model at a given penalty
#'
#' Extracts the coefficients with `coef(fit, s = s)`, keeps the non-zero ones
#' and sorts them by decreasing absolute value.
#'
#' @param fit A fitted model object accepted by `coef()` (glmnet fits here).
#' @param s Penalty value (lambda) passed on to `coef()`.
#' @return A data frame with columns `coef` (numeric) and `vars` (character,
#'   the coefficient name), one row per non-zero coefficient. Row names are
#'   the coefficient names.
coef2 <- function(fit, s) {
  cf <- as.matrix(coef(fit, s = s))

  # Work on a named vector. Indexing the one-column matrix as cf[keep, ] drops
  # the name whenever exactly one coefficient is non-zero (for instance only
  # the intercept at a large lambda), so names are attached explicitly.
  nombres <- rownames(cf)
  if (is.null(nombres)) {
    nombres <- as.character(seq_len(nrow(cf)))
  }
  valores <- as.vector(cf[, 1])
  names(valores) <- nombres
  valores <- valores[valores != 0]

  res <- data.frame(
    coef = valores,
    vars = names(valores),
    row.names = names(valores),
    stringsAsFactors = FALSE
  )
  res[order(abs(res$coef), decreasing = TRUE), ]
}

# Pick 32 variables (lambda = 0.059660)

# Some coefficients are very large; I can get rid of them
coef2(fit_lasso, s = 0.059660)
##                    coef        vars
## blanca      -274.652257      blanca
## kong        -124.723650        kong
## inmuebl      -92.301002     inmuebl
## bajo         -47.041971        bajo
## ideal        -25.538109       ideal
## vez           25.520169         vez
## plata        -20.531918       plata
## publicacion  -17.213590 publicacion
## último        12.179236      último
## arab         -11.827196        arab
## capit        -11.225988       capit
## partido        7.987047     partido
## fecha          6.941993       fecha
## cierto         6.480958      cierto
## jugador        6.016521     jugador
## equipo         4.839720      equipo
## hace           4.179027        hace
## venta         -4.005150       venta
## segunda        3.430944     segunda
## pago          -3.306324        pago
## club           2.990045        club
## servicio      -2.789868    servicio
## entrega       -2.321063     entrega
## auto          -2.269953        auto
## torneo         1.262399      torneo
## aviso         -1.042313       aviso
## oferta        -0.998149      oferta
## deport         0.802503      deport
## (Intercept)    0.628807 (Intercept)
## pesar          0.402310       pesar
## país          -0.180308        país
## rival          0.159893       rival
## encuentro      0.083354   encuentro
## alta          -0.001166        alta
# Lower and upper bounds can be placed on the coefficients.
# The bound arguments are spelled out in full (lower.limits / upper.limits)
# so the call does not depend on partial argument matching.
fit_lasso_restringido <- glmnet(x, y, family = "binomial", alpha = 1,
                                standardize = TRUE,
                                lower.limits = -40, upper.limits = 40)
fit_lasso_restringido
## 
## Call:  glmnet(x = x, y = y, family = "binomial", alpha = 1, standardize = TRUE,      lower.limits = -40, upper.limits = 40) 
## 
##         Df                %Dev  Lambda
##   [1,]   0 0.00000000000000382 0.33400
##   [2,]   2 0.03290000000000000 0.31800
##   [3,]   2 0.06460000000000000 0.30400
##   [4,]   2 0.09310000000000000 0.29000
##   [5,]   4 0.11799999999999999 0.27700
##   [6,]   4 0.14199999999999999 0.26400
##   [7,]   4 0.16400000000000001 0.25200
##   [8,]   4 0.18500000000000000 0.24100
##   [9,]   5 0.20200000000000001 0.23000
##  [10,]   5 0.21800000000000000 0.21900
##  [11,]   5 0.23400000000000001 0.20900
##  [12,]   7 0.24800000000000000 0.20000
##  [13,]   7 0.26200000000000001 0.19100
##  [14,]   8 0.27500000000000002 0.18200
##  [15,]   8 0.28699999999999998 0.17400
##  [16,]   8 0.29899999999999999 0.16600
##  [17,]   8 0.31000000000000000 0.15800
##  [18,]   8 0.32100000000000001 0.15100
##  [19,]   9 0.33000000000000002 0.14400
##  [20,]  11 0.33900000000000002 0.13800
##  [21,]  13 0.34999999999999998 0.13200
##  [22,]  14 0.36199999999999999 0.12600
##  [23,]  16 0.37800000000000000 0.12000
##  [24,]  16 0.39400000000000002 0.11400
##  [25,]  17 0.40999999999999998 0.10900
##  [26,]  17 0.42399999999999999 0.10400
##  [27,]  19 0.43800000000000000 0.09950
##  [28,]  24 0.45300000000000001 0.09500
##  [29,]  26 0.46600000000000003 0.09070
##  [30,]  24 0.47899999999999998 0.08660
##  [31,]  25 0.49199999999999999 0.08260
##  [32,]  26 0.50400000000000000 0.07890
##  [33,]  29 0.51700000000000002 0.07530
##  [34,]  31 0.53100000000000003 0.07190
##  [35,]  38 0.54500000000000004 0.06860
##  [36,]  41 0.56200000000000006 0.06550
##  [37,]  43 0.57799999999999996 0.06250
##  [38,]  46 0.59399999999999997 0.05970
##  [39,]  47 0.60899999999999999 0.05700
##  [40,]  47 0.62400000000000000 0.05440
##  [41,]  49 0.63700000000000001 0.05190
##  [42,]  51 0.65000000000000002 0.04950
##  [43,]  50 0.66300000000000003 0.04730
##  [44,]  56 0.67400000000000004 0.04510
##  [45,]  59 0.68600000000000005 0.04310
##  [46,]  64 0.69699999999999995 0.04110
##  [47,]  65 0.70799999999999996 0.03930
##  [48,]  68 0.71899999999999997 0.03750
##  [49,]  74 0.73099999999999998 0.03580
##  [50,]  75 0.74299999999999999 0.03410
##  [51,]  78 0.75400000000000000 0.03260
##  [52,]  84 0.76400000000000001 0.03110
##  [53,]  87 0.77500000000000002 0.02970
##  [54,]  87 0.78600000000000003 0.02830
##  [55,]  87 0.79500000000000004 0.02710
##  [56,]  89 0.80500000000000005 0.02580
##  [57,]  92 0.81399999999999995 0.02470
##  [58,]  94 0.82299999999999995 0.02350
##  [59,]  99 0.83099999999999996 0.02250
##  [60,] 102 0.83899999999999997 0.02140
##  [61,] 109 0.84599999999999997 0.02050
##  [62,] 110 0.85399999999999998 0.01950
##  [63,] 117 0.86099999999999999 0.01860
##  [64,] 118 0.86699999999999999 0.01780
##  [65,] 121 0.87300000000000000 0.01700
##  [66,] 119 0.87900000000000000 0.01620
##  [67,] 119 0.88500000000000001 0.01550
##  [68,] 123 0.89000000000000001 0.01480
##  [69,] 125 0.89500000000000002 0.01410
##  [70,] 126 0.90000000000000002 0.01350
##  [71,] 128 0.90400000000000003 0.01290
##  [72,] 131 0.90900000000000003 0.01230
##  [73,] 132 0.91300000000000003 0.01170
##  [74,] 135 0.91700000000000004 0.01120
##  [75,] 135 0.92100000000000004 0.01070
##  [76,] 138 0.92400000000000004 0.01020
##  [77,] 139 0.92800000000000005 0.00972
##  [78,] 139 0.93100000000000005 0.00928
##  [79,] 142 0.93400000000000005 0.00886
##  [80,] 142 0.93700000000000006 0.00846
##  [81,] 143 0.93999999999999995 0.00807
##  [82,] 143 0.94299999999999995 0.00771
##  [83,] 145 0.94599999999999995 0.00736
##  [84,] 146 0.94799999999999995 0.00702
##  [85,] 149 0.95099999999999996 0.00670
##  [86,] 150 0.95299999999999996 0.00640
##  [87,] 151 0.95499999999999996 0.00611
##  [88,] 152 0.95699999999999996 0.00583
##  [89,] 153 0.95899999999999996 0.00556
##  [90,] 152 0.96099999999999997 0.00531
##  [91,] 154 0.96299999999999997 0.00507
##  [92,] 153 0.96399999999999997 0.00484
##  [93,] 150 0.96599999999999997 0.00462
##  [94,] 150 0.96699999999999997 0.00441
##  [95,] 151 0.96899999999999997 0.00421
##  [96,] 152 0.96999999999999997 0.00402
##  [97,] 154 0.97099999999999997 0.00384
##  [98,] 159 0.97299999999999998 0.00366
##  [99,] 162 0.97399999999999998 0.00349
## [100,] 165 0.97499999999999998 0.00334
### High penalty -> few variables enter the model
coef2(fit_lasso_restringido, s = 0.1)
##                            coef        vars
## arab        -40.000000000000000        arab
## bahía       -40.000000000000000       bahía
## blanca      -40.000000000000000      blanca
## inmuebl     -40.000000000000000     inmuebl
## kong        -40.000000000000000        kong
## malta       -40.000000000000000       malta
## posada      -40.000000000000000      posada
## rusa        -40.000000000000000        rusa
## bajo        -38.295836923808764        bajo
## aruba       -35.915038308642110       aruba
## costa       -12.921127953665563       costa
## plata       -10.792214126457910       plata
## capit        -6.437598546805154       capit
## vez           6.198014336240986         vez
## partido       5.453426088216941     partido
## equipo        2.891886140663674      equipo
## fecha         0.777579368535495       fecha
## (Intercept)   0.414760490959842 (Intercept)
## jugador       0.220952524395958     jugador
## bahama       -0.000000000004021      bahama
### Lower penalty -> more variables, some of them more suspicious
coef2(fit_lasso_restringido, s = 0.01)
##                              coef         vars
## algún         40.0000000000000000        algún
## alta         -40.0000000000000000         alta
## arab         -40.0000000000000000         arab
## aruba        -40.0000000000000000        aruba
## bahama       -40.0000000000000000       bahama
## bahía        -40.0000000000000000        bahía
## bajo         -40.0000000000000000         bajo
## bangladesh   -40.0000000000000000   bangladesh
## belic        -40.0000000000000000        belic
## blanca       -40.0000000000000000       blanca
## cierto        40.0000000000000000       cierto
## corrient     -40.0000000000000000     corrient
## ello          40.0000000000000000         ello
## ideal        -40.0000000000000000        ideal
## inmuebl      -40.0000000000000000      inmuebl
## interesa     -40.0000000000000000     interesa
## kong         -40.0000000000000000         kong
## país         -40.0000000000000000         país
## posada       -40.0000000000000000       posada
## publicacion  -40.0000000000000000  publicacion
## vez           40.0000000000000000          vez
## plata        -39.7115779397192910        plata
## argelia      -39.3244818426938423      argelia
## capit        -38.7215865344786039        capit
## caico        -37.9405150995484917        caico
## costa        -36.4032445362836015        costa
## fundament    -35.4146958971711143    fundament
## malta        -34.1101071403390819        malta
## miedo        -30.0349328315879873        miedo
## queda         28.6462681745811771        queda
## vacacion     -28.4958461219429218     vacacion
## mar          -28.0879662039648395          mar
## individual   -26.8367224850287229   individual
## domínica     -26.6360454822746675     domínica
## unido        -26.4015471166369267        unido
## cameroon     -25.8919860297048778     cameroon
## defensa       25.5368565977205577      defensa
## pesar         25.4035284636747392        pesar
## ser           24.3318228699038528          ser
## rico         -24.0761921174218472         rico
## caja         -23.3830153656451927         caja
## sri          -20.8497999146224267          sri
## simpl        -20.7486706670411039        simpl
## días          18.7250018262542852         días
## venta        -18.2530495562963431        venta
## último        17.7050419534127634       último
## entrega      -17.5963054359478335      entrega
## servicio     -17.2117802320654931     servicio
## proceso      -16.5965401198227198      proceso
## entrenador    16.5021452224543026   entrenador
## guía         -16.4842499953217043         guía
## bielorusia   -16.1398499662682440   bielorusia
## rosario      -15.6780795176210876      rosario
## fecha         15.0639183885254226        fecha
## pago         -14.2126106821085738         pago
## cualquier    -14.1745267525745717    cualquier
## tel          -14.0002214927757898          tel
## libro        -13.6559203971825127        libro
## oferta       -13.2065166015854896       oferta
## hace          13.1847628464904396         hace
## deport        13.1773367543195832       deport
## distancia    -12.5552906624415179    distancia
## chile        -12.5022036124949860        chile
## lui          -12.3511906614605742          lui
## aviso        -12.1804892329072061        aviso
## gran         -11.8428705368588751         gran
## club          11.7592554479427029         club
## auto         -11.5381869448275598         auto
## producto     -11.2767277517273499     producto
## intensidad   -11.0958134112722071   intensidad
## partido        9.6318781943995901      partido
## kazajstán     -9.4913699822695534    kazajstán
## meno           9.4290720546113338         meno
## palabra        9.3856372962717565      palabra
## vece           8.9348721831562976         vece
## pintura       -8.5943828653027285      pintura
## equipo         8.4339588517743245       equipo
## asociación     8.3692501533964183   asociación
## luego          8.1117875010926639        luego
## feder         -8.1030471730490845        feder
## instrumento   -7.9883596999168782  instrumento
## dueño         -7.9285726720505494        dueño
## dado           7.7056895867672681         dado
## rusa          -7.1540430381113609         rusa
## camino         6.9663595833229266       camino
## sabe           6.7098227026790687         sabe
## toda          -6.3766342918853915         toda
## torneo         6.3368197721397603       torneo
## encuentro      6.2763065925231096    encuentro
## electrónico    5.8026382871484028  electrónico
## diez          -5.7643438065540931         diez
## técnica        5.2391747909575050      técnica
## nivel          5.0853556807237190        nivel
## cancha         4.9788800016662229       cancha
## segunda        4.7945701004247727      segunda
## mucha          4.5425686585537441        mucha
## patada         4.2838233667380869       patada
## combat         4.0057406907747666       combat
## etapa         -3.7925135887466785        etapa
## nacion         3.5331911219342551       nacion
## compra        -3.3942754519704268       compra
## base           3.3246379667888331         base
## final          3.2744576424561931        final
## ley            3.1419418379692163          ley
## calidad       -2.9792382962834676      calidad
## espero        -2.9461677327819622       espero
## condicion     -2.7465977932120822    condicion
## continuación   2.6491206769659019 continuación
## clima         -2.4777412343509280        clima
## usado         -2.4674865164601760        usado
## client        -2.3873377754651370       client
## misma          2.3593013803289695        misma
## alumno         1.9137208003405093       alumno
## (Intercept)    1.7157654083836689  (Intercept)
## mercado       -1.6992392207577849      mercado
## jamaica       -1.6575610235780958      jamaica
## jugador        1.6557140772373300      jugador
## quinta        -1.6459334555864686       quinta
## mientra        1.5990925541996701      mientra
## deportiva      1.3484325709989946    deportiva
## cocina        -1.3356173954464619       cocina
## modelo        -1.2979309564655490       modelo
## san           -1.2940609101705638          san
## carga         -1.2642013811668580        carga
## cuota         -1.0097675861915947        cuota
## deportivo      0.6651721149137295    deportivo
## obra          -0.4810210406210172         obra
## mauritius     -0.3862723154666317    mauritius
## localidad      0.3835546931199125    localidad
## principado    -0.2523150907928940   principado
## comentario     0.2482785762389935   comentario
## motor         -0.2302880575478789        motor
## eje           -0.2027238572069245          eje
## mundial        0.1407378255026032      mundial
## super         -0.0404909901014335        super
## alguien        0.0373751697253266      alguien
## pakistan      -0.0351963389295912     pakistan
## emirato       -0.0114864800815509      emirato
## botswana      -0.0000000000008630     botswana
## lanka         -0.0000000000004657        lanka
### Plots
# Restricted lasso fit, default x-axis
plot(fit_lasso_restringido)

plot of chunk unnamed-chunk-4

# Same fit with xvar = "dev" on the x-axis and labelled curves
plot(fit_lasso_restringido, xvar = "dev", label = TRUE)

plot of chunk unnamed-chunk-4

# Same fit with xvar = 'lambda' on the x-axis and labelled curves
plot(fit_lasso_restringido, xvar = 'lambda', label = TRUE)

plot of chunk unnamed-chunk-4

# Lambdas - Devianza
# fit_res <- as.data.frame(cbind(fit_lasso$df, fit_lasso$dev, fit_lasso$lambda))
# names(fit_res) <- c("Q Variables (DF)", "Devianza", "Lambda")
# head(fit_res, 100)

# Class predictions on the test set for every lambda of the restricted fit
# (one column per lambda)
pred_mat <- predict(fit_lasso_restringido, as.matrix(test[, predictors]),
                    type = "class")

#' Accuracy of a confusion matrix.
#'
#' @param mat_confusion A square confusion matrix / table.
#' @return Sum of the diagonal divided by the sum of all cells.
accuracy <- function(mat_confusion) sum(diag(mat_confusion)) / sum(mat_confusion)

#' Metrics for one vector of predicted classes
#'
#' Cross-tabulates the predictions (rows) against the observed classes
#' (columns) and computes accuracy, sensitivity and specificity. The first
#' class of the table is the one used for sensitivity, the second one for
#' specificity.
#'
#' @param x Vector of predicted classes (not a confusion matrix).
#' @param referencia Observed classes, same length as `x`. Defaults to
#'   `test$clase`.
#' @return A list with `tbl` (the confusion table), `accuracy`,
#'   `sensitividad` and `especificidad`. When the table is not at least
#'   2 x 2 (e.g. the model predicts a single class) the three metrics are
#'   `NA`, under the same names, so callers can extract them uniformly.
eval_modelos <- function(x, referencia = test$clase) {
  tbl <- table(x, referencia, dnn = c("x", ""))
  if (nrow(tbl) < 2 || ncol(tbl) < 2) {
    return(list(tbl = tbl, accuracy = NA_real_, sensitividad = NA_real_,
                especificidad = NA_real_))
  }
  list(
    tbl = tbl,
    accuracy = accuracy(tbl),
    sensitividad = tbl[1, 1] / sum(tbl[, 1]),
    especificidad = tbl[2, 2] / sum(tbl[, 2])
  )
}

# One list of metrics per lambda (i.e. per column of pred_mat)
res_fit_lasso <- apply(pred_mat, 2, eval_modelos)
metricas <- c("sensitividad", "especificidad", "accuracy")
met_lasso <- lapply(metricas, function(m) {
  vapply(res_fit_lasso, function(res) res[[m]], numeric(1), USE.NAMES = FALSE)
})

met_lasso <- as.data.frame(do.call("cbind", met_lasso))
names(met_lasso) <- metricas
# met_lasso$lambda_fact <- as.factor(row.names(met_lasso))

# Pair every row with the lambda of the model that produced the predictions,
# then drop the lambdas whose metrics are undefined. The rows to drop are
# derived from the data instead of assuming they are the first ten lambdas.
stopifnot(length(fit_lasso_restringido$lambda) == nrow(met_lasso))
met_lasso$lambda <- fit_lasso_restringido$lambda
met_lasso <- met_lasso[complete.cases(met_lasso), ]
rownames(met_lasso) <- NULL
melt_met_lasso <- reshape2::melt(met_lasso, id.vars = c("lambda"))

# Plot: each metric as a function of lambda
ggplot(
  melt_met_lasso,
  aes(x = lambda, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  scale_x_continuous(breaks = seq(0, 1, 0.03)) +
  labs(title = "Metricas - Regresion Lasso")

plot of chunk unnamed-chunk-4

# The variables in the model at lambda = 0.03
coef2(fit_lasso_restringido, s = 0.03)
##                                coef          vars
## arab          -40.00000000000000000          arab
## bahía         -40.00000000000000000         bahía
## bajo          -40.00000000000000000          bajo
## blanca        -40.00000000000000000        blanca
## cierto         40.00000000000000000        cierto
## ideal         -40.00000000000000000         ideal
## inmuebl       -40.00000000000000000       inmuebl
## kong          -40.00000000000000000          kong
## malta         -40.00000000000000000         malta
## posada        -40.00000000000000000        posada
## publicacion   -40.00000000000000000   publicacion
## rusa          -40.00000000000000000          rusa
## unido         -40.00000000000000000         unido
## bahama        -39.99932691196014645        bahama
## plata         -31.47006844801154202         plata
## vez            31.39657308801278290           vez
## aruba         -28.86289033989351793         aruba
## país          -24.27169243110974151          país
## interesa      -23.91837778873620124      interesa
## costa         -23.37709926358586543         costa
## capit         -22.55205002984466489         capit
## alta          -20.96477939141379210          alta
## corrient      -20.88270814553446186      corrient
## último         20.74489936228430409        último
## algún          16.19909795564559829         algún
## pesar          14.10384641040603171         pesar
## bangladesh    -13.83897991897264568    bangladesh
## fecha          12.27047632768618968         fecha
## venta         -11.48763130937532040         venta
## hace           10.89763731283235693          hace
## pago          -10.10391276294430085          pago
## ser             9.18648550476234149           ser
## partido         9.17043168395568387       partido
## oferta         -9.05064084211478104        oferta
## servicio       -8.65003664686511975      servicio
## club            7.63451759624667581          club
## deport          7.56932749994904874        deport
## equipo          7.49195367520923394        equipo
## entrega        -7.31730279501204528       entrega
## entrenador      6.39854648784200020    entrenador
## ello            6.36382908806739334          ello
## auto           -6.31382415383148032          auto
## tel            -6.29514525562471405           tel
## rosario        -6.29156263964139484       rosario
## rival           6.22316474436080291         rival
## jugador         6.11083681715957105       jugador
## aviso          -6.11038975499248682         aviso
## meno            6.08466992202060286          meno
## individual     -5.76553361007994347    individual
## defensa         5.24474675166504412       defensa
## feder          -4.92263228799246555         feder
## caja           -4.62552161933269002          caja
## vece            4.55522001936098331          vece
## mar            -4.43298253029397848           mar
## luego           4.22515330099807151         luego
## torneo          4.05075003181442117        torneo
## proceso        -3.93906094446558308       proceso
## final           3.75793259441874294         final
## aunqu           3.69605739024292257         aunqu
## motor          -3.41768375285612613         motor
## cualquier      -3.27709072823563652     cualquier
## producto       -3.18256359521445908      producto
## queda           2.67037326206913583         queda
## encuentro       2.26757913934614219     encuentro
## técnica         2.24562410603837304       técnica
## calidad        -1.98265321565457220       calidad
## segunda         1.98135347009402185       segunda
## simpl          -1.91399943899542291         simpl
## cancha          1.72492077613540973        cancha
## días            1.55464602520842243          días
## toda           -1.41646311949624937          toda
## asociación      1.38038058929948915    asociación
## electrónico     0.95510712322789582   electrónico
## misma           0.93762672140637515         misma
## (Intercept)     0.85655940793664109   (Intercept)
## guía           -0.79937605581556526          guía
## instrumento    -0.61603912819121698   instrumento
## entrenamiento   0.56472045572571761 entrenamiento
## sol            -0.36362747720491256           sol
## camino          0.31789497298592179        camino
## compra         -0.20761590593549836        compra
## comentario      0.01804591056237849    comentario
## cameroon       -0.01536584184296038      cameroon
## mientra         0.00829615567997205       mientra
## caico          -0.00000000025601848         caico
## belic          -0.00000000000112899         belic
## bielorusia     -0.00000000000001792    bielorusia
## botswana       -0.00000000000001792      botswana

Comparación entre lasso, ridge y elastic-net - Validación cruzada

Regresión Lasso

# 3-fold cross-validated lasso (alpha = 1) measured by AUC.
# Argument names are written in full (nfolds, type.measure) so the call does
# not depend on partial argument matching.
set.seed(666)
cvfit_lasso <- cv.glmnet(x, y, family = "binomial", alpha = 1, nfolds = 3,
                         parallel = TRUE, standardize = TRUE,
                         type.measure = "auc")
plot(cvfit_lasso)

plot of chunk unnamed-chunk-5

Resultados Lasso

# Test-set class predictions at lambda.min, and their confusion table
pred_lasso <- predict(
  cvfit_lasso,
  newx = as.matrix(test[, predictors]),
  s = "lambda.min",
  type = "class"
)
confusionMatrix(pred_lasso, test$clase)$table
##           Reference
## Prediction shopping sports
##   shopping      228      7
##   sports         20    152

Ridge - Incluye a todas las variables en la estimación

# 3-fold cross-validated ridge (alpha = 0) measured by AUC.
# Argument names are written in full (nfolds, type.measure) so the call does
# not depend on partial argument matching.
set.seed(666)
cvfit_ridge <- cv.glmnet(x, y, family = "binomial", alpha = 0, nfolds = 3,
                         parallel = TRUE, standardize = TRUE,
                         type.measure = "auc")
plot(cvfit_ridge)

plot of chunk unnamed-chunk-7

Resultados Ridge

# Test-set class predictions at lambda.min, and their confusion table
pred_ridge <- predict(
  cvfit_ridge,
  newx = as.matrix(test[, predictors]),
  s = "lambda.min",
  type = "class"
)
confusionMatrix(pred_ridge, test$clase)$table
##           Reference
## Prediction shopping sports
##   shopping      225      8
##   sports         23    151

Elastic - Net

Alpha 0.2 - más cerca de ridge

# 3-fold cross-validated elastic net with alpha = 0.2, measured by AUC.
# Argument names are written in full (nfolds, type.measure) so the call does
# not depend on partial argument matching.
set.seed(666)
cvfit_elastic_02 <- cv.glmnet(x, y, family = "binomial", alpha = 0.2,
                              nfolds = 3, parallel = TRUE, standardize = TRUE,
                              type.measure = "auc")
plot(cvfit_elastic_02)

plot of chunk unnamed-chunk-9

Resultados Elastic-Net

# Test-set class predictions at lambda.min, and their confusion table
pred_enet_02 <- predict(
  cvfit_elastic_02,
  newx = as.matrix(test[, predictors]),
  s = "lambda.min",
  type = "class"
)
confusionMatrix(pred_enet_02, test$clase)$table
##           Reference
## Prediction shopping sports
##   shopping      236      7
##   sports         12    152

Elastic - Net

Alpha 0.8 - más cerca de lasso que ridge

# 3-fold cross-validated elastic net with alpha = 0.8, measured by AUC.
# Argument names are written in full (nfolds, type.measure) so the call does
# not depend on partial argument matching.
set.seed(666)
cvfit_elastic_08 <- cv.glmnet(x, y, family = "binomial", alpha = 0.8,
                              nfolds = 3, parallel = TRUE, standardize = TRUE,
                              type.measure = "auc")
plot(cvfit_elastic_08)

plot of chunk unnamed-chunk-11

Resultados Elastic-Net

# Test-set class predictions at lambda.min, and their confusion table
pred_enet_08 <- predict(
  cvfit_elastic_08,
  newx = as.matrix(test[, predictors]),
  s = "lambda.min",
  type = "class"
)
confusionMatrix(pred_enet_08, test$clase)$table
##           Reference
## Prediction shopping sports
##   shopping      228      7
##   sports         20    152

Comparación de modelos

#' Cross-validated elastic net for one alpha, plus test-set predictions
#'
#' Fits `cv.glmnet()` on the global training data `x` / `y` (binomial family,
#' 3 folds, AUC as the cross-validation measure) and predicts the class of
#' every row of the global `test` set at `lambda.min`.
#'
#' Note: the function calls `set.seed(666)`, so it resets the global RNG
#' state on every call.
#'
#' @param alpha Elastic-net mixing parameter passed to `cv.glmnet()`.
#' @return A list with `fit` (the cv.glmnet object) and `pred` (the predicted
#'   classes for `test`).
train_alpha <- function(alpha) {
  set.seed(666)
  # Full argument names (nfolds, type.measure) avoid relying on partial
  # argument matching.
  fit <- cv.glmnet(x, y, family = "binomial", alpha = alpha, nfolds = 3,
                   parallel = TRUE, standardize = TRUE, type.measure = "auc")
  pred <- predict(fit, newx = as.matrix(test[, predictors]),
                  s = "lambda.min", type = "class")
  list(fit = fit, pred = pred)
}

# Grid of alpha values to model
alphas <- seq(0, 1, 0.01)

# Run train_alpha() on every alpha, then split fits and predictions apart
enets <- lapply(alphas, train_alpha)
fits <- lapply(enets, function(e) e[["fit"]])
preds <- lapply(enets, function(e) e[["pred"]])

# AUC (cross-validated measure at lambda.min) and number of non-zero
# coefficients of each model.
# NOTE(review): the count runs over every row returned by coef(), which
# appears to include the intercept - confirm that is intended.
AUC <- sapply(fits, function(f) f$cvm[f$lambda == f$lambda.min])
Q_VARIABLES <- sapply(
  fits,
  function(f) sum(coef(f, s = "lambda.min") != 0)
)

# Test-set metrics for each alpha.
# Each element holds: tbl (confusion table, predictions in rows), accuracy,
# sensitividad (recall of the first class), especificidad, precision and F1.
resultados <- lapply(preds, function(x) {
  # Fix both classes as row levels so the table stays 2 x 2 even when a model
  # predicts a single class (tbl[2, 2] would otherwise be out of bounds).
  clases <- levels(factor(test$clase))
  tbl <- table(factor(x, levels = clases), test$clase, dnn = c("x", ""))

  acc <- accuracy(tbl)
  sensitividad <- tbl[1, 1] / sum(tbl[, 1])
  especificidad <- tbl[2, 2] / sum(tbl[, 2])
  # Precision: share of the first-class predictions that are correct.
  precision <- tbl[1, 1] / sum(tbl[1, ])
  # F1 is the harmonic mean of precision and recall (sensitividad), not of
  # sensitivity and specificity. It is NaN when the first class is never
  # predicted.
  F1 <- (2 * precision * sensitividad) / (precision + sensitividad)

  list(tbl = tbl, accuracy = acc, sensitividad = sensitividad,
       especificidad = especificidad, precision = precision, F1 = F1)
})

# Results of all the models, one row per alpha
modelos <- data.frame(
  modelo = alphas,
  accuracy = vapply(resultados, function(r) r[["accuracy"]], numeric(1)),
  sensitividad = vapply(resultados, function(r) r[["sensitividad"]], numeric(1)),
  especificidad = vapply(resultados, function(r) r[["especificidad"]], numeric(1)),
  F1 = vapply(resultados, function(r) r[["F1"]], numeric(1)),
  AUC = AUC,
  Q_VARIABLES = Q_VARIABLES
)

# Long format for plotting: one row per (alpha, metric)
melt_modelos <- reshape2::melt(modelos, id.vars = "modelo")

# AUC and number of variables as a function of alpha.
# labs() receives name-value pairs directly (as in the lasso metrics plot)
# instead of a single list wrapped around them.
ggplot(melt_modelos[melt_modelos$variable %in% c('AUC', 'Q_VARIABLES'), ],
       aes(x = modelo, y = value, color = variable, group = variable)) +
  geom_line(size = 1.2) +
  facet_grid(variable ~ ., scales = "free") +
  scale_x_continuous(breaks = seq(0, 1, 0.05)) +
  labs(title = 'Elastic Net - AUC - Cantidad de Variables',
       x = "Alpha", y = "AUC - Cantidad de Variables")

plot of chunk unnamed-chunk-13

# Specificity and sensitivity as a function of alpha.
# labs() receives name-value pairs directly (as in the lasso metrics plot)
# instead of a single list wrapped around them.
ggplot(melt_modelos[melt_modelos$variable %in% c('especificidad', 'sensitividad'), ],
       aes(x = modelo, y = value, color = variable, group = variable)) +
  geom_line(size = 1.2) +
  scale_x_continuous(breaks = seq(0, 1, 0.1)) +
  labs(title = 'Elastic Net - Especificidad - Sensitividad',
       x = "Alpha", y = "Metricas")

plot of chunk unnamed-chunk-13

# Everything together: one panel per metric, each with its own scales
ggplot(
  melt_modelos,
  aes(x = modelo, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  facet_wrap(~ variable, scales = "free")

plot of chunk unnamed-chunk-13

head(modelos, n = 50)
##    modelo accuracy sensitividad especificidad     F1    AUC Q_VARIABLES
## 1    0.00   0.9238       0.9073        0.9497 0.9280 0.9855        2078
## 2    0.01   0.9509       0.9435        0.9623 0.9528 0.9890        1428
## 3    0.02   0.9533       0.9476        0.9623 0.9549 0.9895        1119
## 4    0.03   0.9533       0.9476        0.9623 0.9549 0.9895         932
## 5    0.04   0.9533       0.9476        0.9623 0.9549 0.9899         845
## 6    0.05   0.9533       0.9516        0.9560 0.9538 0.9899         807
## 7    0.06   0.9558       0.9556        0.9560 0.9558 0.9896         699
## 8    0.07   0.9558       0.9556        0.9560 0.9558 0.9896         651
## 9    0.08   0.9558       0.9556        0.9560 0.9558 0.9896         638
## 10   0.09   0.9558       0.9556        0.9560 0.9558 0.9896         621
## 11   0.10   0.9558       0.9556        0.9560 0.9558 0.9893         607
## 12   0.11   0.9558       0.9556        0.9560 0.9558 0.9887         572
## 13   0.12   0.9558       0.9556        0.9560 0.9558 0.9877         532
## 14   0.13   0.9582       0.9556        0.9623 0.9589 0.9872         512
## 15   0.14   0.9582       0.9556        0.9623 0.9589 0.9872         511
## 16   0.15   0.9582       0.9556        0.9623 0.9589 0.9868         495
## 17   0.16   0.9533       0.9516        0.9560 0.9538 0.9864         419
## 18   0.17   0.9558       0.9556        0.9560 0.9558 0.9862         408
## 19   0.18   0.9558       0.9556        0.9560 0.9558 0.9862         413
## 20   0.19   0.9558       0.9556        0.9560 0.9558 0.9859         412
## 21   0.20   0.9533       0.9516        0.9560 0.9538 0.9856         383
## 22   0.21   0.9533       0.9516        0.9560 0.9538 0.9855         381
## 23   0.22   0.9533       0.9516        0.9560 0.9538 0.9853         377
## 24   0.23   0.9533       0.9516        0.9560 0.9538 0.9851         367
## 25   0.24   0.9533       0.9516        0.9560 0.9538 0.9851         369
## 26   0.25   0.9533       0.9516        0.9560 0.9538 0.9849         369
## 27   0.26   0.9509       0.9476        0.9560 0.9518 0.9845         324
## 28   0.27   0.9509       0.9476        0.9560 0.9518 0.9844         325
## 29   0.28   0.9509       0.9476        0.9560 0.9518 0.9843         322
## 30   0.29   0.9484       0.9476        0.9497 0.9486 0.9843         307
## 31   0.30   0.9484       0.9476        0.9497 0.9486 0.9841         305
## 32   0.31   0.9484       0.9476        0.9497 0.9486 0.9841         288
## 33   0.32   0.9509       0.9476        0.9560 0.9518 0.9839         319
## 34   0.33   0.9509       0.9476        0.9560 0.9518 0.9838         320
## 35   0.34   0.9484       0.9476        0.9497 0.9486 0.9835         293
## 36   0.35   0.9484       0.9476        0.9497 0.9486 0.9830         289
## 37   0.36   0.9459       0.9435        0.9497 0.9466 0.9832         278
## 38   0.37   0.9459       0.9435        0.9497 0.9466 0.9826         283
## 39   0.38   0.9435       0.9395        0.9497 0.9446 0.9826         270
## 40   0.39   0.9410       0.9355        0.9497 0.9425 0.9821         253
## 41   0.40   0.9410       0.9355        0.9497 0.9425 0.9818         257
## 42   0.41   0.9386       0.9274        0.9560 0.9415 0.9815         174
## 43   0.42   0.9386       0.9274        0.9560 0.9415 0.9811         172
## 44   0.43   0.9386       0.9274        0.9560 0.9415 0.9809         177
## 45   0.44   0.9386       0.9274        0.9560 0.9415 0.9809         177
## 46   0.45   0.9410       0.9315        0.9560 0.9436 0.9805         179
## 47   0.46   0.9435       0.9355        0.9560 0.9456 0.9804         202
## 48   0.47   0.9435       0.9355        0.9560 0.9456 0.9802         200
## 49   0.48   0.9410       0.9315        0.9560 0.9436 0.9800         169
## 50   0.49   0.9386       0.9274        0.9560 0.9415 0.9797         161
# Show the last 50 rows of the `modelos` results table
# (one row per evaluated model, with its metrics and variable count).
tail(x = modelos, n = 50L)
##     modelo accuracy sensitividad especificidad     F1    AUC Q_VARIABLES
## 52    0.51   0.9386       0.9274         0.956 0.9415 0.9794         159
## 53    0.52   0.9337       0.9194         0.956 0.9373 0.9789         148
## 54    0.53   0.9337       0.9194         0.956 0.9373 0.9790         148
## 55    0.54   0.9337       0.9194         0.956 0.9373 0.9789         146
## 56    0.55   0.9337       0.9194         0.956 0.9373 0.9788         141
## 57    0.56   0.9361       0.9234         0.956 0.9394 0.9786         146
## 58    0.57   0.9337       0.9194         0.956 0.9373 0.9784         138
## 59    0.58   0.9337       0.9194         0.956 0.9373 0.9781         137
## 60    0.59   0.9337       0.9194         0.956 0.9373 0.9783         136
## 61    0.60   0.9337       0.9194         0.956 0.9373 0.9781         135
## 62    0.61   0.9337       0.9194         0.956 0.9373 0.9779         134
## 63    0.62   0.9337       0.9194         0.956 0.9373 0.9777         131
## 64    0.63   0.9312       0.9153         0.956 0.9352 0.9773         129
## 65    0.64   0.9337       0.9194         0.956 0.9373 0.9771         133
## 66    0.65   0.9337       0.9194         0.956 0.9373 0.9768         133
## 67    0.66   0.9337       0.9194         0.956 0.9373 0.9765         132
## 68    0.67   0.9337       0.9194         0.956 0.9373 0.9763         134
## 69    0.68   0.9337       0.9194         0.956 0.9373 0.9761         131
## 70    0.69   0.9337       0.9194         0.956 0.9373 0.9758         131
## 71    0.70   0.9337       0.9194         0.956 0.9373 0.9755         128
## 72    0.71   0.9337       0.9194         0.956 0.9373 0.9756         129
## 73    0.72   0.9337       0.9194         0.956 0.9373 0.9754         127
## 74    0.73   0.9337       0.9194         0.956 0.9373 0.9754         122
## 75    0.74   0.9337       0.9194         0.956 0.9373 0.9751         120
## 76    0.75   0.9337       0.9194         0.956 0.9373 0.9749         120
## 77    0.76   0.9337       0.9194         0.956 0.9373 0.9745         119
## 78    0.77   0.9337       0.9194         0.956 0.9373 0.9744         118
## 79    0.78   0.9337       0.9194         0.956 0.9373 0.9741         115
## 80    0.79   0.9337       0.9194         0.956 0.9373 0.9738         114
## 81    0.80   0.9337       0.9194         0.956 0.9373 0.9734         107
## 82    0.81   0.9337       0.9194         0.956 0.9373 0.9734         108
## 83    0.82   0.9337       0.9194         0.956 0.9373 0.9729         105
## 84    0.83   0.9361       0.9234         0.956 0.9394 0.9727          97
## 85    0.84   0.9361       0.9234         0.956 0.9394 0.9727          89
## 86    0.85   0.9361       0.9234         0.956 0.9394 0.9725          84
## 87    0.86   0.9361       0.9234         0.956 0.9394 0.9723          83
## 88    0.87   0.9361       0.9234         0.956 0.9394 0.9718          85
## 89    0.88   0.9361       0.9234         0.956 0.9394 0.9717          80
## 90    0.89   0.9361       0.9234         0.956 0.9394 0.9716          79
## 91    0.90   0.9361       0.9234         0.956 0.9394 0.9714          79
## 92    0.91   0.9361       0.9234         0.956 0.9394 0.9712          79
## 93    0.92   0.9361       0.9234         0.956 0.9394 0.9708          79
## 94    0.93   0.9361       0.9234         0.956 0.9394 0.9704          78
## 95    0.94   0.9361       0.9234         0.956 0.9394 0.9703          78
## 96    0.95   0.9361       0.9234         0.956 0.9394 0.9701          78
## 97    0.96   0.9361       0.9234         0.956 0.9394 0.9702          77
## 98    0.97   0.9337       0.9194         0.956 0.9373 0.9701          77
## 99    0.98   0.9337       0.9194         0.956 0.9373 0.9703          77
## 100   0.99   0.9337       0.9194         0.956 0.9373 0.9704          75
## 101   1.00   0.9337       0.9194         0.956 0.9373 0.9703          74