Leo el archivo y cargo los paquetes
# Turn off scientific notation in printed numbers.
options(scipen = 666)
# Set a seed so the random sample of names below is reproducible.
set.seed(666)
# Read the data file.
d <- read.table("web_content.txt")
## Dimensions (sparse matrix)
dim(d)
## [1] 816 2078
# What are we trying to predict?
# Whether a web page belongs to the sports or shopping category, based on its content (the text on the page)
table(d[["clase"]])
##
## shopping sports
## 497 319
## Variable names
colnames(d)[sample(2000, 100)]
## [1] "presenta" "comunicación" "turismo" "conferencia"
## [5] "espera" "pinamar" "tucumán" "ingreso"
## [9] "adecuado" "debut" "preparación" "adrogué"
## [13] "belgrano" "carrera" "constitución" "puro"
## [17] "amant" "santo" "iba" "hermano"
## [21] "trébol" "marcar" "amba" "cargo"
## [25] "pudo" "defend" "amplio" "mas"
## [29] "leer" "reino" "hermosa" "feliciano"
## [33] "próximo" "instituto" "método" "importancia"
## [37] "demostró" "segunda" "precio" "lesion"
## [41] "cuanto" "jorg" "busco" "tomé"
## [45] "salió" "peñarol" "estonia" "manual"
## [49] "institucion" "tortuguita" "aseguró" "bahÃa"
## [53] "campana" "perdido" "usado" "noticia"
## [57] "usa" "roqu" "banfield" "amarilla"
## [61] "pleno" "propia" "dar" "mÃnima"
## [65] "seguro" "deberá" "trabaja" "salto"
## [69] "llevó" "sauc" "maestro" "ducha"
## [73] "morón" "reloj" "aruba" "posicion"
## [77] "letonia" "antiguo" "tanta" "turno"
## [81] "peso" "pista" "dinamarca" "quedan"
## [85] "dio" "pequeña" "suelo" "chipr"
## [89] "nota" "recuerda" "trayectoria" "ausencia"
## [93] "varia" "libro" "isidro" "blanco"
## [97] "parec" "gesel" "mejÃa" "pasada"
## First rows and columns of the explanatory variables
d[seq_len(5), seq_len(10)]
## abajo abierta abierto abril abrió abrir absoluta acá acaba
## 1 0.004236 0 0.000000 0 0 0.004698 0 0 0.005393
## 2 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## 3 0.000000 0 0.009674 0 0 0.000000 0 0 0.000000
## 4 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## 5 0.000000 0 0.000000 0 0 0.000000 0 0 0.000000
## academia
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
### Instalar los paquetes que digan FALSE
# En paralelo no anda en windows
library(caret)
## Warning: package 'caret' was built under R version 3.1.1
## Loading required package: lattice
## Loading required package: ggplot2
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 1.9-8
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(corrplot)
# Register the backend used for parallel processing (cores = 4).
registerDoParallel(cores = 4)
# cor <- cor(d[,names(d)[!names(d) %in% c('clase')]])
# corrplot(cor)
Ejemplo del path usando regresión lasso
## Example of the regularization path using lasso regression
set.seed(666)
fit_lasso <- glmnet(
  x = x,
  y = y,
  family = "binomial",
  alpha = 1,
  standardize = TRUE
)
# Coefficients against deviance
plot(fit_lasso, xvar = "dev", label = TRUE)

# elijo 32 variables (lambda = 0.059660)
# Non-zero coefficients of a fitted model at penalty `s`, largest first.
#
# Args:
#   fit: fitted model object for which `coef(fit, s = s)` returns a
#        one-column coefficient matrix (only the first column is used).
#   s:   penalty value forwarded to `coef()`.
#
# Returns:
#   A data.frame with columns `coef` (coefficient value) and `vars`
#   (variable name), one row per non-zero coefficient, sorted by decreasing
#   absolute value. Row names are the variable names when they are unique.
coef2 <- function(fit, s) {
  cf <- as.matrix(coef(fit, s = s))
  keep <- which(cf[, 1] != 0)

  # Take values and names separately: subsetting a one-column matrix down to
  # a single row drops the row name, which used to turn `vars` into "1".
  valores <- unname(cf[keep, 1])
  variables <- rownames(cf)[keep]
  if (is.null(variables)) {
    variables <- as.character(seq_along(valores))
  }

  out <- data.frame(coef = valores, vars = variables, stringsAsFactors = FALSE)
  if (!anyDuplicated(variables)) {
    rownames(out) <- variables
  }
  out[order(abs(out$coef), decreasing = TRUE), , drop = FALSE]
}
# Some coefficients are very large; they can be taken out
coef2(fit = fit_lasso, s = 0.059660)
## coef vars
## blanca -274.652257 blanca
## kong -124.723650 kong
## inmuebl -92.301002 inmuebl
## bajo -47.041971 bajo
## ideal -25.538109 ideal
## vez 25.520169 vez
## plata -20.531918 plata
## publicacion -17.213590 publicacion
## último 12.179236 último
## arab -11.827196 arab
## capit -11.225988 capit
## partido 7.987047 partido
## fecha 6.941993 fecha
## cierto 6.480958 cierto
## jugador 6.016521 jugador
## equipo 4.839720 equipo
## hace 4.179027 hace
## venta -4.005150 venta
## segunda 3.430944 segunda
## pago -3.306324 pago
## club 2.990045 club
## servicio -2.789868 servicio
## entrega -2.321063 entrega
## auto -2.269953 auto
## torneo 1.262399 torneo
## aviso -1.042313 aviso
## oferta -0.998149 oferta
## deport 0.802503 deport
## (Intercept) 0.628807 (Intercept)
## pesar 0.402310 pesar
## paÃs -0.180308 paÃs
## rival 0.159893 rival
## encuentro 0.083354 encuentro
## alta -0.001166 alta
# Lower and upper bounds can be imposed on the coefficients.
# The bounds are passed under their full argument names (lower.limits and
# upper.limits) instead of relying on partial matching of `lower` / `upper`.
fit_lasso_restringido <- glmnet(
  x, y,
  family = "binomial",
  alpha = 1,
  standardize = TRUE,
  lower.limits = -40,
  upper.limits = 40
)
fit_lasso_restringido
##
## Call: glmnet(x = x, y = y, family = "binomial", alpha = 1, standardize = TRUE, lower.limits = -40, upper.limits = 40)
##
## Df %Dev Lambda
## [1,] 0 0.00000000000000382 0.33400
## [2,] 2 0.03290000000000000 0.31800
## [3,] 2 0.06460000000000000 0.30400
## [4,] 2 0.09310000000000000 0.29000
## [5,] 4 0.11799999999999999 0.27700
## [6,] 4 0.14199999999999999 0.26400
## [7,] 4 0.16400000000000001 0.25200
## [8,] 4 0.18500000000000000 0.24100
## [9,] 5 0.20200000000000001 0.23000
## [10,] 5 0.21800000000000000 0.21900
## [11,] 5 0.23400000000000001 0.20900
## [12,] 7 0.24800000000000000 0.20000
## [13,] 7 0.26200000000000001 0.19100
## [14,] 8 0.27500000000000002 0.18200
## [15,] 8 0.28699999999999998 0.17400
## [16,] 8 0.29899999999999999 0.16600
## [17,] 8 0.31000000000000000 0.15800
## [18,] 8 0.32100000000000001 0.15100
## [19,] 9 0.33000000000000002 0.14400
## [20,] 11 0.33900000000000002 0.13800
## [21,] 13 0.34999999999999998 0.13200
## [22,] 14 0.36199999999999999 0.12600
## [23,] 16 0.37800000000000000 0.12000
## [24,] 16 0.39400000000000002 0.11400
## [25,] 17 0.40999999999999998 0.10900
## [26,] 17 0.42399999999999999 0.10400
## [27,] 19 0.43800000000000000 0.09950
## [28,] 24 0.45300000000000001 0.09500
## [29,] 26 0.46600000000000003 0.09070
## [30,] 24 0.47899999999999998 0.08660
## [31,] 25 0.49199999999999999 0.08260
## [32,] 26 0.50400000000000000 0.07890
## [33,] 29 0.51700000000000002 0.07530
## [34,] 31 0.53100000000000003 0.07190
## [35,] 38 0.54500000000000004 0.06860
## [36,] 41 0.56200000000000006 0.06550
## [37,] 43 0.57799999999999996 0.06250
## [38,] 46 0.59399999999999997 0.05970
## [39,] 47 0.60899999999999999 0.05700
## [40,] 47 0.62400000000000000 0.05440
## [41,] 49 0.63700000000000001 0.05190
## [42,] 51 0.65000000000000002 0.04950
## [43,] 50 0.66300000000000003 0.04730
## [44,] 56 0.67400000000000004 0.04510
## [45,] 59 0.68600000000000005 0.04310
## [46,] 64 0.69699999999999995 0.04110
## [47,] 65 0.70799999999999996 0.03930
## [48,] 68 0.71899999999999997 0.03750
## [49,] 74 0.73099999999999998 0.03580
## [50,] 75 0.74299999999999999 0.03410
## [51,] 78 0.75400000000000000 0.03260
## [52,] 84 0.76400000000000001 0.03110
## [53,] 87 0.77500000000000002 0.02970
## [54,] 87 0.78600000000000003 0.02830
## [55,] 87 0.79500000000000004 0.02710
## [56,] 89 0.80500000000000005 0.02580
## [57,] 92 0.81399999999999995 0.02470
## [58,] 94 0.82299999999999995 0.02350
## [59,] 99 0.83099999999999996 0.02250
## [60,] 102 0.83899999999999997 0.02140
## [61,] 109 0.84599999999999997 0.02050
## [62,] 110 0.85399999999999998 0.01950
## [63,] 117 0.86099999999999999 0.01860
## [64,] 118 0.86699999999999999 0.01780
## [65,] 121 0.87300000000000000 0.01700
## [66,] 119 0.87900000000000000 0.01620
## [67,] 119 0.88500000000000001 0.01550
## [68,] 123 0.89000000000000001 0.01480
## [69,] 125 0.89500000000000002 0.01410
## [70,] 126 0.90000000000000002 0.01350
## [71,] 128 0.90400000000000003 0.01290
## [72,] 131 0.90900000000000003 0.01230
## [73,] 132 0.91300000000000003 0.01170
## [74,] 135 0.91700000000000004 0.01120
## [75,] 135 0.92100000000000004 0.01070
## [76,] 138 0.92400000000000004 0.01020
## [77,] 139 0.92800000000000005 0.00972
## [78,] 139 0.93100000000000005 0.00928
## [79,] 142 0.93400000000000005 0.00886
## [80,] 142 0.93700000000000006 0.00846
## [81,] 143 0.93999999999999995 0.00807
## [82,] 143 0.94299999999999995 0.00771
## [83,] 145 0.94599999999999995 0.00736
## [84,] 146 0.94799999999999995 0.00702
## [85,] 149 0.95099999999999996 0.00670
## [86,] 150 0.95299999999999996 0.00640
## [87,] 151 0.95499999999999996 0.00611
## [88,] 152 0.95699999999999996 0.00583
## [89,] 153 0.95899999999999996 0.00556
## [90,] 152 0.96099999999999997 0.00531
## [91,] 154 0.96299999999999997 0.00507
## [92,] 153 0.96399999999999997 0.00484
## [93,] 150 0.96599999999999997 0.00462
## [94,] 150 0.96699999999999997 0.00441
## [95,] 151 0.96899999999999997 0.00421
## [96,] 152 0.96999999999999997 0.00402
## [97,] 154 0.97099999999999997 0.00384
## [98,] 159 0.97299999999999998 0.00366
## [99,] 162 0.97399999999999998 0.00349
## [100,] 165 0.97499999999999998 0.00334
### High penalty -> few variables enter the model
coef2(fit = fit_lasso_restringido, s = 0.1)
## coef vars
## arab -40.000000000000000 arab
## bahÃa -40.000000000000000 bahÃa
## blanca -40.000000000000000 blanca
## inmuebl -40.000000000000000 inmuebl
## kong -40.000000000000000 kong
## malta -40.000000000000000 malta
## posada -40.000000000000000 posada
## rusa -40.000000000000000 rusa
## bajo -38.295836923808764 bajo
## aruba -35.915038308642110 aruba
## costa -12.921127953665563 costa
## plata -10.792214126457910 plata
## capit -6.437598546805154 capit
## vez 6.198014336240986 vez
## partido 5.453426088216941 partido
## equipo 2.891886140663674 equipo
## fecha 0.777579368535495 fecha
## (Intercept) 0.414760490959842 (Intercept)
## jugador 0.220952524395958 jugador
## bahama -0.000000000004021 bahama
### Lower penalty -> more variables, some of them more suspicious
coef2(fit = fit_lasso_restringido, s = 0.01)
## coef vars
## algún 40.0000000000000000 algún
## alta -40.0000000000000000 alta
## arab -40.0000000000000000 arab
## aruba -40.0000000000000000 aruba
## bahama -40.0000000000000000 bahama
## bahÃa -40.0000000000000000 bahÃa
## bajo -40.0000000000000000 bajo
## bangladesh -40.0000000000000000 bangladesh
## belic -40.0000000000000000 belic
## blanca -40.0000000000000000 blanca
## cierto 40.0000000000000000 cierto
## corrient -40.0000000000000000 corrient
## ello 40.0000000000000000 ello
## ideal -40.0000000000000000 ideal
## inmuebl -40.0000000000000000 inmuebl
## interesa -40.0000000000000000 interesa
## kong -40.0000000000000000 kong
## paÃs -40.0000000000000000 paÃs
## posada -40.0000000000000000 posada
## publicacion -40.0000000000000000 publicacion
## vez 40.0000000000000000 vez
## plata -39.7115779397192910 plata
## argelia -39.3244818426938423 argelia
## capit -38.7215865344786039 capit
## caico -37.9405150995484917 caico
## costa -36.4032445362836015 costa
## fundament -35.4146958971711143 fundament
## malta -34.1101071403390819 malta
## miedo -30.0349328315879873 miedo
## queda 28.6462681745811771 queda
## vacacion -28.4958461219429218 vacacion
## mar -28.0879662039648395 mar
## individual -26.8367224850287229 individual
## domÃnica -26.6360454822746675 domÃnica
## unido -26.4015471166369267 unido
## cameroon -25.8919860297048778 cameroon
## defensa 25.5368565977205577 defensa
## pesar 25.4035284636747392 pesar
## ser 24.3318228699038528 ser
## rico -24.0761921174218472 rico
## caja -23.3830153656451927 caja
## sri -20.8497999146224267 sri
## simpl -20.7486706670411039 simpl
## dÃas 18.7250018262542852 dÃas
## venta -18.2530495562963431 venta
## último 17.7050419534127634 último
## entrega -17.5963054359478335 entrega
## servicio -17.2117802320654931 servicio
## proceso -16.5965401198227198 proceso
## entrenador 16.5021452224543026 entrenador
## guÃa -16.4842499953217043 guÃa
## bielorusia -16.1398499662682440 bielorusia
## rosario -15.6780795176210876 rosario
## fecha 15.0639183885254226 fecha
## pago -14.2126106821085738 pago
## cualquier -14.1745267525745717 cualquier
## tel -14.0002214927757898 tel
## libro -13.6559203971825127 libro
## oferta -13.2065166015854896 oferta
## hace 13.1847628464904396 hace
## deport 13.1773367543195832 deport
## distancia -12.5552906624415179 distancia
## chile -12.5022036124949860 chile
## lui -12.3511906614605742 lui
## aviso -12.1804892329072061 aviso
## gran -11.8428705368588751 gran
## club 11.7592554479427029 club
## auto -11.5381869448275598 auto
## producto -11.2767277517273499 producto
## intensidad -11.0958134112722071 intensidad
## partido 9.6318781943995901 partido
## kazajstán -9.4913699822695534 kazajstán
## meno 9.4290720546113338 meno
## palabra 9.3856372962717565 palabra
## vece 8.9348721831562976 vece
## pintura -8.5943828653027285 pintura
## equipo 8.4339588517743245 equipo
## asociación 8.3692501533964183 asociación
## luego 8.1117875010926639 luego
## feder -8.1030471730490845 feder
## instrumento -7.9883596999168782 instrumento
## dueño -7.9285726720505494 dueño
## dado 7.7056895867672681 dado
## rusa -7.1540430381113609 rusa
## camino 6.9663595833229266 camino
## sabe 6.7098227026790687 sabe
## toda -6.3766342918853915 toda
## torneo 6.3368197721397603 torneo
## encuentro 6.2763065925231096 encuentro
## electrónico 5.8026382871484028 electrónico
## diez -5.7643438065540931 diez
## técnica 5.2391747909575050 técnica
## nivel 5.0853556807237190 nivel
## cancha 4.9788800016662229 cancha
## segunda 4.7945701004247727 segunda
## mucha 4.5425686585537441 mucha
## patada 4.2838233667380869 patada
## combat 4.0057406907747666 combat
## etapa -3.7925135887466785 etapa
## nacion 3.5331911219342551 nacion
## compra -3.3942754519704268 compra
## base 3.3246379667888331 base
## final 3.2744576424561931 final
## ley 3.1419418379692163 ley
## calidad -2.9792382962834676 calidad
## espero -2.9461677327819622 espero
## condicion -2.7465977932120822 condicion
## continuación 2.6491206769659019 continuación
## clima -2.4777412343509280 clima
## usado -2.4674865164601760 usado
## client -2.3873377754651370 client
## misma 2.3593013803289695 misma
## alumno 1.9137208003405093 alumno
## (Intercept) 1.7157654083836689 (Intercept)
## mercado -1.6992392207577849 mercado
## jamaica -1.6575610235780958 jamaica
## jugador 1.6557140772373300 jugador
## quinta -1.6459334555864686 quinta
## mientra 1.5990925541996701 mientra
## deportiva 1.3484325709989946 deportiva
## cocina -1.3356173954464619 cocina
## modelo -1.2979309564655490 modelo
## san -1.2940609101705638 san
## carga -1.2642013811668580 carga
## cuota -1.0097675861915947 cuota
## deportivo 0.6651721149137295 deportivo
## obra -0.4810210406210172 obra
## mauritius -0.3862723154666317 mauritius
## localidad 0.3835546931199125 localidad
## principado -0.2523150907928940 principado
## comentario 0.2482785762389935 comentario
## motor -0.2302880575478789 motor
## eje -0.2027238572069245 eje
## mundial 0.1407378255026032 mundial
## super -0.0404909901014335 super
## alguien 0.0373751697253266 alguien
## pakistan -0.0351963389295912 pakistan
## emirato -0.0114864800815509 emirato
## botswana -0.0000000000008630 botswana
## lanka -0.0000000000004657 lanka
### Plots
# Default coefficient-path plot.
plot(fit_lasso_restringido)

# Coefficient paths against deviance, with labelled curves.
plot(fit_lasso_restringido, xvar = "dev", label = TRUE)

# Coefficient paths against lambda, with labelled curves.
plot(fit_lasso_restringido, xvar = "lambda", label = TRUE)

# Lambdas - Devianza
# fit_res <- as.data.frame(cbind(fit_lasso$df, fit_lasso$dev, fit_lasso$lambda))
# names(fit_res) <- c("Q Variables (DF)", "Devianza", "Lambda")
# head(fit_res, 100)
# Predictions using all the lambdas
pred_mat <- predict(
  fit_lasso_restringido,
  newx = as.matrix(test[, predictors]),
  type = "class"
)
# Accuracy of a confusion matrix: sum of the diagonal divided by the sum of
# all cells.
accuracy <- function(mat_confusion) {
  aciertos <- sum(diag(mat_confusion))
  total <- sum(mat_confusion)
  aciertos / total
}
# Cross-tabulate a vector of predictions against `test$clase` and derive
# metrics from the resulting confusion matrix.
#
# Args:
#   x: predicted classes, one per row of `test` (read from the enclosing
#      environment).
#
# Returns:
#   When the table has a single row (only one class was predicted): a list
#   with `tbl` and zeroed `acc`, `recall`, `precision`, `specificity`, `F1`.
#   Otherwise: a list with `tbl`, `accuracy`, `sensitividad` (cell [1, 1]
#   over the first column total) and `especificidad` (cell [2, 2] over the
#   second column total).
eval_modelos <- function(x) {
  tbl <- table(x, test$clase)

  # Degenerate model: every observation was assigned to the same class.
  if (nrow(tbl) == 1) {
    return(list(tbl = tbl, acc = 0, recall = 0, precision = 0, specificity = 0, F1 = 0))
  }

  list(
    tbl = tbl,
    accuracy = sum(diag(tbl)) / sum(tbl),
    sensitividad = tbl[1, 1] / sum(tbl[, 1]),
    especificidad = tbl[2, 2] / sum(tbl[, 2])
  )
}
# Evaluate the predictions of every lambda (one column of pred_mat each).
# lapply() always yields a list, whereas apply() changes its return shape
# when every result happens to have the same length.
res_fit_lasso <- lapply(
  seq_len(ncol(pred_mat)),
  function(j) eval_modelos(pred_mat[, j])
)
names(res_fit_lasso) <- colnames(pred_mat)
metricas <- c("sensitividad", "especificidad", "accuracy")
# Keep only the models that report all three metrics; models that predicted
# a single class do not carry them. This replaces the hard-coded removal of
# the first ten lambdas.
validos <- vapply(
  res_fit_lasso,
  function(r) all(metricas %in% names(r)),
  logical(1)
)
met_lasso <- lapply(metricas, function(m) {
  vapply(res_fit_lasso[validos], function(r) as.numeric(r[[m]]), numeric(1),
         USE.NAMES = FALSE)
})
met_lasso <- as.data.frame(do.call("cbind", met_lasso))
names(met_lasso) <- metricas
# met_lasso$lambda_fact <- as.factor(row.names(met_lasso))
# The predictions come from fit_lasso_restringido, so the lambdas must be
# taken from that same model (the original read them from fit_lasso).
met_lasso$lambda <- fit_lasso_restringido$lambda[validos]
melt_met_lasso <- reshape2::melt(met_lasso, id.vars = c("lambda"))
# Plot of each metric along the lambda path
ggplot(
  melt_met_lasso,
  aes(x = lambda, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  scale_x_continuous(breaks = seq(0, 1, 0.03)) +
  labs(title = "Metricas - Regresion Lasso")

# The variables
coef2(fit = fit_lasso_restringido, s = 0.03)
## coef vars
## arab -40.00000000000000000 arab
## bahÃa -40.00000000000000000 bahÃa
## bajo -40.00000000000000000 bajo
## blanca -40.00000000000000000 blanca
## cierto 40.00000000000000000 cierto
## ideal -40.00000000000000000 ideal
## inmuebl -40.00000000000000000 inmuebl
## kong -40.00000000000000000 kong
## malta -40.00000000000000000 malta
## posada -40.00000000000000000 posada
## publicacion -40.00000000000000000 publicacion
## rusa -40.00000000000000000 rusa
## unido -40.00000000000000000 unido
## bahama -39.99932691196014645 bahama
## plata -31.47006844801154202 plata
## vez 31.39657308801278290 vez
## aruba -28.86289033989351793 aruba
## paÃs -24.27169243110974151 paÃs
## interesa -23.91837778873620124 interesa
## costa -23.37709926358586543 costa
## capit -22.55205002984466489 capit
## alta -20.96477939141379210 alta
## corrient -20.88270814553446186 corrient
## último 20.74489936228430409 último
## algún 16.19909795564559829 algún
## pesar 14.10384641040603171 pesar
## bangladesh -13.83897991897264568 bangladesh
## fecha 12.27047632768618968 fecha
## venta -11.48763130937532040 venta
## hace 10.89763731283235693 hace
## pago -10.10391276294430085 pago
## ser 9.18648550476234149 ser
## partido 9.17043168395568387 partido
## oferta -9.05064084211478104 oferta
## servicio -8.65003664686511975 servicio
## club 7.63451759624667581 club
## deport 7.56932749994904874 deport
## equipo 7.49195367520923394 equipo
## entrega -7.31730279501204528 entrega
## entrenador 6.39854648784200020 entrenador
## ello 6.36382908806739334 ello
## auto -6.31382415383148032 auto
## tel -6.29514525562471405 tel
## rosario -6.29156263964139484 rosario
## rival 6.22316474436080291 rival
## jugador 6.11083681715957105 jugador
## aviso -6.11038975499248682 aviso
## meno 6.08466992202060286 meno
## individual -5.76553361007994347 individual
## defensa 5.24474675166504412 defensa
## feder -4.92263228799246555 feder
## caja -4.62552161933269002 caja
## vece 4.55522001936098331 vece
## mar -4.43298253029397848 mar
## luego 4.22515330099807151 luego
## torneo 4.05075003181442117 torneo
## proceso -3.93906094446558308 proceso
## final 3.75793259441874294 final
## aunqu 3.69605739024292257 aunqu
## motor -3.41768375285612613 motor
## cualquier -3.27709072823563652 cualquier
## producto -3.18256359521445908 producto
## queda 2.67037326206913583 queda
## encuentro 2.26757913934614219 encuentro
## técnica 2.24562410603837304 técnica
## calidad -1.98265321565457220 calidad
## segunda 1.98135347009402185 segunda
## simpl -1.91399943899542291 simpl
## cancha 1.72492077613540973 cancha
## dÃas 1.55464602520842243 dÃas
## toda -1.41646311949624937 toda
## asociación 1.38038058929948915 asociación
## electrónico 0.95510712322789582 electrónico
## misma 0.93762672140637515 misma
## (Intercept) 0.85655940793664109 (Intercept)
## guÃa -0.79937605581556526 guÃa
## instrumento -0.61603912819121698 instrumento
## entrenamiento 0.56472045572571761 entrenamiento
## sol -0.36362747720491256 sol
## camino 0.31789497298592179 camino
## compra -0.20761590593549836 compra
## comentario 0.01804591056237849 comentario
## cameroon -0.01536584184296038 cameroon
## mientra 0.00829615567997205 mientra
## caico -0.00000000025601848 caico
## belic -0.00000000000112899 belic
## bielorusia -0.00000000000001792 bielorusia
## botswana -0.00000000000001792 botswana
Elastic-Net
Alpha 0.8 - más cerca de lasso que ridge
# Cross-validated elastic net with alpha = 0.8, 3 folds, AUC as the measure.
# `nfolds` and `type.measure` are spelled out in full; the original `nfold`
# and `type.measur` only worked through partial argument matching.
set.seed(666)
cvfit_elastic_08 <- cv.glmnet(
  x, y,
  family = "binomial",
  alpha = 0.8,
  nfolds = 3,
  parallel = TRUE,
  standardize = TRUE,
  type.measure = "auc"
)
plot(cvfit_elastic_08)

Resultados Elastic-Net
# Class predictions on the test set at the lambda with the best CV score.
pred_enet_08 <- predict(
  cvfit_elastic_08,
  newx = as.matrix(test[, predictors]),
  s = "lambda.min",
  type = "class"
)
# The predictions are flattened and converted to a factor that shares the
# levels of the reference, because recent caret versions require factors
# with identical levels here.
confusionMatrix(
  factor(as.vector(pred_enet_08), levels = levels(factor(test$clase))),
  factor(test$clase)
)$table
## Reference
## Prediction shopping sports
## shopping 228 7
## sports 20 152
Comparación de modelos
# Fit a cross-validated elastic-net model for one alpha and predict the
# test set with it.
#
# Args:
#   alpha: mixing parameter forwarded to cv.glmnet().
#
# Returns:
#   A list with `fit` (the cv.glmnet object) and `pred` (class predictions
#   for `test[, predictors]` at lambda.min).
#
# Reads `x`, `y`, `test` and `predictors` from the enclosing environment.
# The seed is reset on every call so each alpha uses the same fold split.
train_alpha <- function(alpha) {
  set.seed(666)
  # Full argument names (nfolds, type.measure) instead of the partially
  # matched `nfold` / `type.measur`.
  fit <- cv.glmnet(
    x, y,
    family = "binomial",
    alpha = alpha,
    nfolds = 3,
    parallel = TRUE,
    standardize = TRUE,
    type.measure = "auc"
  )
  pred <- predict(
    fit,
    newx = as.matrix(test[, predictors]),
    s = "lambda.min",
    type = "class"
  )
  list(fit = fit, pred = pred)
}
# Sequence of alphas to model.
alphas <- seq(0, 1, 0.01)
# Apply train_alpha() to every alpha.
enets <- lapply(alphas, train_alpha)
fits <- lapply(enets, `[[`, "fit")
preds <- lapply(enets, `[[`, "pred")
# AUC and number of variables of each model.
# vapply() guarantees one numeric value per model instead of relying on
# sapply()'s input-dependent simplification.
AUC <- vapply(
  fits,
  function(f) as.numeric(f$cvm[[match(f$lambda.min, f$lambda)]]),
  numeric(1)
)
# NOTE: the count is taken over every row returned by coef(), which for
# these fits includes the "(Intercept)" row.
Q_VARIABLES <- vapply(
  fits,
  function(f) as.numeric(sum(coef(f, s = "lambda.min") != 0)),
  numeric(1)
)
# Metrics for every model's test-set predictions.
# The table has predictions in rows and the true classes in columns; the
# first class level is treated as the positive class.
resultados <- lapply(preds, function(x) {
  tbl <- table(x, test$clase)
  sensitividad <- tbl[1, 1] / sum(tbl[, 1])
  especificidad <- tbl[2, 2] / sum(tbl[, 2])
  # F1 is the harmonic mean of precision and recall (sensitivity). The
  # original combined sensitivity with specificity, which is not F1.
  precision <- tbl[1, 1] / sum(tbl[1, ])
  F1 <- (2 * precision * sensitividad) / (precision + sensitividad)
  list(
    tbl = tbl,
    accuracy = accuracy(tbl),
    sensitividad = sensitividad,
    especificidad = especificidad,
    F1 = F1
  )
})
# Results of all the models, one row per alpha
modelos <- local({
  # Pull one named metric out of every element of `resultados`.
  extraer <- function(campo) sapply(resultados, function(r) r[[campo]])
  data.frame(
    modelo = alphas,
    accuracy = extraer("accuracy"),
    sensitividad = extraer("sensitividad"),
    especificidad = extraer("especificidad"),
    F1 = extraer("F1"),
    AUC = AUC,
    Q_VARIABLES = Q_VARIABLES
  )
})
# Long format (one row per model and metric) for plotting.
melt_modelos <- reshape2::melt(modelos, id.vars = "modelo")
# AUC and number of variables
# Labels are passed to labs() as named arguments; wrapping them in list()
# is not unwrapped by current ggplot2 releases, so the labels would be lost.
ggplot(
  melt_modelos[melt_modelos$variable %in% c("AUC", "Q_VARIABLES"), ],
  aes(x = modelo, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  facet_grid(variable ~ ., scales = "free") +
  scale_x_continuous(breaks = seq(0, 1, 0.05)) +
  labs(
    title = "Elastic Net - AUC - Cantidad de Variables",
    x = "Alpha",
    y = "AUC - Cantidad de Variables"
  )

# Specificity and sensitivity
# Labels are passed to labs() as named arguments; wrapping them in list()
# is not unwrapped by current ggplot2 releases, so the labels would be lost.
ggplot(
  melt_modelos[melt_modelos$variable %in% c("especificidad", "sensitividad"), ],
  aes(x = modelo, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  scale_x_continuous(breaks = seq(0, 1, 0.1)) +
  labs(
    title = "Elastic Net - Especificidad - Sensitividad",
    x = "Alpha",
    y = "Metricas"
  )

# Everything together: one panel per metric
ggplot(
  melt_modelos,
  aes(x = modelo, y = value, color = variable, group = variable)
) +
  geom_line(size = 1.2) +
  facet_wrap(~ variable, scales = "free")

head(modelos, n = 50)
## modelo accuracy sensitividad especificidad F1 AUC Q_VARIABLES
## 1 0.00 0.9238 0.9073 0.9497 0.9280 0.9855 2078
## 2 0.01 0.9509 0.9435 0.9623 0.9528 0.9890 1428
## 3 0.02 0.9533 0.9476 0.9623 0.9549 0.9895 1119
## 4 0.03 0.9533 0.9476 0.9623 0.9549 0.9895 932
## 5 0.04 0.9533 0.9476 0.9623 0.9549 0.9899 845
## 6 0.05 0.9533 0.9516 0.9560 0.9538 0.9899 807
## 7 0.06 0.9558 0.9556 0.9560 0.9558 0.9896 699
## 8 0.07 0.9558 0.9556 0.9560 0.9558 0.9896 651
## 9 0.08 0.9558 0.9556 0.9560 0.9558 0.9896 638
## 10 0.09 0.9558 0.9556 0.9560 0.9558 0.9896 621
## 11 0.10 0.9558 0.9556 0.9560 0.9558 0.9893 607
## 12 0.11 0.9558 0.9556 0.9560 0.9558 0.9887 572
## 13 0.12 0.9558 0.9556 0.9560 0.9558 0.9877 532
## 14 0.13 0.9582 0.9556 0.9623 0.9589 0.9872 512
## 15 0.14 0.9582 0.9556 0.9623 0.9589 0.9872 511
## 16 0.15 0.9582 0.9556 0.9623 0.9589 0.9868 495
## 17 0.16 0.9533 0.9516 0.9560 0.9538 0.9864 419
## 18 0.17 0.9558 0.9556 0.9560 0.9558 0.9862 408
## 19 0.18 0.9558 0.9556 0.9560 0.9558 0.9862 413
## 20 0.19 0.9558 0.9556 0.9560 0.9558 0.9859 412
## 21 0.20 0.9533 0.9516 0.9560 0.9538 0.9856 383
## 22 0.21 0.9533 0.9516 0.9560 0.9538 0.9855 381
## 23 0.22 0.9533 0.9516 0.9560 0.9538 0.9853 377
## 24 0.23 0.9533 0.9516 0.9560 0.9538 0.9851 367
## 25 0.24 0.9533 0.9516 0.9560 0.9538 0.9851 369
## 26 0.25 0.9533 0.9516 0.9560 0.9538 0.9849 369
## 27 0.26 0.9509 0.9476 0.9560 0.9518 0.9845 324
## 28 0.27 0.9509 0.9476 0.9560 0.9518 0.9844 325
## 29 0.28 0.9509 0.9476 0.9560 0.9518 0.9843 322
## 30 0.29 0.9484 0.9476 0.9497 0.9486 0.9843 307
## 31 0.30 0.9484 0.9476 0.9497 0.9486 0.9841 305
## 32 0.31 0.9484 0.9476 0.9497 0.9486 0.9841 288
## 33 0.32 0.9509 0.9476 0.9560 0.9518 0.9839 319
## 34 0.33 0.9509 0.9476 0.9560 0.9518 0.9838 320
## 35 0.34 0.9484 0.9476 0.9497 0.9486 0.9835 293
## 36 0.35 0.9484 0.9476 0.9497 0.9486 0.9830 289
## 37 0.36 0.9459 0.9435 0.9497 0.9466 0.9832 278
## 38 0.37 0.9459 0.9435 0.9497 0.9466 0.9826 283
## 39 0.38 0.9435 0.9395 0.9497 0.9446 0.9826 270
## 40 0.39 0.9410 0.9355 0.9497 0.9425 0.9821 253
## 41 0.40 0.9410 0.9355 0.9497 0.9425 0.9818 257
## 42 0.41 0.9386 0.9274 0.9560 0.9415 0.9815 174
## 43 0.42 0.9386 0.9274 0.9560 0.9415 0.9811 172
## 44 0.43 0.9386 0.9274 0.9560 0.9415 0.9809 177
## 45 0.44 0.9386 0.9274 0.9560 0.9415 0.9809 177
## 46 0.45 0.9410 0.9315 0.9560 0.9436 0.9805 179
## 47 0.46 0.9435 0.9355 0.9560 0.9456 0.9804 202
## 48 0.47 0.9435 0.9355 0.9560 0.9456 0.9802 200
## 49 0.48 0.9410 0.9315 0.9560 0.9436 0.9800 169
## 50 0.49 0.9386 0.9274 0.9560 0.9415 0.9797 161
tail(modelos, n = 50)
## modelo accuracy sensitividad especificidad F1 AUC Q_VARIABLES
## 52 0.51 0.9386 0.9274 0.956 0.9415 0.9794 159
## 53 0.52 0.9337 0.9194 0.956 0.9373 0.9789 148
## 54 0.53 0.9337 0.9194 0.956 0.9373 0.9790 148
## 55 0.54 0.9337 0.9194 0.956 0.9373 0.9789 146
## 56 0.55 0.9337 0.9194 0.956 0.9373 0.9788 141
## 57 0.56 0.9361 0.9234 0.956 0.9394 0.9786 146
## 58 0.57 0.9337 0.9194 0.956 0.9373 0.9784 138
## 59 0.58 0.9337 0.9194 0.956 0.9373 0.9781 137
## 60 0.59 0.9337 0.9194 0.956 0.9373 0.9783 136
## 61 0.60 0.9337 0.9194 0.956 0.9373 0.9781 135
## 62 0.61 0.9337 0.9194 0.956 0.9373 0.9779 134
## 63 0.62 0.9337 0.9194 0.956 0.9373 0.9777 131
## 64 0.63 0.9312 0.9153 0.956 0.9352 0.9773 129
## 65 0.64 0.9337 0.9194 0.956 0.9373 0.9771 133
## 66 0.65 0.9337 0.9194 0.956 0.9373 0.9768 133
## 67 0.66 0.9337 0.9194 0.956 0.9373 0.9765 132
## 68 0.67 0.9337 0.9194 0.956 0.9373 0.9763 134
## 69 0.68 0.9337 0.9194 0.956 0.9373 0.9761 131
## 70 0.69 0.9337 0.9194 0.956 0.9373 0.9758 131
## 71 0.70 0.9337 0.9194 0.956 0.9373 0.9755 128
## 72 0.71 0.9337 0.9194 0.956 0.9373 0.9756 129
## 73 0.72 0.9337 0.9194 0.956 0.9373 0.9754 127
## 74 0.73 0.9337 0.9194 0.956 0.9373 0.9754 122
## 75 0.74 0.9337 0.9194 0.956 0.9373 0.9751 120
## 76 0.75 0.9337 0.9194 0.956 0.9373 0.9749 120
## 77 0.76 0.9337 0.9194 0.956 0.9373 0.9745 119
## 78 0.77 0.9337 0.9194 0.956 0.9373 0.9744 118
## 79 0.78 0.9337 0.9194 0.956 0.9373 0.9741 115
## 80 0.79 0.9337 0.9194 0.956 0.9373 0.9738 114
## 81 0.80 0.9337 0.9194 0.956 0.9373 0.9734 107
## 82 0.81 0.9337 0.9194 0.956 0.9373 0.9734 108
## 83 0.82 0.9337 0.9194 0.956 0.9373 0.9729 105
## 84 0.83 0.9361 0.9234 0.956 0.9394 0.9727 97
## 85 0.84 0.9361 0.9234 0.956 0.9394 0.9727 89
## 86 0.85 0.9361 0.9234 0.956 0.9394 0.9725 84
## 87 0.86 0.9361 0.9234 0.956 0.9394 0.9723 83
## 88 0.87 0.9361 0.9234 0.956 0.9394 0.9718 85
## 89 0.88 0.9361 0.9234 0.956 0.9394 0.9717 80
## 90 0.89 0.9361 0.9234 0.956 0.9394 0.9716 79
## 91 0.90 0.9361 0.9234 0.956 0.9394 0.9714 79
## 92 0.91 0.9361 0.9234 0.956 0.9394 0.9712 79
## 93 0.92 0.9361 0.9234 0.956 0.9394 0.9708 79
## 94 0.93 0.9361 0.9234 0.956 0.9394 0.9704 78
## 95 0.94 0.9361 0.9234 0.956 0.9394 0.9703 78
## 96 0.95 0.9361 0.9234 0.956 0.9394 0.9701 78
## 97 0.96 0.9361 0.9234 0.956 0.9394 0.9702 77
## 98 0.97 0.9337 0.9194 0.956 0.9373 0.9701 77
## 99 0.98 0.9337 0.9194 0.956 0.9373 0.9703 77
## 100 0.99 0.9337 0.9194 0.956 0.9373 0.9704 75
## 101 1.00 0.9337 0.9194 0.956 0.9373 0.9703 74