#-------------------------------------------------------------------------
# REGRESSION ANALYSIS
#-------------------------------------------------------------------------
In this project we work with the Boston data set from the MASS
package, which records the median home value in 506 residential
areas of Boston. Along with the price, 13 additional variables
were recorded.
This data frame contains the following columns:
crim: per-capita crime rate by town.
zn: proportion of residential land zoned for lots of more than
25,000 square feet.
indus: proportion of non-retail business acres per town.
chas: Charles River dummy variable (= 1 if the tract bounds the
river; 0 otherwise).
nox: nitrogen oxide concentration (parts per 10
million).
rm: average number of rooms per dwelling.
age: proportion of owner-occupied units built before
1940.
dis: weighted mean of the distances to five Boston employment
centers.
rad: index of accessibility to radial highways.
tax: full-value property tax rate per $10,000.
ptratio: pupil-teacher ratio by town.
black: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents
by town.
lstat: lower status of the population (percent).
medv: median value of owner-occupied homes (in thousands of dollars).
First, a basic numerical and graphical analysis of the data is
carried out.
library(MASS)
data <- Boston
head(data)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# chas is a categorical variable, so it is converted to a factor
data$chas <- as.factor(Boston$chas)
summary(data)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio black lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
# Distribution of the median home value (apparently non-normal, with positive skew)
hist(data$medv, probability = TRUE)
lines(density(data$medv), col=2, lwd=2)
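# A quick numeric check of the visual impression above: sample skewness
# computed by hand (a sketch; positive values indicate right skew)
m <- mean(data$medv); s <- sd(data$medv)
mean((data$medv - m)^3)/s^3 # about 1.1 for medv, consistent with the right-skewed histogram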
# Relationship between the median home value and the candidate predictors
par(mfrow=c(2,3))
# Scatterplots
plot(data$crim,data$medv) ## not linear
plot(data$zn,data$medv) ## not linear
plot(data$indus,data$medv) ## not linear
plot(data$chas,data$medv) ## not linear
plot(data$nox,data$medv) ## not linear
plot(data$rm,data$medv) ## linear
plot(data$age,data$medv) ## not linear
plot(data$dis,data$medv) ## linear
plot(data$rad,data$medv) ## not linear
plot(data$tax,data$medv) ## not linear
plot(data$ptratio,data$medv) ## not linear
plot(data$black,data$medv) ## not linear
plot(data$lstat,data$medv) ## linear
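# A sketch of the same panels generated more compactly with a loop over the
# predictor names (assumes the default graphics device):
par(mfrow=c(3,5))
for (v in setdiff(names(data), "medv")) {
  plot(data[[v]], data$medv, xlab=v, ylab="medv")
}
par(mfrow=c(1,1))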
#==========================================================================
# Estimation
#==========================================================================
# Regression model
form <- medv ~ rm+dis+lstat+ptratio
modelo <- lm(formula=form, data=data)
summary(modelo)
##
## Call:
## lm(formula = form, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.4172 -3.0971 -0.6397 1.8727 27.1088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.47136 4.07802 6.001 3.77e-09 ***
## rm 4.22379 0.42382 9.966 < 2e-16 ***
## dis -0.55193 0.12695 -4.348 1.67e-05 ***
## lstat -0.66544 0.04675 -14.233 < 2e-16 ***
## ptratio -0.97365 0.11603 -8.391 4.94e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.139 on 501 degrees of freedom
## Multiple R-squared: 0.6903, Adjusted R-squared: 0.6878
## F-statistic: 279.2 on 4 and 501 DF, p-value: < 2.2e-16
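# Reading the fit: medv-hat = 24.47 + 4.22*rm - 0.55*dis - 0.67*lstat - 0.97*ptratio.
# A sketch of a point prediction for a hypothetical tract (illustrative values):
nuevo <- data.frame(rm=6, dis=4, lstat=12, ptratio=18) # hypothetical tract
predict(modelo, newdata=nuevo) # about 22, i.e. a median value near $22,000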
#===============================================================================
# Checking the model assumptions
#===============================================================================
# """"""""""""""""
# Q-Q plot
qqnorm(rstudent(modelo),
       ylab="Studentized residual quantiles",
       xlab="Theoretical quantiles")
qqline(rstudent(modelo))
# Normality test
shapiro.test(rstudent(modelo)) ## not normal
##
## Shapiro-Wilk normality test
##
## data: rstudent(modelo)
## W = 0.90565, p-value < 2.2e-16
# Studentized residual plots
rs <- rstudent(modelo)   # studentized residuals
ypred <- predict(modelo) # fitted values (prediction of the mean)
# Limits for detecting outliers:
# quantiles of the t distribution (Bonferroni correction)
n <- dim(data)[1]
p <- length(modelo$coefficients)
alfa <- 0.01
t <- qt(alfa/(n*2), df=n-p-1) # lower-tail t quantile (negative)
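# A sketch listing the observations flagged as outliers by the Bonferroni
# bound (t is the lower-tail quantile, hence the absolute value):
which(abs(rs) > abs(t))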
par(mfrow=c(3,2))
plot(ypred, rs, ylab="Studentized residuals", xlab="Fitted values",
     main="Studentized residuals vs fitted values", ylim=c(-5,5))
abline(-t, 0)
abline( t, 0)
abline( 0, 0)
plot(data$rm, rs, ylab="Studentized residuals",
     xlab="Average number of rooms per dwelling",
     main="Studentized residuals vs rm", ylim=c(-5,5))
abline(-t, 0)
abline( t, 0)
abline( 0, 0)
plot(data$dis, rs, ylab="Studentized residuals",
     xlab="Weighted distance to five Boston employment centers",
     main="Studentized residuals vs dis", ylim=c(-5,5))
abline(-t, 0)
abline( t, 0)
abline( 0, 0)
plot(data$lstat, rs, ylab="Studentized residuals",
     xlab="Lower status of the population (percent)",
     main="Studentized residuals vs lstat", ylim=c(-5,5))
abline(-t, 0)
abline( t, 0)
abline( 0, 0)
plot(data$ptratio, rs, ylab="Studentized residuals",
     xlab="Pupil-teacher ratio by town",
     main="Studentized residuals vs ptratio", ylim=c(-5,5))
abline(-t, 0)
abline( t, 0)
abline( 0, 0)
par(mfrow=c(1,1))
# (2) Heteroscedasticity (Breusch-Pagan test)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(modelo, varformula = ~fitted.values(modelo), studentize=FALSE)
##
## Breusch-Pagan test
##
## data: modelo
## BP = 0.80169, df = 1, p-value = 0.3706
## Homoscedasticity holds (H0 is not rejected)
# Chi-squared test (similar to the previous one)
library(car)
## Loading required package: carData
ncvTest(modelo) # test for non-constant error variance
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.8016934, Df = 1, p = 0.37059
# (3) Multicollinearity
car::vif(modelo) ## all the VIF values are below 7,
## rm dis lstat ptratio
## 1.695903 1.366730 2.131782 1.206839
## so there is no multicollinearity
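# VIF_j = 1/(1 - R_j^2), where R_j^2 is the R-squared of regressing predictor j
# on the remaining predictors; a sketch verifying the value for rm:
r2_rm <- summary(lm(rm ~ dis + lstat + ptratio, data=data))$r.squared
1/(1 - r2_rm) # should match vif(modelo)["rm"], about 1.70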
# (4) Autocorrelation
# """"""""""""""""""""""""
# Plot of the residuals in observation order
plot(residuals(modelo), ylab="Residuals")
abline(h=0)
# Durbin-Watson test ## autocorrelation is present
library(lmtest)
dwtest(modelo)
##
## Durbin-Watson test
##
## data: modelo
## DW = 0.97414, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
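# The Boston tracts carry no genuine time ordering, so the DW statistic mainly
# reflects how the rows are sorted in the file; an ACF plot of the residuals
# (a sketch) shows the same ordering dependence graphically:
acf(residuals(modelo), main="ACF of the model residuals")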
# Correlations among the four predictors
cor(data[c("rm", "dis", "lstat", "ptratio")])
## rm dis lstat ptratio
## rm 1.0000000 0.2052462 -0.6138083 -0.3555015
## dis 0.2052462 1.0000000 -0.4969958 -0.2324705
## lstat -0.6138083 -0.4969958 1.0000000 0.3740443
## ptratio -0.3555015 -0.2324705 0.3740443 1.0000000
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(325)
# Data split: training/validation (75%) and test (25%) sets
idx_train <- createDataPartition(y=data$medv, p=0.75, list=FALSE, times=1)
datos_train <- data[idx_train,]
datos_test <- data[-idx_train,]
mean(datos_train$medv)
## [1] 22.7021
mean(datos_test$medv)
## [1] 22.0168
mean(data$medv)
## [1] 22.53281
formul <- medv ~ rm+dis+lstat+ptratio
# Parameters
k <- 10
repeticiones <- 5
trc <- trainControl(method="repeatedcv", number=k, repeats=repeticiones,
returnResamp="all")
## Normal linear model
# ______________________
set.seed(835)
modeloLN <- train(formul, data=datos_train, method ="glm", trControl=trc)
modeloLN
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.000539 0.7327531 3.604369
summary(modeloLN)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.9203 -2.9926 -0.4902 2.0343 28.3828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.8536 4.3658 5.006 8.57e-07 ***
## rm 4.6615 0.4467 10.436 < 2e-16 ***
## dis -0.3998 0.1450 -2.758 0.00611 **
## lstat -0.6544 0.0527 -12.416 < 2e-16 ***
## ptratio -1.0266 0.1295 -7.926 2.60e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 25.32944)
##
## Null deviance: 34966.3 on 380 degrees of freedom
## Residual deviance: 9523.9 on 376 degrees of freedom
## AIC: 2319.6
##
## Number of Fisher Scoring iterations: 2
modeloLN$results # RMSE, R2, MAE (and their standard deviations)
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 5.000539 0.7327531 3.604369 1.116272 0.1184313 0.6178617
modeloLN$resample # values for each resample (50 = k*repeticiones)
## RMSE Rsquared MAE parameter Resample
## 1 4.408379 0.7014579 3.039035 none Fold01.Rep1
## 2 4.338228 0.8792529 3.272949 none Fold02.Rep1
## 3 5.247855 0.6679846 3.746958 none Fold03.Rep1
## 4 4.683635 0.7527255 3.467382 none Fold04.Rep1
## 5 6.787827 0.4796984 4.184304 none Fold05.Rep1
## 6 4.801293 0.7719866 3.915903 none Fold06.Rep1
## 7 4.551654 0.8139078 3.354722 none Fold07.Rep1
## 8 5.204671 0.7222084 3.795361 none Fold08.Rep1
## 9 6.449686 0.6192732 4.164082 none Fold09.Rep1
## 10 3.924061 0.8644889 3.075365 none Fold10.Rep1
## 11 4.837342 0.6519809 3.330426 none Fold01.Rep2
## 12 5.686388 0.7354271 4.156856 none Fold02.Rep2
## 13 3.776516 0.8736785 2.783363 none Fold03.Rep2
## 14 3.389044 0.8238059 2.462767 none Fold04.Rep2
## 15 4.857572 0.7754441 3.710986 none Fold05.Rep2
## 16 8.452796 0.3866198 5.247763 none Fold06.Rep2
## 17 4.056696 0.8056301 2.981245 none Fold07.Rep2
## 18 4.928469 0.8530391 3.803843 none Fold08.Rep2
## 19 4.725286 0.7865132 3.703237 none Fold09.Rep2
## 20 4.818587 0.7043865 3.722729 none Fold10.Rep2
## 21 7.128709 0.5350221 4.610090 none Fold01.Rep3
## 22 3.664787 0.8679226 2.879945 none Fold02.Rep3
## 23 3.992174 0.8640661 2.903413 none Fold03.Rep3
## 24 5.499416 0.8546932 4.258526 none Fold04.Rep3
## 25 5.966345 0.6276309 4.105701 none Fold05.Rep3
## 26 4.397165 0.7125252 3.182818 none Fold06.Rep3
## 27 5.731852 0.5892663 3.527246 none Fold07.Rep3
## 28 4.861257 0.7591147 3.722412 none Fold08.Rep3
## 29 3.697400 0.8038619 3.104669 none Fold09.Rep3
## 30 5.253287 0.7091883 3.841354 none Fold10.Rep3
## 31 7.667727 0.4894943 5.341016 none Fold01.Rep4
## 32 5.306014 0.7834401 3.950162 none Fold02.Rep4
## 33 5.462774 0.6026594 3.769523 none Fold03.Rep4
## 34 3.915572 0.8010192 3.122524 none Fold04.Rep4
## 35 3.675710 0.8282373 2.910275 none Fold05.Rep4
## 36 5.020130 0.7289566 3.794208 none Fold06.Rep4
## 37 4.267967 0.8955359 3.168030 none Fold07.Rep4
## 38 4.919291 0.7373698 3.614677 none Fold08.Rep4
## 39 5.478496 0.6881504 3.209126 none Fold09.Rep4
## 40 4.341529 0.7983864 3.242765 none Fold10.Rep4
## 41 6.034121 0.6384404 4.434889 none Fold01.Rep5
## 42 6.536502 0.6616645 4.369320 none Fold02.Rep5
## 43 4.631785 0.7600204 3.665671 none Fold03.Rep5
## 44 3.301993 0.8591778 2.723161 none Fold04.Rep5
## 45 7.013296 0.4494633 4.407800 none Fold05.Rep5
## 46 3.538128 0.7715193 2.657300 none Fold06.Rep5
## 47 4.495501 0.7866458 3.293699 none Fold07.Rep5
## 48 4.635316 0.7885990 3.115548 none Fold08.Rep5
## 49 4.828698 0.7765287 3.539522 none Fold09.Rep5
## 50 4.838006 0.7995453 3.833798 none Fold10.Rep5
summary(modeloLN$resample$RMSE) # summary of the RMSE across the resamples
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.302 4.339 4.833 5.001 5.475 8.453
sd(modeloLN$resample$RMSE) # standard deviation of the RMSE
## [1] 1.116272
se <- sd(modeloLN$resample$RMSE)/sqrt(k*repeticiones) # standard error of the mean RMSE
se
## [1] 0.1578646
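# A rough 95% interval for the mean CV RMSE (a sketch, assuming approximate
# normality of the 50 resample estimates):
mean(modeloLN$resample$RMSE) + c(-2, 2)*se # about 4.68 to 5.32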
library(ggplot2)
library(ggpubr)
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data=modeloLN$resample, aes(x=RMSE)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloLN$resample$RMSE), linetype="dashed") +
theme_bw()
p2 <- ggplot(data=modeloLN$resample, aes(x=1, y=RMSE)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p3 <- ggplot(data=modeloLN$resample, aes(x=Rsquared)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloLN$resample$Rsquared), linetype="dashed") +
theme_bw()
p4 <- ggplot(data=modeloLN$resample, aes(x=1, y=Rsquared)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p5 <- ggplot(data=modeloLN$resample, aes(x=MAE)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloLN$resample$MAE), linetype="dashed") +
theme_bw()
p6 <- ggplot(data=modeloLN$resample, aes(x=1, y=MAE)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol=2, nrow=3)
final_plot <- annotate_figure(final_plot,
top=text_grob("Evaluation: normal linear model", size=15))
final_plot
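# The six-panel figure above is rebuilt for every model that follows; a helper
# such as the hypothetical graficar_remuestreo() sketched here would avoid the
# repetition (the explicit per-model blocks below follow the same pattern):
graficar_remuestreo <- function(mod, titulo) {
  df <- mod$resample
  paneles <- lapply(c("RMSE", "Rsquared", "MAE"), function(metr) {
    dens <- ggplot(df, aes(x=.data[[metr]])) +
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(df[[metr]], na.rm=TRUE), linetype="dashed") +
      theme_bw()
    caja <- ggplot(df, aes(x=1, y=.data[[metr]])) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
    list(dens, caja)
  })
  annotate_figure(ggarrange(plotlist=unlist(paneles, recursive=FALSE),
                            ncol=2, nrow=3),
                  top=text_grob(titulo, size=15))
}
# Example: graficar_remuestreo(modeloLN, "Evaluation: normal linear model")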
## Gamma model (identity link)
# ______________________
set.seed(835)
modeloGI <- train(formul, data=datos_train, method ="glm",
family=Gamma(link=identity), trControl=trc)
## Warning: glm.fit: algorithm did not converge
## (this warning repeats for each of the resampling fits)
modeloGI
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.711663 0.6856126 4.059872
summary(modeloGI)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.87149 -0.15264 -0.02804 0.09981 1.22306
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.01286 4.72327 9.318 < 2e-16 ***
## rm 1.38839 0.46662 2.975 0.00311 **
## dis -0.04619 0.16919 -0.273 0.78498
## lstat -0.51827 0.04400 -11.778 < 2e-16 ***
## ptratio -1.29108 0.15567 -8.294 1.98e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.06854626)
##
## Null deviance: 66.192 on 380 degrees of freedom
## Residual deviance: 23.210 on 376 degrees of freedom
## AIC: 2344.1
##
## Number of Fisher Scoring iterations: 25
modeloGI$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 5.711663 0.6856126 4.059872 1.056502 0.1099726 0.6156645
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data=modeloGI$resample, aes(x=RMSE)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloGI$resample$RMSE), linetype="dashed") +
theme_bw()
p2 <- ggplot(data=modeloGI$resample, aes(x=1, y=RMSE)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p3 <- ggplot(data=modeloGI$resample, aes(x=Rsquared)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloGI$resample$Rsquared), linetype="dashed") +
theme_bw()
p4 <- ggplot(data=modeloGI$resample, aes(x=1, y=Rsquared)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data=modeloGI$resample, aes(x=MAE)) +
geom_density(alpha=0.5, fill="gray50") +
geom_vline(xintercept=mean(modeloGI$resample$MAE), linetype="dashed") +
theme_bw()
p6 <- ggplot(data=modeloGI$resample, aes(x=1, y=MAE)) +
geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
geom_jitter(width=0.05) + labs(x="") + theme_bw() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol=2, nrow=3)
final_plot <- annotate_figure(final_plot,
top=text_grob("Evaluaci?n: Gamma (enlace identidad)", size = 15))
final_plot
## Gamma model (log link)
# ______________________
set.seed(835)
modeloGL <- train(formul, data=datos_train, method="glm",
family=Gamma(link=log), trControl=trc)
modeloGL
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.459273 0.7879906 3.223239
summary(modeloGL)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.81941 -0.11750 -0.02004 0.09142 1.00288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.644296 0.197485 18.454 < 2e-16 ***
## rm 0.106294 0.020204 5.261 2.41e-07 ***
## dis -0.010850 0.006559 -1.654 0.0989 .
## lstat -0.036807 0.002384 -15.439 < 2e-16 ***
## ptratio -0.041054 0.005859 -7.007 1.13e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.05182761)
##
## Null deviance: 66.192 on 380 degrees of freedom
## Residual deviance: 17.734 on 376 degrees of freedom
## AIC: 2240.6
##
## Number of Fisher Scoring iterations: 6
modeloGL$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 4.459273 0.7879906 3.223239 1.050621 0.1033864 0.5591482
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data = modeloGL$resample, aes(x = RMSE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGL$resample$RMSE),
linetype = "dashed") +
theme_bw()
p2 <- ggplot(data = modeloGL$resample, aes(x = 1, y = RMSE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloGL$resample, aes(x = Rsquared)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGL$resample$Rsquared),
linetype = "dashed") +
theme_bw()
p4 <- ggplot(data = modeloGL$resample, aes(x = 1, y = Rsquared)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloGL$resample, aes(x = MAE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGL$resample$MAE),
linetype = "dashed") +
theme_bw()
p6 <- ggplot(data = modeloGL$resample, aes(x = 1, y = MAE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
final_plot,
top = text_grob("Evaluaci?n: Modelo Gamma (enlace logar?tmico)", size = 15))
final_plot
## Gamma model (inverse link)
# ______________________
set.seed(835)
modeloGR <- train(formul, data=datos_train, method ="glm",
family=Gamma(link=inverse), trControl=trc)
modeloGR
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.196792 0.8060329 3.070947
summary(modeloGR)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.85021 -0.11448 -0.01522 0.10336 0.92976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0238310 0.0076155 3.129 0.00189 **
## rm -0.0036528 0.0007227 -5.054 6.75e-07 ***
## dis 0.0005783 0.0002415 2.394 0.01714 *
## lstat 0.0020435 0.0001207 16.930 < 2e-16 ***
## ptratio 0.0011623 0.0002234 5.202 3.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.04529821)
##
## Null deviance: 66.192 on 380 degrees of freedom
## Residual deviance: 16.656 on 376 degrees of freedom
## AIC: 2216.6
##
## Number of Fisher Scoring iterations: 4
modeloGR$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 4.196792 0.8060329 3.070947 1.0509 0.1004044 0.5453499
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data = modeloGR$resample, aes(x = RMSE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGR$resample$RMSE),
linetype = "dashed") +
theme_bw()
p2 <- ggplot(data = modeloGR$resample, aes(x = 1, y = RMSE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloGR$resample, aes(x = Rsquared)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGR$resample$Rsquared),
linetype = "dashed") +
theme_bw()
p4 <- ggplot(data = modeloGR$resample, aes(x = 1, y = Rsquared)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloGR$resample, aes(x = MAE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloGR$resample$MAE),
linetype = "dashed") +
theme_bw()
p6 <- ggplot(data = modeloGR$resample, aes(x = 1, y = MAE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
final_plot,
top = text_grob("Evaluaci?n: Modelo Gamma (enlace rec?proco)", size = 15))
final_plot
## Inverse Gaussian model (identity link)
# ______________________
set.seed(835)
modeloNII <- train(formul, data=datos_train, method="glm",
family=inverse.gaussian(link=identity), trControl=trc)
## Warning: glm.fit: algorithm did not converge
## (this warning repeats for most of the resampling fits; the distinct messages were:)
## Warning in sqrt((weights[good] * mu.eta.val[good]^2)/variance(mu)[good]): NaNs produced
## Warning: model fit failed for Fold03.Rep1: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, :
## NA/NaN/Inf in 'x'
## Warning: model fit failed for Fold05.Rep2: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, :
## NA/NaN/Inf in 'x'
## Warning: model fit failed for Fold10.Rep3: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, :
## NA/NaN/Inf in 'x'
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
modeloNII
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 6.161884 0.6243431 4.359462
summary(modeloNII)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.284289 -0.035274 -0.006564 0.024345 0.309988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.93633 4.97423 10.843 < 2e-16 ***
## rm 0.09596 0.45305 0.212 0.832
## dis 0.13015 0.18997 0.685 0.494
## lstat -0.51766 0.04231 -12.236 < 2e-16 ***
## ptratio -1.43795 0.17754 -8.099 7.79e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.003896856)
##
## Null deviance: 3.3766 on 380 degrees of freedom
## Residual deviance: 1.3616 on 376 degrees of freedom
## AIC: 2416.3
##
## Number of Fisher Scoring iterations: 25
modeloNII$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 6.161884 0.6243431 4.359462 1.145676 0.1150634 0.6637477
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data = modeloNII$resample, aes(x = RMSE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNII$resample$RMSE),
linetype = "dashed") +
theme_bw()
p2 <- ggplot(data = modeloNII$resample, aes(x = 1, y = RMSE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNII$resample, aes(x = Rsquared)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNII$resample$Rsquared),
linetype = "dashed") +
theme_bw()
p4 <- ggplot(data = modeloNII$resample, aes(x = 1, y = Rsquared)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNII$resample, aes(x = MAE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNII$resample$MAE),
linetype = "dashed") +
theme_bw()
p6 <- ggplot(data = modeloNII$resample, aes(x = 1, y = MAE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
final_plot <- annotate_figure(
final_plot,
top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace identidad)", size = 15))
final_plot
## Inverse Gaussian model (log link)
# ______________________
set.seed(835)
modeloNIL <- train(formul, data=datos_train, method ="glm",
family=inverse.gaussian(link=log), trControl=trc)
modeloNIL
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.904326 0.7517099 3.543702
summary(modeloNIL)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.275035 -0.027426 -0.003875 0.022128 0.283251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.067471 0.226195 17.982 < 2e-16 ***
## rm 0.055182 0.023489 2.349 0.0193 *
## dis -0.002636 0.007865 -0.335 0.7377
## lstat -0.035848 0.002421 -14.807 < 2e-16 ***
## ptratio -0.048989 0.006932 -7.068 7.71e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.003244903)
##
## Null deviance: 3.3766 on 380 degrees of freedom
## Residual deviance: 1.1460 on 376 degrees of freedom
## AIC: 2350.6
##
## Number of Fisher Scoring iterations: 9
modeloNIL$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 4.904326 0.7517099 3.543702 1.049479 0.1017423 0.57823
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data = modeloNIL$resample, aes(x = RMSE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIL$resample$RMSE),
linetype = "dashed") +
theme_bw()
p2 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = RMSE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNIL$resample, aes(x = Rsquared)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIL$resample$Rsquared),
linetype = "dashed") +
theme_bw()
p4 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = Rsquared)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNIL$resample, aes(x = MAE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIL$resample$MAE),
linetype = "dashed") +
theme_bw()
p6 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = MAE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
final_plot,
top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace logar?tmico", size = 15))
final_plot
## Inverse Gaussian model (1/mu^2 link)
# ______________________
set.seed(835)
modeloNIRC <- train(formul, data=datos_train, method="glm",
family=inverse.gaussian(link=1/mu^2), trControl=trc)
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold02.Rep1: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold02.Rep2: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold01.Rep3: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold06.Rep4: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold10.Rep4: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold01.Rep5: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
modeloNIRC
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 5.959871 0.7224323 3.687739
summary(modeloNIRC)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.290199 -0.025180 -0.000327 0.028298 0.189072
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.057e-04 6.873e-04 -0.299 0.76483
## rm -1.448e-04 6.599e-05 -2.195 0.02876 *
## dis 2.146e-05 1.881e-05 1.141 0.25466
## lstat 1.980e-04 1.168e-05 16.951 < 2e-16 ***
## ptratio 6.824e-05 2.125e-05 3.210 0.00144 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.002785377)
##
## Null deviance: 3.3766 on 380 degrees of freedom
## Residual deviance: 1.2191 on 376 degrees of freedom
## AIC: 2374.2
##
## Number of Fisher Scoring iterations: 5
modeloNIRC$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 5.959871 0.7224323 3.687739 3.301849 0.1170106 0.8979335
# Accuracy evaluation plots (distribution of RMSE, R2 and MAE across the resamples)
p1 <- ggplot(data = modeloNIRC$resample, aes(x = RMSE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIRC$resample$RMSE),
linetype = "dashed") +
theme_bw()
p2 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = RMSE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNIRC$resample, aes(x = Rsquared)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIRC$resample$Rsquared),
linetype = "dashed") +
theme_bw()
p4 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = Rsquared)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNIRC$resample, aes(x = MAE)) +
geom_density(alpha = 0.5, fill = "gray50") +
geom_vline(xintercept = mean(modeloNIRC$resample$MAE),
linetype = "dashed") +
theme_bw()
p6 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = MAE)) +
geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
geom_jitter(width = 0.05) +
labs(x = "") +
theme_bw() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
final_plot <- annotate_figure(
final_plot,
top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace rec?proco cuadr?tico)", size = 15))
final_plot
# Comparison of the generalized linear models
modelos <- list(Normal=modeloLN, Gamma_Ident=modeloGI, Gamma_Log=modeloGL,
Gamma_Recip=modeloGR, NI_Ident=modeloNII, NI_Log=modeloNIL)
resultados_resamples <- resamples(modelos)
## Warning in resamples.default(modelos): 'Normal' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Ident' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Log' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Recip' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'NI_Ident' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'NI_Log' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
resultados_resamples
##
## Call:
## resamples.default(x = modelos)
##
## Models: Normal, Gamma_Ident, Gamma_Log, Gamma_Recip, NI_Ident, NI_Log
## Number of resamples: 50
## Performance metrics: MAE, RMSE, Rsquared
## Time estimates for: everything, final model fit
# Extract the relevant information
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
resultados_resamples$values %>% head(10)
## Resample Normal~MAE Normal~RMSE Normal~Rsquared Gamma_Ident~MAE
## 1 Fold01.Rep1 3.039035 4.408379 0.7014579 3.289608
## 2 Fold01.Rep2 3.330426 4.837342 0.6519809 3.631980
## 3 Fold01.Rep3 4.610090 7.128709 0.5350221 4.965420
## 4 Fold01.Rep4 5.341016 7.667727 0.4894943 5.433950
## 5 Fold01.Rep5 4.434889 6.034121 0.6384404 4.493912
## 6 Fold02.Rep1 3.272949 4.338228 0.8792529 4.270443
## 7 Fold02.Rep2 4.156856 5.686388 0.7354271 4.612160
## 8 Fold02.Rep3 2.879945 3.664787 0.8679226 3.478335
## 9 Fold02.Rep4 3.950162 5.306014 0.7834401 4.848444
## 10 Fold02.Rep5 4.369320 6.536502 0.6616645 5.153543
## Gamma_Ident~RMSE Gamma_Ident~Rsquared Gamma_Log~MAE Gamma_Log~RMSE
## 1 4.265582 0.7367741 2.640865 3.502803
## 2 4.344743 0.7075936 3.097494 4.197341
## 3 7.361917 0.5369314 4.000095 6.201068
## 4 7.775865 0.4857927 4.622161 6.750375
## 5 5.889745 0.6121647 4.010354 5.165170
## 6 6.124717 0.7850588 3.083113 4.118837
## 7 6.358474 0.7563881 3.873617 5.036190
## 8 4.814540 0.8368464 2.746054 3.330783
## 9 6.736532 0.6836059 3.705072 4.958346
## 10 7.850065 0.5262551 4.086823 6.472542
## Gamma_Log~Rsquared Gamma_Recip~MAE Gamma_Recip~RMSE Gamma_Recip~Rsquared
## 1 0.8064999 2.674823 3.481337 0.8090731
## 2 0.7130687 3.100285 4.489327 0.6427565
## 3 0.6507581 3.841487 5.770913 0.7082294
## 4 0.6069493 4.202224 6.208661 0.6744136
## 5 0.7176640 3.668564 4.694038 0.7727969
## 6 0.8999631 2.542937 3.421167 0.9118689
## 7 0.8019713 4.129872 5.071027 0.7952259
## 8 0.9032360 2.505770 3.137666 0.9016495
## 9 0.8236755 3.050768 4.209782 0.8679898
## 10 0.6774835 3.482047 5.959900 0.7197131
## NI_Ident~MAE NI_Ident~RMSE NI_Ident~Rsquared NI_Log~MAE NI_Log~RMSE
## 1 3.712745 4.602049 0.6910285 2.888792 3.634943
## 2 3.774470 4.511749 0.6758746 3.196078 3.999812
## 3 5.175212 7.609621 0.5089909 4.303900 6.361862
## 4 5.470273 7.985444 0.4602988 4.756618 6.973484
## 5 4.573802 6.140305 0.5819923 4.234171 5.486635
## 6 4.692465 6.787084 0.6965167 3.544448 4.924396
## 7 4.854303 6.762226 0.7279016 4.109549 5.401008
## 8 3.795149 5.352015 0.7834315 3.071313 3.939181
## 9 5.402708 7.522895 0.5901958 4.334811 5.806071
## 10 5.642726 8.445196 0.4339390 4.595134 7.129818
## NI_Log~Rsquared
## 1 0.7982435
## 2 0.7472074
## 3 0.6410030
## 4 0.5778341
## 5 0.6805413
## 6 0.8482303
## 7 0.7917370
## 8 0.8713493
## 9 0.7642306
## 10 0.5999077
# Reshape the data frame returned by resamples() so that the model name and
# the metric go into separate columns.
metricas_resamples <- resultados_resamples$values %>%
gather(key="modelo", value="valor", -Resample) %>%
separate(col="modelo", into=c("modelo", "metrica"), sep="~", remove=TRUE)
metricas_resamples %>% head()
## Resample modelo metrica valor
## 1 Fold01.Rep1 Normal MAE 3.039035
## 2 Fold01.Rep2 Normal MAE 3.330426
## 3 Fold01.Rep3 Normal MAE 4.610090
## 4 Fold01.Rep4 Normal MAE 5.341016
## 5 Fold01.Rep5 Normal MAE 4.434889
## 6 Fold02.Rep1 Normal MAE 3.272949
# Comparison of the mean RMSE, MAE and R2 across models
metricas_resamples %>% group_by(modelo, metrica) %>%
  summarise(media = mean(valor)) %>% spread(key = metrica, value = media) %>%
  arrange(RMSE)
## `summarise()` has grouped output by 'modelo'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups: modelo [6]
## modelo MAE RMSE Rsquared
## <chr> <dbl> <dbl> <dbl>
## 1 Gamma_Recip 3.07 4.20 0.806
## 2 Gamma_Log 3.22 4.46 0.788
## 3 NI_Log 3.54 4.90 0.752
## 4 Normal 3.60 5.00 0.733
## 5 Gamma_Ident 4.06 5.71 0.686
## 6 NI_Ident NA NA NA
#arrange(desc(Rsquared))
# Plot comparing the models by mean RMSE
metricas_resamples %>%
filter(metrica == "RMSE") %>%
group_by(modelo) %>%
summarise(media = mean(valor)) %>%
ggplot(aes(x = reorder(modelo, media), y = media, label = round(media, 2))) +
geom_segment(aes(x = reorder(modelo, media), y = 0.5,
xend = modelo, yend = media),
color = "grey50") +
geom_point(size = 15, color = "firebrick") +
geom_text(color = "white", size = 5) +
scale_y_continuous(limits = c(2, 10)) +
  # Reference RMSE (0.73 lies below the y-axis limits, so the dashed line and
  # label are dropped with a warning)
  geom_hline(yintercept = 0.73, linetype = "dashed") +
  annotate(geom = "text", y = 0.72, x = 7.5, label = "Reference RMSE") +
  labs(title = "Validation: mean RMSE, repeated CV",
       subtitle = "Models ordered by mean RMSE",
       x = "model") +
coord_flip() +
theme_bw()
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).
# Comparison plot that also shows the variability
metricas_resamples %>% filter(metrica == "RMSE") %>%
group_by(modelo) %>%
mutate(media = mean(valor)) %>%
ungroup() %>%
ggplot(aes(x = reorder(modelo, media), y = valor, color = modelo)) +
geom_boxplot(alpha = 0.6, outlier.shape = NA) +
geom_jitter(width = 0.1, alpha = 0.6) +
scale_y_continuous(limits = c(2, 8)) +
  # Reference RMSE (again outside the y-axis limits, hence the warnings)
  geom_hline(yintercept = 0.72, linetype = "dashed") +
  annotate(geom = "text", y = 0.72, x = 7.5, label = "Reference RMSE") +
  theme_bw() +
  labs(title = "Validation: RMSE per resample, repeated CV",
       subtitle = "Models ordered by mean RMSE") +
coord_flip() +
theme(legend.position = "none")
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).
# Assessing the differences
# -------------------
# Friedman test
# H0: all the medians are equal
# H1: at least one of the (RMSE) medians differs from the rest
matriz_metricas <- metricas_resamples %>% filter(metrica == "RMSE") %>%
spread(key = modelo, value = valor) %>%
dplyr::select(-Resample, -metrica) %>% as.matrix()
# alfa = 0.05
friedman.test(y=matriz_metricas) # The null hypothesis is rejected: at least one median differs
##
## Friedman rank sum test
##
## data: matriz_metricas
## Friedman chi-squared = 193.69, df = 5, p-value < 2.2e-16
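# The Friedman test only says that some model differs; a paired post-hoc
# comparison (a sketch using Bonferroni-adjusted pairwise Wilcoxon signed-rank
# tests on the same resamples, pairing rows by fold) indicates which pairs
# drive the rejection:
rmse_largo <- metricas_resamples %>% filter(metrica == "RMSE")
pairwise.wilcox.test(rmse_largo$valor, rmse_largo$modelo,
                     paired = TRUE, p.adjust.method = "bonferroni")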
# Test error
# -------------
predicciones <- extractPrediction(models = modelos,
                                  testX = datos_test[, -14], # predictors only (medv is column 14)
                                  testY = datos_test$medv)
predicciones %>% head()
## obs pred model dataType object
## 1 24.0 31.90136 glm Training Normal
## 2 21.6 25.54408 glm Training Normal
## 3 34.7 32.44926 glm Training Normal
## 4 36.2 30.05964 glm Training Normal
## 5 28.7 26.79588 glm Training Normal
## 6 22.9 23.91666 glm Training Normal
metricas_predicciones <- predicciones %>% mutate(acierto = (obs-pred)^2) %>%
group_by(object, dataType) %>% summarise(RMSE = sqrt(mean(acierto)))
## `summarise()` has grouped output by 'object'. You can override using the
## `.groups` argument.
# RMSE on the training and test sets
metricas_predicciones %>% spread(key = dataType, value = RMSE) %>%
arrange(desc(Test))
## # A tibble: 6 × 3
## # Groups: object [6]
## object Test Training
## <chr> <dbl> <dbl>
## 1 NI_Ident 5.83 6.21
## 2 Gamma_Ident 5.55 5.73
## 3 Normal 5.54 5.00
## 4 NI_Log 5.23 4.94
## 5 Gamma_Log 4.97 4.49
## 6 Gamma_Recip 4.69 4.22
ggplot(data = metricas_predicciones, aes(x = reorder(object, RMSE), y = RMSE,
color = dataType, label = round(RMSE, 3))) +
geom_point(size=14) + scale_color_manual(values = c("orangered2", "gray50")) +
geom_text(color="white", size=3) + scale_y_continuous(limits=c(2, 10)) +
  # Reference RMSE (outside the y-axis limits, hence the warnings)
  geom_hline(yintercept = 0.73, linetype = "dashed") +
  annotate(geom = "text", y = 0.73, x = 8.5, label = "Reference RMSE") +
  coord_flip() + labs(title="Training and test RMSE", x="Model") +
theme_bw() + theme(legend.position = "bottom")
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).
# Final model: the Gamma GLM with inverse link (lowest CV and test RMSE)
modeloGR
## Generalized Linear Model
##
## 381 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.196792 0.8060329 3.070947
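# Predictions from the selected model come straight from predict(); a sketch
# on the held-out set (the exact RMSE depends on the partition seed):
pred_final <- predict(modeloGR, newdata=datos_test)
sqrt(mean((datos_test$medv - pred_final)^2)) # about 4.69, as in the test-error table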
### KNN
# =================================================================
# K-NN (k-nearest neighbors) model
# =================================================================
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
# install.packages('kknn')
library(kknn)
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
# ===============================================================================
# Hyperparameter selection ("models")
# ===============================================================================
# Finding the best value of k (using caret and repeated 10-fold CV)
particiones <- 10 # number of folds
repeticiones <- 5
# Metric used to pick the optimal hyperparameters
metrica <- "RMSE"
# Hyperparameter grid
hiperparametros <- expand.grid(kmax = seq(from=1, to=50, by=2),
                               distance = 2,
                               kernel = c("optimal", "epanechnikov","gaussian")
                               )
hiperparametros
## kmax distance kernel
## 1 1 2 optimal
## 2 3 2 optimal
## 3 5 2 optimal
## 4 7 2 optimal
## 5 9 2 optimal
## 6 11 2 optimal
## 7 13 2 optimal
## 8 15 2 optimal
## 9 17 2 optimal
## 10 19 2 optimal
## 11 21 2 optimal
## 12 23 2 optimal
## 13 25 2 optimal
## 14 27 2 optimal
## 15 29 2 optimal
## 16 31 2 optimal
## 17 33 2 optimal
## 18 35 2 optimal
## 19 37 2 optimal
## 20 39 2 optimal
## 21 41 2 optimal
## 22 43 2 optimal
## 23 45 2 optimal
## 24 47 2 optimal
## 25 49 2 optimal
## 26 1 2 epanechnikov
## 27 3 2 epanechnikov
## 28 5 2 epanechnikov
## 29 7 2 epanechnikov
## 30 9 2 epanechnikov
## 31 11 2 epanechnikov
## 32 13 2 epanechnikov
## 33 15 2 epanechnikov
## 34 17 2 epanechnikov
## 35 19 2 epanechnikov
## 36 21 2 epanechnikov
## 37 23 2 epanechnikov
## 38 25 2 epanechnikov
## 39 27 2 epanechnikov
## 40 29 2 epanechnikov
## 41 31 2 epanechnikov
## 42 33 2 epanechnikov
## 43 35 2 epanechnikov
## 44 37 2 epanechnikov
## 45 39 2 epanechnikov
## 46 41 2 epanechnikov
## 47 43 2 epanechnikov
## 48 45 2 epanechnikov
## 49 47 2 epanechnikov
## 50 49 2 epanechnikov
## 51 1 2 gaussian
## 52 3 2 gaussian
## 53 5 2 gaussian
## 54 7 2 gaussian
## 55 9 2 gaussian
## 56 11 2 gaussian
## 57 13 2 gaussian
## 58 15 2 gaussian
## 59 17 2 gaussian
## 60 19 2 gaussian
## 61 21 2 gaussian
## 62 23 2 gaussian
## 63 25 2 gaussian
## 64 27 2 gaussian
## 65 29 2 gaussian
## 66 31 2 gaussian
## 67 33 2 gaussian
## 68 35 2 gaussian
## 69 37 2 gaussian
## 70 39 2 gaussian
## 71 41 2 gaussian
## 72 43 2 gaussian
## 73 45 2 gaussian
## 74 47 2 gaussian
## 75 49 2 gaussian
library(caret)
control_train <- trainControl(method = "repeatedcv", number = particiones,
repeats = repeticiones, returnResamp = "final",
verboseIter = FALSE)
set.seed(555)
# Note: the tuning below uses the full data set; the final kknn fit further on
# uses the datos_train/datos_test split
modeloKNN <- train(formul, data = data, method = "kknn", tuneGrid = hiperparametros,
                   metric = metrica, trControl = control_train)
modeloKNN
## k-Nearest Neighbors
##
## 506 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 455, 455, 456, 456, 456, 455, ...
## Resampling results across tuning parameters:
##
## kmax kernel RMSE Rsquared MAE
## 1 optimal 4.753915 0.7438044 3.141004
## 1 epanechnikov 4.753915 0.7438044 3.141004
## 1 gaussian 4.753915 0.7438044 3.141004
## 3 optimal 4.133026 0.7961658 2.752106
## 3 epanechnikov 4.054021 0.8026834 2.737437
## 3 gaussian 4.017134 0.8063702 2.719312
## 5 optimal 3.955092 0.8120900 2.633554
## 5 epanechnikov 3.972119 0.8101330 2.643992
## 5 gaussian 3.989421 0.8084163 2.649733
## 7 optimal 3.953769 0.8122371 2.619902
## 7 epanechnikov 3.934419 0.8138345 2.621045
## 7 gaussian 4.002049 0.8071211 2.653968
## 9 optimal 3.967592 0.8108495 2.624955
## 9 epanechnikov 3.934739 0.8137436 2.616566
## 9 gaussian 4.002049 0.8071211 2.653968
## 11 optimal 3.967592 0.8108495 2.624955
## 11 epanechnikov 3.935197 0.8136466 2.616783
## 11 gaussian 4.002049 0.8071211 2.653968
## 13 optimal 3.967592 0.8108495 2.624955
## 13 epanechnikov 3.935197 0.8136466 2.616783
## 13 gaussian 4.006633 0.8068273 2.654531
## 15 optimal 3.967592 0.8108495 2.624955
## 15 epanechnikov 3.935197 0.8136466 2.616783
## 15 gaussian 4.006633 0.8068273 2.654531
## 17 optimal 3.967592 0.8108495 2.624955
## 17 epanechnikov 3.935197 0.8136466 2.616783
## 17 gaussian 4.006633 0.8068273 2.654531
## 19 optimal 3.967592 0.8108495 2.624955
## 19 epanechnikov 3.935197 0.8136466 2.616783
## 19 gaussian 4.006633 0.8068273 2.654531
## 21 optimal 3.967592 0.8108495 2.624955
## 21 epanechnikov 3.935197 0.8136466 2.616783
## 21 gaussian 4.006633 0.8068273 2.654531
## 23 optimal 3.967592 0.8108495 2.624955
## 23 epanechnikov 3.935197 0.8136466 2.616783
## 23 gaussian 4.006633 0.8068273 2.654531
## 25 optimal 3.967592 0.8108495 2.624955
## 25 epanechnikov 3.935197 0.8136466 2.616783
## 25 gaussian 4.006633 0.8068273 2.654531
## 27 optimal 3.967592 0.8108495 2.624955
## 27 epanechnikov 3.935197 0.8136466 2.616783
## 27 gaussian 4.006633 0.8068273 2.654531
## 29 optimal 3.967592 0.8108495 2.624955
## 29 epanechnikov 3.935197 0.8136466 2.616783
## 29 gaussian 4.006633 0.8068273 2.654531
## 31 optimal 3.967592 0.8108495 2.624955
## 31 epanechnikov 3.935197 0.8136466 2.616783
## 31 gaussian 4.006633 0.8068273 2.654531
## 33 optimal 3.967592 0.8108495 2.624955
## 33 epanechnikov 3.935197 0.8136466 2.616783
## 33 gaussian 4.006633 0.8068273 2.654531
## 35 optimal 3.967592 0.8108495 2.624955
## 35 epanechnikov 3.935197 0.8136466 2.616783
## 35 gaussian 4.006633 0.8068273 2.654531
## 37 optimal 3.967592 0.8108495 2.624955
## 37 epanechnikov 3.935197 0.8136466 2.616783
## 37 gaussian 4.006633 0.8068273 2.654531
## 39 optimal 3.967592 0.8108495 2.624955
## 39 epanechnikov 3.935197 0.8136466 2.616783
## 39 gaussian 4.006633 0.8068273 2.654531
## 41 optimal 3.967592 0.8108495 2.624955
## 41 epanechnikov 3.935197 0.8136466 2.616783
## 41 gaussian 4.006633 0.8068273 2.654531
## 43 optimal 3.967592 0.8108495 2.624955
## 43 epanechnikov 3.935197 0.8136466 2.616783
## 43 gaussian 4.006633 0.8068273 2.654531
## 45 optimal 3.967592 0.8108495 2.624955
## 45 epanechnikov 3.935197 0.8136466 2.616783
## 45 gaussian 4.006633 0.8068273 2.654531
## 47 optimal 3.967592 0.8108495 2.624955
## 47 epanechnikov 3.935197 0.8136466 2.616783
## 47 gaussian 4.006633 0.8068273 2.654531
## 49 optimal 3.967592 0.8108495 2.624955
## 49 epanechnikov 3.935197 0.8136466 2.616783
## 49 gaussian 4.006633 0.8068273 2.654531
##
## Tuning parameter 'distance' was held constant at a value of 2
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were kmax = 7, distance = 2 and kernel
## = epanechnikov.
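The fitted train object also stores the winning hyper-parameters and the per-resample metrics. A minimal sketch for inspecting them, assuming the modeloKNN object above (output omitted):
modeloKNN$bestTune                 # kmax = 7, distance = 2, kernel = "epanechnikov"
summary(modeloKNN$resample$RMSE)   # spread of RMSE across the 50 resamples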
ggplot(modeloKNN, highlight=TRUE) + scale_x_discrete(breaks=hiperparametros$kmax) +
  labs(title="RMSE of the KNN model across kmax", x="K") + theme_bw()
## KNN with the kknn package (k = 3 here, although the caret tuning above selected kmax = 7)
library(kknn)
modeloKNN <- kknn(formula=formul, train=datos_train, test=datos_test, kernel="epanechnikov", k=3)
modeloKNN
##
## Call:
## kknn(formula = formul, train = datos_train, test = datos_test, k = 3, kernel = "epanechnikov")
##
## Response: "continuous"
# Caution: predict() on a kknn object returns the *test-set* fitted values, so
# comparing them against datos_train$medv recycles vectors of different lengths
# (hence the warning) and the value below is not a valid train RMSE
Metrics::rmse(actual = datos_train$medv, predicted = predict(modeloKNN))
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## [1] 10.6983
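A valid train-set error needs a second fit whose predictions refer to the training rows. A minimal sketch under that assumption (modeloKNN_train is a new, illustrative object; output omitted):
modeloKNN_train <- kknn(formula = formul, train = datos_train, test = datos_train,
                        kernel = "epanechnikov", k = 3)
Metrics::rmse(actual = datos_train$medv, predicted = predict(modeloKNN_train))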
Metrics::rmse(actual = datos_test$medv, predicted = predict(modeloKNN))
## [1] 4.835782
## Regression tree with rpart
library(rpart)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
# Grow a deliberately large tree: cp = 0 disables the complexity penalty and
# minbucket = 3 requires at least 3 observations per terminal node
arbol_completo <- rpart(formul, data=datos_train, cp=0, minbucket=3)
arbol_completo
## n= 381
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 381 3.496634e+04 22.702100
## 2) rm< 6.92 318 1.221547e+04 19.647800
## 4) lstat>=14.785 121 2.468612e+03 14.459500
## 8) dis< 2.0643 65 8.228822e+02 11.847690
## 16) ptratio>=19.6 54 6.320637e+02 11.125930
## 32) lstat>=21.42 31 2.733671e+02 9.609677
## 64) lstat< 34.195 28 1.767886e+02 9.257143
## 128) lstat>=25.735 13 2.500308e+01 7.623077
## 256) dis< 1.5981 6 9.408333e+00 6.716667 *
## 257) dis>=1.5981 7 6.440000e+00 8.400000 *
## 129) lstat< 25.735 15 8.698933e+01 10.673330
## 258) lstat< 23.135 6 3.595500e+01 9.650000 *
## 259) lstat>=23.135 9 4.056222e+01 11.355560
## 518) dis>=1.68365 4 2.086000e+01 9.900000 *
## 519) dis< 1.68365 5 4.448000e+00 12.520000 *
## 65) lstat>=34.195 3 6.062000e+01 12.900000 *
## 33) lstat< 21.42 23 1.913687e+02 13.169570
## 66) dis< 1.7852 13 1.012477e+02 12.069230
## 132) lstat< 21.15 10 6.612900e+01 11.310000
## 264) lstat>=19.57 7 5.684857e+01 10.685710 *
## 265) lstat< 19.57 3 1.866667e-01 12.766670 *
## 133) lstat>=21.15 3 1.014000e+01 14.600000 *
## 67) dis>=1.7852 10 5.392000e+01 14.600000
## 134) dis>=1.97915 3 1.108667e+01 11.866670 *
## 135) dis< 1.97915 7 1.081429e+01 15.771430 *
## 17) ptratio< 19.6 11 2.458909e+01 15.390910
## 34) dis< 1.4887 3 7.466667e-01 13.933330 *
## 35) dis>=1.4887 8 1.507875e+01 15.937500 *
## 9) dis>=2.0643 56 6.876655e+02 17.491070
## 18) ptratio>=19.7 34 2.693497e+02 15.802940
## 36) lstat>=18.885 8 1.431875e+01 13.762500 *
## 37) lstat< 18.885 26 2.114754e+02 16.430770
## 74) dis>=2.16915 22 1.440345e+02 15.845450
## 148) dis< 2.44985 9 4.090222e+01 14.455560
## 296) lstat< 16.855 3 3.486667e+00 12.866670 *
## 297) lstat>=16.855 6 2.605500e+01 15.250000 *
## 149) dis>=2.44985 13 7.370923e+01 16.807690
## 298) lstat>=16.355 8 3.863875e+01 15.637500 *
## 299) lstat< 16.355 5 6.588000e+00 18.680000 *
## 75) dis< 2.16915 4 1.845000e+01 19.650000 *
## 19) ptratio< 19.7 22 1.716800e+02 20.100000
## 38) dis>=6.33335 3 9.206667e+00 17.466670 *
## 39) dis< 6.33335 19 1.383853e+02 20.515790
## 78) ptratio>=17.6 14 8.446357e+01 19.821430
## 156) dis>=4.86055 3 1.256000e+01 16.800000 *
## 157) dis< 4.86055 11 3.704727e+01 20.645450
## 314) lstat< 15.985 4 5.340000e+00 19.000000 *
## 315) lstat>=15.985 7 1.468857e+01 21.585710 *
## 79) ptratio< 17.6 5 2.827200e+01 22.460000 *
## 5) lstat< 14.785 197 4.489165e+03 22.834520
## 10) lstat>=7.81 137 2.120775e+03 21.268610
## 20) rm< 6.0985 71 4.260237e+02 19.928170
## 40) ptratio>=20.6 8 5.991500e+01 17.525000 *
## 41) ptratio< 20.6 63 3.140400e+02 20.233330
## 82) rm< 5.822 20 1.205255e+02 19.135000
## 164) lstat< 13.185 11 7.138545e+01 18.336360
## 328) lstat>=12.065 4 3.580000e+00 16.600000 *
## 329) lstat< 12.065 7 4.885429e+01 19.328570 *
## 165) lstat>=13.185 9 3.354889e+01 20.111110
## 330) rm>=5.6805 5 7.368000e+00 18.880000 *
## 331) rm< 5.6805 4 9.130000e+00 21.650000 *
## 83) rm>=5.822 43 1.581660e+02 20.744190
## 166) ptratio>=16.85 31 9.691097e+01 20.364520
## 332) dis>=6.56935 3 2.846667e+00 18.033330 *
## 333) dis< 6.56935 28 7.601429e+01 20.614290
## 666) ptratio>=19.85 8 4.438750e+00 19.862500 *
## 667) ptratio< 19.85 20 6.524550e+01 20.915000
## 1334) dis< 3.6126 9 3.666222e+01 20.155560
## 2668) dis>=2.43995 6 7.680000e+00 19.100000 *
## 2669) dis< 2.43995 3 8.926667e+00 22.266670 *
## 1335) dis>=3.6126 11 1.914545e+01 21.536360
## 2670) rm< 5.923 6 4.073333e+00 20.766670 *
## 2671) rm>=5.923 5 7.252000e+00 22.460000 *
## 167) ptratio< 16.85 12 4.524250e+01 21.725000
## 334) lstat>=12.94 3 1.768667e+01 20.133330 *
## 335) lstat< 12.94 9 1.742222e+01 22.255560
## 670) ptratio< 16.5 6 1.411500e+01 21.850000 *
## 671) ptratio>=16.5 3 3.466667e-01 23.066670 *
## 21) rm>=6.0985 66 1.429943e+03 22.710610
## 42) lstat>=10.1 37 1.225611e+02 20.867570
## 84) rm>=6.4035 8 3.914000e+01 19.300000 *
## 85) rm< 6.4035 29 5.834000e+01 21.300000
## 170) rm< 6.318 21 2.700952e+01 20.838100
## 340) rm>=6.25 5 3.788000e+00 19.820000 *
## 341) rm< 6.25 16 1.641937e+01 21.156250
## 682) lstat>=12.7 9 6.328889e+00 20.711110
## 1364) dis>=3.03135 3 1.006667e+00 20.233330 *
## 1365) dis< 3.03135 6 4.295000e+00 20.950000 *
## 683) lstat< 12.7 7 6.014286e+00 21.728570 *
## 171) rm>=6.318 8 1.508875e+01 22.512500 *
## 43) lstat< 10.1 29 1.021348e+03 25.062070
## 86) lstat< 9.44 19 1.731253e+02 23.084210
## 172) rm< 6.5915 16 7.506937e+01 22.106250
## 344) dis>=5.68815 5 3.310800e+01 20.620000 *
## 345) dis< 5.68815 11 2.589636e+01 22.781820
## 690) ptratio>=17.2 8 7.028750e+00 22.087500 *
## 691) ptratio< 17.2 3 4.726667e+00 24.633330 *
## 173) rm>=6.5915 3 1.140000e+00 28.300000 *
## 87) lstat>=9.44 10 6.326760e+02 28.820000
## 174) rm>=6.3025 6 2.749333e+01 25.333330 *
## 175) rm< 6.3025 4 4.228300e+02 34.050000 *
## 11) lstat< 7.81 60 1.265414e+03 26.410000
## 22) lstat>=4.52 52 4.415698e+02 25.251920
## 44) rm< 6.7225 45 2.156711e+02 24.555560
## 88) dis>=6.12605 15 3.372400e+01 23.180000
## 176) rm< 6.364 7 7.820000e+00 22.100000 *
## 177) rm>=6.364 8 1.059500e+01 24.125000 *
## 89) dis< 6.12605 30 1.393737e+02 25.243330
## 178) rm< 6.4225 18 3.402000e+01 24.433330
## 356) dis>=2.1583 15 1.599333e+01 24.066670
## 712) rm< 6.168 3 3.800000e-01 22.600000 *
## 713) rm>=6.168 12 7.546667e+00 24.433330
## 1426) ptratio>=19.1 5 4.120000e+00 23.900000 *
## 1427) ptratio< 19.1 7 9.885714e-01 24.814290 *
## 357) dis< 2.1583 3 5.926667e+00 26.266670 *
## 179) rm>=6.4225 12 7.582917e+01 26.458330
## 358) lstat< 5.145 4 1.428750e+01 24.825000 *
## 359) lstat>=5.145 8 4.553500e+01 27.275000 *
## 45) rm>=6.7225 7 6.379429e+01 29.728570 *
## 23) lstat< 4.52 8 3.007987e+02 33.937500 *
## 3) rm>=6.92 63 4.810337e+03 38.119050
## 6) rm< 7.435 33 9.230406e+02 31.775760
## 12) ptratio>=19.65 5 1.396400e+02 22.300000 *
## 13) ptratio< 19.65 28 2.542811e+02 33.467860
## 26) lstat>=9.65 3 1.152667e+01 29.466670 *
## 27) lstat< 9.65 25 1.889624e+02 33.948000
## 54) dis>=3.02235 21 1.126524e+02 33.352380
## 108) dis< 4.35735 5 1.065200e+01 31.340000 *
## 109) dis>=4.35735 16 7.542437e+01 33.981250
## 218) dis>=7.7406 4 2.707000e+01 31.750000 *
## 219) dis< 7.7406 12 2.180250e+01 34.725000
## 438) rm>=7.213 3 2.066667e-01 33.066670 *
## 439) rm< 7.213 9 1.059556e+01 35.277780
## 878) dis< 5.77695 5 3.080000e+00 34.600000 *
## 879) dis>=5.77695 4 2.347500e+00 36.125000 *
## 55) dis< 3.02235 4 2.974750e+01 37.075000 *
## 7) rm>=7.435 30 1.098850e+03 45.096670
## 14) ptratio>=18.3 3 2.238200e+02 33.300000 *
## 15) ptratio< 18.3 27 4.111585e+02 46.407410
## 30) ptratio>=14.8 13 1.946277e+02 44.369230
## 60) lstat< 3.665 5 4.076800e+01 42.620000 *
## 61) lstat>=3.665 8 1.289988e+02 45.462500 *
## 31) ptratio< 14.8 14 1.123800e+02 48.300000
## 62) rm< 7.706 4 3.784750e+01 44.725000 *
## 63) rm>=7.706 10 2.961000e+00 49.730000
## 126) lstat>=3.755 4 1.867500e+00 49.325000 *
## 127) lstat< 3.755 6 0.000000e+00 50.000000 *
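rpart records the whole complexity sequence of the grown tree in its cptable; a quick way to inspect it, assuming the arbol_completo object above (output omitted):
# Each row is one candidate subtree: its cp threshold, size (nsplit),
# resubstitution error and cross-validated error (xerror, with SE xstd)
head(arbol_completo$cptable)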
# Prune at the cp value that minimises the cross-validated error (xerror)
xerr <- arbol_completo$cptable[,"xerror"]
minxerr <- which.min(xerr)
mincp <- arbol_completo$cptable[minxerr, "CP"] # stopping criterion: cp = 0.001538394
arbol_prune <- prune(arbol_completo, cp=mincp)
# Plot the pruned tree with partykit (terminal-node ids suppressed)
plot(as.party(arbol_prune), tp_args = list(id=FALSE))
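An alternative to pruning at the minimum is the common 1-SE rule, which keeps the simplest subtree whose xerror lies within one standard error of the minimum. A sketch under that assumption (umbral_1se, cp_1se and arbol_prune_1se are new, illustrative names; output omitted):
plotcp(arbol_completo)   # visual check of xerror against cp
umbral_1se <- xerr[minxerr] + arbol_completo$cptable[minxerr, "xstd"]
cp_1se <- arbol_completo$cptable[which(xerr <= umbral_1se)[1], "CP"]
arbol_prune_1se <- prune(arbol_completo, cp = cp_1se)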
Metrics::rmse(actual = datos_train$medv, predicted = predict(arbol_prune))
## [1] 3.077051
# Caution: predict() without newdata returns the training-set predictions, so
# the comparison below recycles vectors of different lengths (hence the
# warning) and the value is not a valid test RMSE
Metrics::rmse(actual = datos_test$medv, predicted = predict(arbol_prune))
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## [1] 11.21912
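A valid held-out RMSE only requires passing the test split explicitly. A minimal sketch, assuming the objects above (output omitted):
Metrics::rmse(actual = datos_test$medv,
              predicted = predict(arbol_prune, newdata = datos_test))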