#-------------------------------------------------------------------------
# REGRESSION ANALYSIS
#-------------------------------------------------------------------------

For this project we work with the Boston dataset from the MASS package, which records the median home value in 506 residential areas of Boston. Along with the price, 13 additional variables were recorded.

This data frame contains the following columns:
crim: per-capita crime rate by town.
zn: proportion of residential land zoned for lots over 25,000 sq. ft.
indus: proportion of non-retail business acres per town.
chas: Charles River dummy variable (= 1 if the tract bounds the river; 0 otherwise).
nox: nitrogen oxide concentration (parts per 10 million).
rm: average number of rooms per dwelling.
age: proportion of owner-occupied units built prior to 1940.
dis: weighted mean of the distances to five Boston employment centers.
rad: index of accessibility to radial highways.
tax: full-value property tax rate per $10,000.
ptratio: pupil-teacher ratio by town.
black: 1000(Bk - 0.63)^2, where Bk is the proportion of Black residents by town.
lstat: lower status of the population (percentage).
medv: median value of owner-occupied homes (in $1000s).

We begin with a basic numeric and graphical exploration of the data.

library(MASS)


data <- Boston
head(data)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# chas is a categorical variable, so it is converted to a factor
data$chas <- as.factor(Boston$chas)
summary(data)
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio          black            lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
# Distribution of the median home value (apparently non-normal; positively skewed)
hist(data$medv, probability = TRUE)
lines(density(data$medv), col=2, lwd=2)
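
As a quick numeric check of the positive skew suggested by the histogram, the sample skewness can be computed. This is an addition to the original script and assumes the e1071 package is available:

# Sample skewness of medv; a clearly positive value confirms the right skew
library(e1071)
e1071::skewness(data$medv)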

# Relationship of the median home value with the candidate predictors
par(mfrow=c(2,3))
# Scatterplots
plot(data$crim,data$medv) ## not linear

plot(data$zn,data$medv) ## not linear

plot(data$indus,data$medv) ## not linear

plot(data$chas,data$medv) ## not linear

plot(data$nox,data$medv) ## not linear

plot(data$rm,data$medv)  ## linear

plot(data$age,data$medv) ## not linear
plot(data$dis,data$medv) ## linear
plot(data$rad,data$medv) ## not linear
plot(data$tax,data$medv) ## not linear
plot(data$ptratio,data$medv) ## not linear
plot(data$black,data$medv) ## not linear

plot(data$lstat,data$medv)  ## linear
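
A scatterplot matrix condenses the same comparison into a single figure; the following line is a sketch added here, not part of the original script:

# Joint overview of medv and the predictors judged roughly linear above
pairs(medv ~ rm + dis + lstat + ptratio, data = data)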

#==========================================================================
# Estimation
#==========================================================================

# Regression model
form <- medv ~ rm+dis+lstat+ptratio
modelo <- lm(formula=form, data=data)
summary(modelo)
## 
## Call:
## lm(formula = form, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.4172  -3.0971  -0.6397   1.8727  27.1088 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.47136    4.07802   6.001 3.77e-09 ***
## rm           4.22379    0.42382   9.966  < 2e-16 ***
## dis         -0.55193    0.12695  -4.348 1.67e-05 ***
## lstat       -0.66544    0.04675 -14.233  < 2e-16 ***
## ptratio     -0.97365    0.11603  -8.391 4.94e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.139 on 501 degrees of freedom
## Multiple R-squared:  0.6903, Adjusted R-squared:  0.6878 
## F-statistic: 279.2 on 4 and 501 DF,  p-value: < 2.2e-16
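
To complement the point estimates, 95% confidence intervals for the coefficients can be extracted (an addition; this call is not part of the original analysis):

# 95% confidence intervals for the regression coefficients
confint(modelo, level = 0.95)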

#===============================================================================
# Checking the Model Assumptions
#===============================================================================

(1) Normality

# """"""""""""""""

# Q-Q plot
qqnorm(rstudent(modelo), 
       ylab="Studentized residual quantiles", 
       xlab="Theoretical quantiles")
qqline(rstudent(modelo))

# Normality test
shapiro.test(rstudent(modelo)) ## not normal
## 
##  Shapiro-Wilk normality test
## 
## data:  rstudent(modelo)
## W = 0.90565, p-value < 2.2e-16

(2) Homoscedasticity

# Plot of studentized residuals
rs <- rstudent(modelo)    # Studentized residuals
ypred <- predict(modelo)  # Fitted values (predicted mean)
# Cutoffs for detecting outliers:
# Quantiles of the t distribution (Bonferroni correction)
n <- dim(data)[1]
p <- length(modelo$coefficients)
alfa <- 0.01
t <- qt(alfa/(n*2), df=n-p-1)    # t distribution
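
With the cutoff defined, a quick count of flagged observations can be added (a sketch, not in the original script):

# Number of observations whose studentized residual exceeds the Bonferroni cutoff
sum(abs(rs) > abs(t))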

par(mfrow=c(3,2))
plot(ypred, rs, ylab="Studentized residuals", xlab="Fitted values",
     main="Studentized residuals vs fitted values", ylim=c(-5,5)) 
abline(-t, 0)
abline( t, 0)
abline( 0, 0)

plot(data$rm, rs, ylab="Studentized residuals",
     xlab="Average number of rooms per dwelling", 
     main="Studentized residuals vs average number of rooms", ylim=c(-5,5)) 
abline(-t, 0)
abline( t, 0)
abline( 0, 0)

plot(data$dis, rs, ylab="Studentized residuals", xlab="Weighted mean distance to five Boston employment centers",
     main="Studentized residuals vs weighted mean distance to employment centers", ylim=c(-5,5)) 
abline(-t, 0)
abline( t, 0)
abline( 0, 0)

plot(data$lstat,rs, ylab="Studentized residuals",
     xlab="Lower status of the population (percentage)",
     main="Studentized residuals vs lower status of the population (%)", ylim=c(-5,5)) 
abline(-t, 0)
abline( t, 0)
abline( 0, 0)

plot(data$ptratio,rs, ylab="Studentized residuals",
     xlab="Pupil-teacher ratio by town",
     main="Studentized residuals vs pupil-teacher ratio", ylim=c(-5,5)) 
abline(-t, 0)
abline( t, 0)
abline( 0, 0)

par(mfrow=c(1,1))
# Heteroscedasticity test (Breusch-Pagan test)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(modelo, varformula = ~fitted.values(modelo), studentize=FALSE)
## 
##  Breusch-Pagan test
## 
## data:  modelo
## BP = 0.80169, df = 1, p-value = 0.3706
## There is homoscedasticity (H0 not rejected)

# Chi-square test (similar to the previous one)
library(car)
## Loading required package: carData
ncvTest(modelo)   # Test for non-constant error variance
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.8016934, Df = 1, p = 0.37059

(3) MULTICOLLINEARITY

car::vif(modelo)
##       rm      dis    lstat  ptratio 
## 1.695903 1.366730 2.131782 1.206839
                          ## All VIF values are below 7, so there is no multicollinearity


(4) AUTOCORRELATION

# Plot of residuals over observation order
plot(residuals(modelo), ylab="Residuos")
abline(h=0)

# Durbin-Watson test  ## autocorrelation is present
library(lmtest)
dwtest(modelo)
## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 0.97414, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
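
A complementary way to visualize the autocorrelation detected by the test is the autocorrelation function of the residuals (an addition, not in the original script):

# ACF of the residuals; spikes outside the confidence bands
# corroborate the Durbin-Watson result
acf(residuals(modelo), main = "ACF of model residuals")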
cor(data[c("rm", "dis", "lstat", "ptratio")])
##                 rm        dis      lstat    ptratio
## rm       1.0000000  0.2052462 -0.6138083 -0.3555015
## dis      0.2052462  1.0000000 -0.4969958 -0.2324705
## lstat   -0.6138083 -0.4969958  1.0000000  0.3740443
## ptratio -0.3555015 -0.2324705  0.3740443  1.0000000

======================================================================

Assessing the Model's Predictive Quality

======================================================================

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(325)

# Data split: training/validation set (75%) and test set (25%)
idx_train <- createDataPartition(y=data$medv, p=0.75, list=FALSE, times=1)
datos_train <- data[idx_train,]
datos_test  <- data[-idx_train,]

mean(datos_train$medv)
## [1] 22.7021
mean(datos_test$medv)
## [1] 22.0168
mean(data$medv)
## [1] 22.53281

----------------------------------------------------------------------

Cross-Validation (K=10) with 5 repetitions

----------------------------------------------------------------------

formul <- medv ~ rm+dis+lstat+ptratio

# Parameters
k <- 10
repeticiones <- 5

trc <- trainControl(method="repeatedcv", number=k, repeats=repeticiones, 
                    returnResamp="all")
## Normal Linear Model
# ______________________

set.seed(835)
modeloLN <- train(formul, data=datos_train, method ="glm", trControl=trc)

modeloLN
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.000539  0.7327531  3.604369
summary(modeloLN)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -15.9203   -2.9926   -0.4902    2.0343   28.3828  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.8536     4.3658   5.006 8.57e-07 ***
## rm            4.6615     0.4467  10.436  < 2e-16 ***
## dis          -0.3998     0.1450  -2.758  0.00611 ** 
## lstat        -0.6544     0.0527 -12.416  < 2e-16 ***
## ptratio      -1.0266     0.1295  -7.926 2.60e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 25.32944)
## 
##     Null deviance: 34966.3  on 380  degrees of freedom
## Residual deviance:  9523.9  on 376  degrees of freedom
## AIC: 2319.6
## 
## Number of Fisher Scoring iterations: 2
modeloLN$results    # RMSE, R2, MAE (and their standard deviations)
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      none 5.000539 0.7327531 3.604369 1.116272  0.1184313 0.6178617
modeloLN$resample   # Values for each resample (50 = K*repetitions)
##        RMSE  Rsquared      MAE parameter    Resample
## 1  4.408379 0.7014579 3.039035      none Fold01.Rep1
## 2  4.338228 0.8792529 3.272949      none Fold02.Rep1
## 3  5.247855 0.6679846 3.746958      none Fold03.Rep1
## 4  4.683635 0.7527255 3.467382      none Fold04.Rep1
## 5  6.787827 0.4796984 4.184304      none Fold05.Rep1
## 6  4.801293 0.7719866 3.915903      none Fold06.Rep1
## 7  4.551654 0.8139078 3.354722      none Fold07.Rep1
## 8  5.204671 0.7222084 3.795361      none Fold08.Rep1
## 9  6.449686 0.6192732 4.164082      none Fold09.Rep1
## 10 3.924061 0.8644889 3.075365      none Fold10.Rep1
## 11 4.837342 0.6519809 3.330426      none Fold01.Rep2
## 12 5.686388 0.7354271 4.156856      none Fold02.Rep2
## 13 3.776516 0.8736785 2.783363      none Fold03.Rep2
## 14 3.389044 0.8238059 2.462767      none Fold04.Rep2
## 15 4.857572 0.7754441 3.710986      none Fold05.Rep2
## 16 8.452796 0.3866198 5.247763      none Fold06.Rep2
## 17 4.056696 0.8056301 2.981245      none Fold07.Rep2
## 18 4.928469 0.8530391 3.803843      none Fold08.Rep2
## 19 4.725286 0.7865132 3.703237      none Fold09.Rep2
## 20 4.818587 0.7043865 3.722729      none Fold10.Rep2
## 21 7.128709 0.5350221 4.610090      none Fold01.Rep3
## 22 3.664787 0.8679226 2.879945      none Fold02.Rep3
## 23 3.992174 0.8640661 2.903413      none Fold03.Rep3
## 24 5.499416 0.8546932 4.258526      none Fold04.Rep3
## 25 5.966345 0.6276309 4.105701      none Fold05.Rep3
## 26 4.397165 0.7125252 3.182818      none Fold06.Rep3
## 27 5.731852 0.5892663 3.527246      none Fold07.Rep3
## 28 4.861257 0.7591147 3.722412      none Fold08.Rep3
## 29 3.697400 0.8038619 3.104669      none Fold09.Rep3
## 30 5.253287 0.7091883 3.841354      none Fold10.Rep3
## 31 7.667727 0.4894943 5.341016      none Fold01.Rep4
## 32 5.306014 0.7834401 3.950162      none Fold02.Rep4
## 33 5.462774 0.6026594 3.769523      none Fold03.Rep4
## 34 3.915572 0.8010192 3.122524      none Fold04.Rep4
## 35 3.675710 0.8282373 2.910275      none Fold05.Rep4
## 36 5.020130 0.7289566 3.794208      none Fold06.Rep4
## 37 4.267967 0.8955359 3.168030      none Fold07.Rep4
## 38 4.919291 0.7373698 3.614677      none Fold08.Rep4
## 39 5.478496 0.6881504 3.209126      none Fold09.Rep4
## 40 4.341529 0.7983864 3.242765      none Fold10.Rep4
## 41 6.034121 0.6384404 4.434889      none Fold01.Rep5
## 42 6.536502 0.6616645 4.369320      none Fold02.Rep5
## 43 4.631785 0.7600204 3.665671      none Fold03.Rep5
## 44 3.301993 0.8591778 2.723161      none Fold04.Rep5
## 45 7.013296 0.4494633 4.407800      none Fold05.Rep5
## 46 3.538128 0.7715193 2.657300      none Fold06.Rep5
## 47 4.495501 0.7866458 3.293699      none Fold07.Rep5
## 48 4.635316 0.7885990 3.115548      none Fold08.Rep5
## 49 4.828698 0.7765287 3.539522      none Fold09.Rep5
## 50 4.838006 0.7995453 3.833798      none Fold10.Rep5
summary(modeloLN$resample$RMSE)  # Mean RMSE
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.302   4.339   4.833   5.001   5.475   8.453
sd(modeloLN$resample$RMSE)       # Standard deviation of the RMSE
## [1] 1.116272
se <- sd(modeloLN$resample$RMSE)/sqrt(k*repeticiones)
se
## [1] 0.1578646
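
From the mean and standard error above, a rough 95% interval for the cross-validated RMSE can be sketched (only indicative, since the resampled folds are not independent):

# Approximate 95% interval for the mean cross-validated RMSE
m <- mean(modeloLN$resample$RMSE)
m + c(-1, 1) * qt(0.975, df = k*repeticiones - 1) * se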
library(ggplot2)
library(ggpubr)
# Accuracy evaluation plots (distribution of RMSE, R2, and MAE across resamples)
p1 <- ggplot(data=modeloLN$resample, aes(x=RMSE)) + 
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloLN$resample$RMSE), linetype="dashed") +
      theme_bw() 
p2 <- ggplot(data=modeloLN$resample, aes(x=1, y=RMSE)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p3 <- ggplot(data=modeloLN$resample, aes(x=Rsquared)) + 
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloLN$resample$Rsquared), linetype="dashed") + 
      theme_bw() 
p4 <- ggplot(data=modeloLN$resample, aes(x=1, y=Rsquared)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p5 <- ggplot(data=modeloLN$resample, aes(x=MAE)) +
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloLN$resample$MAE), linetype="dashed") +
      theme_bw() 
p6 <- ggplot(data=modeloLN$resample, aes(x=1, y=MAE)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol=2, nrow=3)
final_plot <- annotate_figure(final_plot,
                              top=text_grob("Evaluación: Modelo Lineal Normal", size=15))
final_plot
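
The same six-panel figure is rebuilt for every model below. A small helper function (a sketch, not part of the original script; it assumes ggplot2 >= 3.0 for the `.data` pronoun) would avoid that repetition:

# Hypothetical helper wrapping the six diagnostic panels reused below;
# 'fit' is a caret train object and 'title' the figure heading.
plot_resample_eval <- function(fit, title) {
  res <- fit$resample
  dens <- function(var) {
    ggplot(res, aes(x = .data[[var]])) +
      geom_density(alpha = 0.5, fill = "gray50") +
      geom_vline(xintercept = mean(res[[var]], na.rm = TRUE), linetype = "dashed") +
      theme_bw()
  }
  box <- function(var) {
    ggplot(res, aes(x = 1, y = .data[[var]])) +
      geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
      geom_jitter(width = 0.05) + labs(x = "") + theme_bw() +
      theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  }
  fig <- ggarrange(dens("RMSE"), box("RMSE"), dens("Rsquared"), box("Rsquared"),
                   dens("MAE"), box("MAE"), ncol = 2, nrow = 3)
  annotate_figure(fig, top = text_grob(title, size = 15))
}
# Usage (equivalent to the figure above):
# plot_resample_eval(modeloLN, "Evaluation: Normal Linear Model")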

GAMMA MODEL WITH IDENTITY LINK

set.seed(835)
modeloGI <- train(formul, data=datos_train, method ="glm", 
                  family=Gamma(link=identity), trControl=trc)
## Warning: glm.fit: algorithm did not converge
# (this warning was emitted once per resampling fit; the ~50 verbatim
#  repetitions are omitted here)
modeloGI
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.711663  0.6856126  4.059872
summary(modeloGI)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.87149  -0.15264  -0.02804   0.09981   1.22306  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 44.01286    4.72327   9.318  < 2e-16 ***
## rm           1.38839    0.46662   2.975  0.00311 ** 
## dis         -0.04619    0.16919  -0.273  0.78498    
## lstat       -0.51827    0.04400 -11.778  < 2e-16 ***
## ptratio     -1.29108    0.15567  -8.294 1.98e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.06854626)
## 
##     Null deviance: 66.192  on 380  degrees of freedom
## Residual deviance: 23.210  on 376  degrees of freedom
## AIC: 2344.1
## 
## Number of Fisher Scoring iterations: 25
modeloGI$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      none 5.711663 0.6856126 4.059872 1.056502  0.1099726 0.6156645
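
The convergence warnings suggest the identity link is numerically fragile here. One possible remedy, shown only as an assumption and not tried in the original analysis, is to fit with glm() directly and supply starting values, e.g. the OLS coefficients:

# Sketch: refit with explicit starting values to help convergence
modeloGI_glm <- glm(formul, data = datos_train,
                    family = Gamma(link = identity),
                    start = coef(lm(formul, data = datos_train)))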
# Accuracy evaluation plots (distribution of RMSE, R2, and MAE across resamples)
p1 <- ggplot(data=modeloGI$resample, aes(x=RMSE)) +
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloGI$resample$RMSE), linetype="dashed") +
      theme_bw() 
p2 <- ggplot(data=modeloGI$resample, aes(x=1, y=RMSE)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
p3 <- ggplot(data=modeloGI$resample, aes(x=Rsquared)) + 
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloGI$resample$Rsquared), linetype="dashed") +
      theme_bw() 
p4 <- ggplot(data=modeloGI$resample, aes(x=1, y=Rsquared)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data=modeloGI$resample, aes(x=MAE)) +
      geom_density(alpha=0.5, fill="gray50") +
      geom_vline(xintercept=mean(modeloGI$resample$MAE), linetype="dashed") +
      theme_bw() 
p6 <- ggplot(data=modeloGI$resample, aes(x=1, y=MAE)) +
      geom_boxplot(outlier.shape=NA, alpha=0.5, fill="gray50") +
      geom_jitter(width=0.05) + labs(x="") + theme_bw() +
      theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol=2, nrow=3)
final_plot <- annotate_figure(final_plot,
                              top=text_grob("Evaluaci?n: Gamma (enlace identidad)", size = 15))
final_plot

GAMMA MODEL WITH LOG LINK

set.seed(835)
modeloGL <- train(formul, data=datos_train, method="glm", 
                  family=Gamma(link=log), trControl=trc)
modeloGL
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.459273  0.7879906  3.223239
summary(modeloGL)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.81941  -0.11750  -0.02004   0.09142   1.00288  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.644296   0.197485  18.454  < 2e-16 ***
## rm           0.106294   0.020204   5.261 2.41e-07 ***
## dis         -0.010850   0.006559  -1.654   0.0989 .  
## lstat       -0.036807   0.002384 -15.439  < 2e-16 ***
## ptratio     -0.041054   0.005859  -7.007 1.13e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.05182761)
## 
##     Null deviance: 66.192  on 380  degrees of freedom
## Residual deviance: 17.734  on 376  degrees of freedom
## AIC: 2240.6
## 
## Number of Fisher Scoring iterations: 6
modeloGL$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      none 4.459273 0.7879906 3.223239 1.050621  0.1033864 0.5591482
# Accuracy evaluation plots
p1 <- ggplot(data = modeloGL$resample, aes(x = RMSE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGL$resample$RMSE),
             linetype = "dashed") +
  theme_bw() 
p2 <- ggplot(data = modeloGL$resample, aes(x = 1, y = RMSE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloGL$resample, aes(x = Rsquared)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGL$resample$Rsquared),
             linetype = "dashed") +
  theme_bw() 
p4 <- ggplot(data = modeloGL$resample, aes(x = 1, y = Rsquared)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloGL$resample, aes(x = MAE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGL$resample$MAE),
             linetype = "dashed") +
  theme_bw() 
p6 <- ggplot(data = modeloGL$resample, aes(x = 1, y = MAE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
  final_plot,
  top = text_grob("Evaluaci?n: Modelo Gamma (enlace logar?tmico)", size = 15))
final_plot

GAMMA MODEL WITH RECIPROCAL (INVERSE) LINK

set.seed(835)
modeloGR <- train(formul, data=datos_train, method ="glm", 
                  family=Gamma(link=inverse), trControl=trc)
modeloGR
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.196792  0.8060329  3.070947
summary(modeloGR)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.85021  -0.11448  -0.01522   0.10336   0.92976  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0238310  0.0076155   3.129  0.00189 ** 
## rm          -0.0036528  0.0007227  -5.054 6.75e-07 ***
## dis          0.0005783  0.0002415   2.394  0.01714 *  
## lstat        0.0020435  0.0001207  16.930  < 2e-16 ***
## ptratio      0.0011623  0.0002234   5.202 3.25e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.04529821)
## 
##     Null deviance: 66.192  on 380  degrees of freedom
## Residual deviance: 16.656  on 376  degrees of freedom
## AIC: 2216.6
## 
## Number of Fisher Scoring iterations: 4
modeloGR$results
##   parameter     RMSE  Rsquared      MAE RMSESD RsquaredSD     MAESD
## 1      none 4.196792 0.8060329 3.070947 1.0509  0.1004044 0.5453499
## Accuracy evaluation plots

p1 <- ggplot(data = modeloGR$resample, aes(x = RMSE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGR$resample$RMSE),
             linetype = "dashed") +
  theme_bw() 
p2 <- ggplot(data = modeloGR$resample, aes(x = 1, y = RMSE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloGR$resample, aes(x = Rsquared)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGR$resample$Rsquared),
             linetype = "dashed") +
  theme_bw() 
p4 <- ggplot(data = modeloGR$resample, aes(x = 1, y = Rsquared)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloGR$resample, aes(x = MAE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloGR$resample$MAE),
             linetype = "dashed") +
  theme_bw() 
p6 <- ggplot(data = modeloGR$resample, aes(x = 1, y = MAE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
  final_plot,
  top = text_grob("Evaluaci?n: Modelo Gamma (enlace rec?proco)", size = 15))
final_plot

INVERSE GAUSSIAN MODEL WITH IDENTITY LINK

set.seed(835)
modeloNII <- train(formul, data=datos_train, method="glm",
                   family=inverse.gaussian(link=identity), trControl=trc)
## Warning: glm.fit: algorithm did not converge
# (the non-convergence warning was emitted for nearly every resampling fit;
#  the repetitions are omitted here)
## Warning in sqrt((weights[good] * mu.eta.val[good]^2)/variance(mu)[good]): NaNs
## produced
## Warning: model fit failed for Fold03.Rep1: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
##   NA/NaN/Inf in 'x'
## Warning: model fit failed for Fold05.Rep2: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
##   NA/NaN/Inf in 'x'
## Warning: model fit failed for Fold10.Rep3: parameter=none Error in glm.fit(x = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
##   NA/NaN/Inf in 'x'
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
modeloNII
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   6.161884  0.6243431  4.359462
summary(modeloNII)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.284289  -0.035274  -0.006564   0.024345   0.309988  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 53.93633    4.97423  10.843  < 2e-16 ***
## rm           0.09596    0.45305   0.212    0.832    
## dis          0.13015    0.18997   0.685    0.494    
## lstat       -0.51766    0.04231 -12.236  < 2e-16 ***
## ptratio     -1.43795    0.17754  -8.099 7.79e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.003896856)
## 
##     Null deviance: 3.3766  on 380  degrees of freedom
## Residual deviance: 1.3616  on 376  degrees of freedom
## AIC: 2416.3
## 
## Number of Fisher Scoring iterations: 25
modeloNII$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      none 6.161884 0.6243431 4.359462 1.145676  0.1150634 0.6637477
## Accuracy evaluation plots
p1 <- ggplot(data = modeloNII$resample, aes(x = RMSE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNII$resample$RMSE),
             linetype = "dashed") +
  theme_bw() 
p2 <- ggplot(data = modeloNII$resample, aes(x = 1, y = RMSE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNII$resample, aes(x = Rsquared)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNII$resample$Rsquared),
             linetype = "dashed") +
  theme_bw() 
p4 <- ggplot(data = modeloNII$resample, aes(x = 1, y = Rsquared)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNII$resample, aes(x = MAE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNII$resample$MAE),
             linetype = "dashed") +
  theme_bw() 
p6 <- ggplot(data = modeloNII$resample, aes(x = 1, y = MAE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3 rows containing missing values (geom_point).
final_plot <- annotate_figure(
  final_plot,
  top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace identidad)", size = 15))
final_plot

INVERSE GAUSSIAN MODEL WITH LOG LINK

set.seed(835)
modeloNIL <- train(formul, data=datos_train, method ="glm",
                   family=inverse.gaussian(link=log), trControl=trc)
modeloNIL
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.904326  0.7517099  3.543702
summary(modeloNIL)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.275035  -0.027426  -0.003875   0.022128   0.283251  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.067471   0.226195  17.982  < 2e-16 ***
## rm           0.055182   0.023489   2.349   0.0193 *  
## dis         -0.002636   0.007865  -0.335   0.7377    
## lstat       -0.035848   0.002421 -14.807  < 2e-16 ***
## ptratio     -0.048989   0.006932  -7.068 7.71e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.003244903)
## 
##     Null deviance: 3.3766  on 380  degrees of freedom
## Residual deviance: 1.1460  on 376  degrees of freedom
## AIC: 2350.6
## 
## Number of Fisher Scoring iterations: 9
modeloNIL$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD   MAESD
## 1      none 4.904326 0.7517099 3.543702 1.049479  0.1017423 0.57823
## Accuracy evaluation plots
p1 <- ggplot(data = modeloNIL$resample, aes(x = RMSE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIL$resample$RMSE),
             linetype = "dashed") +
  theme_bw() 
p2 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = RMSE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNIL$resample, aes(x = Rsquared)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIL$resample$Rsquared),
             linetype = "dashed") +
  theme_bw() 
p4 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = Rsquared)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNIL$resample, aes(x = MAE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIL$resample$MAE),
             linetype = "dashed") +
  theme_bw() 
p6 <- ggplot(data = modeloNIL$resample, aes(x = 1, y = MAE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
final_plot <- annotate_figure(
  final_plot,
  top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace logar?tmico", size = 15))
final_plot

INVERSE GAUSSIAN MODEL WITH SQUARED RECIPROCAL LINK (1/mu^2)

set.seed(835)
modeloNIRC <- train(formul, data=datos_train, method="glm",
                    family=inverse.gaussian(link=1/mu^2), trControl=trc)
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold02.Rep1: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold02.Rep2: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold01.Rep3: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold06.Rep4: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold10.Rep4: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in sqrt(eta): NaNs produced
## Warning: model fit failed for Fold01.Rep5: parameter=none Error : no valid set of coefficients has been found: please supply starting values
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
modeloNIRC
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.959871  0.7224323  3.687739
summary(modeloNIRC)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.290199  -0.025180  -0.000327   0.028298   0.189072  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.057e-04  6.873e-04  -0.299  0.76483    
## rm          -1.448e-04  6.599e-05  -2.195  0.02876 *  
## dis          2.146e-05  1.881e-05   1.141  0.25466    
## lstat        1.980e-04  1.168e-05  16.951  < 2e-16 ***
## ptratio      6.824e-05  2.125e-05   3.210  0.00144 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.002785377)
## 
##     Null deviance: 3.3766  on 380  degrees of freedom
## Residual deviance: 1.2191  on 376  degrees of freedom
## AIC: 2374.2
## 
## Number of Fisher Scoring iterations: 5
modeloNIRC$results
##   parameter     RMSE  Rsquared      MAE   RMSESD RsquaredSD     MAESD
## 1      none 5.959871 0.7224323 3.687739 3.301849  0.1170106 0.8979335
## Accuracy evaluation plots
p1 <- ggplot(data = modeloNIRC$resample, aes(x = RMSE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIRC$resample$RMSE),
             linetype = "dashed") +
  theme_bw() 
p2 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = RMSE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p3 <- ggplot(data = modeloNIRC$resample, aes(x = Rsquared)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIRC$resample$Rsquared),
             linetype = "dashed") +
  theme_bw() 
p4 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = Rsquared)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
p5 <- ggplot(data = modeloNIRC$resample, aes(x = MAE)) +
  geom_density(alpha = 0.5, fill = "gray50") +
  geom_vline(xintercept = mean(modeloNIRC$resample$MAE),
             linetype = "dashed") +
  theme_bw() 
p6 <- ggplot(data = modeloNIRC$resample, aes(x = 1, y = MAE)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.5, fill = "gray50") +
  geom_jitter(width = 0.05) +
  labs(x = "") +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

final_plot <- ggarrange(p1, p2, p3, p4, p5, p6, ncol = 2, nrow = 3)
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).
final_plot <- annotate_figure(
  final_plot,
  top = text_grob("Evaluaci?n: Modelo Normal Inversa (enlace rec?proco cuadr?tico)", size = 15))
final_plot

# Comparison of the Generalized Linear Models
modelos <- list(Normal=modeloLN, Gamma_Ident=modeloGI, Gamma_Log=modeloGL,
                Gamma_Recip=modeloGR, NI_Ident=modeloNII, NI_Log=modeloNIL)

resultados_resamples <- resamples(modelos)
## Warning in resamples.default(modelos): 'Normal' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Ident' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Log' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'Gamma_Recip' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'NI_Ident' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
## Warning in resamples.default(modelos): 'NI_Log' did not have
## 'returnResamp="final"; the optimal tuning parameters are used
resultados_resamples
## 
## Call:
## resamples.default(x = modelos)
## 
## Models: Normal, Gamma_Ident, Gamma_Log, Gamma_Recip, NI_Ident, NI_Log 
## Number of resamples: 50 
## Performance metrics: MAE, RMSE, Rsquared 
## Time estimates for: everything, final model fit
# Extract the relevant information
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
resultados_resamples$values %>% head(10)
##       Resample Normal~MAE Normal~RMSE Normal~Rsquared Gamma_Ident~MAE
## 1  Fold01.Rep1   3.039035    4.408379       0.7014579        3.289608
## 2  Fold01.Rep2   3.330426    4.837342       0.6519809        3.631980
## 3  Fold01.Rep3   4.610090    7.128709       0.5350221        4.965420
## 4  Fold01.Rep4   5.341016    7.667727       0.4894943        5.433950
## 5  Fold01.Rep5   4.434889    6.034121       0.6384404        4.493912
## 6  Fold02.Rep1   3.272949    4.338228       0.8792529        4.270443
## 7  Fold02.Rep2   4.156856    5.686388       0.7354271        4.612160
## 8  Fold02.Rep3   2.879945    3.664787       0.8679226        3.478335
## 9  Fold02.Rep4   3.950162    5.306014       0.7834401        4.848444
## 10 Fold02.Rep5   4.369320    6.536502       0.6616645        5.153543
##    Gamma_Ident~RMSE Gamma_Ident~Rsquared Gamma_Log~MAE Gamma_Log~RMSE
## 1          4.265582            0.7367741      2.640865       3.502803
## 2          4.344743            0.7075936      3.097494       4.197341
## 3          7.361917            0.5369314      4.000095       6.201068
## 4          7.775865            0.4857927      4.622161       6.750375
## 5          5.889745            0.6121647      4.010354       5.165170
## 6          6.124717            0.7850588      3.083113       4.118837
## 7          6.358474            0.7563881      3.873617       5.036190
## 8          4.814540            0.8368464      2.746054       3.330783
## 9          6.736532            0.6836059      3.705072       4.958346
## 10         7.850065            0.5262551      4.086823       6.472542
##    Gamma_Log~Rsquared Gamma_Recip~MAE Gamma_Recip~RMSE Gamma_Recip~Rsquared
## 1           0.8064999        2.674823         3.481337            0.8090731
## 2           0.7130687        3.100285         4.489327            0.6427565
## 3           0.6507581        3.841487         5.770913            0.7082294
## 4           0.6069493        4.202224         6.208661            0.6744136
## 5           0.7176640        3.668564         4.694038            0.7727969
## 6           0.8999631        2.542937         3.421167            0.9118689
## 7           0.8019713        4.129872         5.071027            0.7952259
## 8           0.9032360        2.505770         3.137666            0.9016495
## 9           0.8236755        3.050768         4.209782            0.8679898
## 10          0.6774835        3.482047         5.959900            0.7197131
##    NI_Ident~MAE NI_Ident~RMSE NI_Ident~Rsquared NI_Log~MAE NI_Log~RMSE
## 1      3.712745      4.602049         0.6910285   2.888792    3.634943
## 2      3.774470      4.511749         0.6758746   3.196078    3.999812
## 3      5.175212      7.609621         0.5089909   4.303900    6.361862
## 4      5.470273      7.985444         0.4602988   4.756618    6.973484
## 5      4.573802      6.140305         0.5819923   4.234171    5.486635
## 6      4.692465      6.787084         0.6965167   3.544448    4.924396
## 7      4.854303      6.762226         0.7279016   4.109549    5.401008
## 8      3.795149      5.352015         0.7834315   3.071313    3.939181
## 9      5.402708      7.522895         0.5901958   4.334811    5.806071
## 10     5.642726      8.445196         0.4339390   4.595134    7.129818
##    NI_Log~Rsquared
## 1        0.7982435
## 2        0.7472074
## 3        0.6410030
## 4        0.5778341
## 5        0.6805413
## 6        0.8482303
## 7        0.7917370
## 8        0.8713493
## 9        0.7642306
## 10       0.5999077
# The data frame returned by resamples() is reshaped to separate the model
# name and the metric into distinct columns.
metricas_resamples <- resultados_resamples$values %>%
  gather(key="modelo", value="valor", -Resample) %>%
  separate(col="modelo", into=c("modelo", "metrica"), sep="~", remove=TRUE)

metricas_resamples %>% head()
##      Resample modelo metrica    valor
## 1 Fold01.Rep1 Normal     MAE 3.039035
## 2 Fold01.Rep2 Normal     MAE 3.330426
## 3 Fold01.Rep3 Normal     MAE 4.610090
## 4 Fold01.Rep4 Normal     MAE 5.341016
## 5 Fold01.Rep5 Normal     MAE 4.434889
## 6 Fold02.Rep1 Normal     MAE 3.272949
# Comparison of the average RMSE, MAE, and R2
metricas_resamples %>% group_by(modelo, metrica) %>% 
  summarise(media = mean(valor)) %>% spread(key = metrica, value = media) %>%
  arrange(RMSE)
## `summarise()` has grouped output by 'modelo'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups:   modelo [6]
##   modelo        MAE  RMSE Rsquared
##   <chr>       <dbl> <dbl>    <dbl>
## 1 Gamma_Recip  3.07  4.20    0.806
## 2 Gamma_Log    3.22  4.46    0.788
## 3 NI_Log       3.54  4.90    0.752
## 4 Normal       3.60  5.00    0.733
## 5 Gamma_Ident  4.06  5.71    0.686
## 6 NI_Ident    NA    NA      NA
#arrange(desc(Rsquared))
# Plot comparing the models by RMSE
metricas_resamples %>%
  filter(metrica == "RMSE") %>%
  group_by(modelo) %>% 
  summarise(media = mean(valor)) %>%
  ggplot(aes(x = reorder(modelo, media), y = media, label = round(media, 2))) +
  geom_segment(aes(x = reorder(modelo, media), y = 0.5,
                   xend = modelo, yend = media),
               color = "grey50") +
  geom_point(size = 15, color = "firebrick") +
  geom_text(color = "white", size = 5) +
  scale_y_continuous(limits = c(2, 10)) +
  # Reference RMSE
  geom_hline(yintercept = 0.73, linetype = "dashed") +
  annotate(geom = "text", y = 0.72, x = 7.5, label = "Reference RMSE") +
  labs(title = "Validation: mean RMSE from repeated CV",
       subtitle = "Models ordered by mean RMSE",
       x = "Model") +
  coord_flip() +
  theme_bw()
## Warning: Removed 6 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).

# Comparison plot accounting for variability
metricas_resamples %>% filter(metrica == "RMSE") %>%
  group_by(modelo) %>% 
  mutate(media = mean(valor)) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(modelo, media), y = valor, color = modelo)) +
  geom_boxplot(alpha = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.1, alpha = 0.6) +
  scale_y_continuous(limits = c(2, 8)) +
  # Reference RMSE
  geom_hline(yintercept = 0.72, linetype = "dashed") +
  annotate(geom = "text", y = 0.72, x = 7.5, label = "Reference RMSE") +
  theme_bw() +
  labs(title = "Validaci?n: RMSE medio repeated-CV",
       subtitle = "Modelos ordenados por media") +
  coord_flip() +
  theme(legend.position = "none")
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).

# Assessing differences
# ---------------------
# Friedman test
#   H0: all medians are equal
#   H1: at least one of the (RMSE) medians differs from the rest

matriz_metricas <- metricas_resamples %>% filter(metrica == "RMSE") %>%
  spread(key = modelo, value = valor) %>%
  dplyr::select(-Resample, -metrica) %>% as.matrix()

# alpha = 0.05
friedman.test(y=matriz_metricas) # The null hypothesis is rejected: at least one median differs
## 
##  Friedman rank sum test
## 
## data:  matriz_metricas
## Friedman chi-squared = 193.69, df = 5, p-value < 2.2e-16
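
Since the Friedman test rejects equality, paired post-hoc comparisons indicate which models differ. The following sketch, not part of the original script, uses pairwise Wilcoxon signed-rank tests with Holm correction:

# Pairwise paired Wilcoxon tests between models on the RMSE resamples
df_rmse <- metricas_resamples %>% filter(metrica == "RMSE")
pairwise.wilcox.test(df_rmse$valor, df_rmse$modelo,
                     paired = TRUE, p.adjust.method = "holm")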
# Test Error
# ----------
predicciones <- extractPrediction(models = modelos,
                                  testX = datos_test[, -1],
                                  testY = datos_test$medv)
predicciones %>% head()
##    obs     pred model dataType object
## 1 24.0 31.90136   glm Training Normal
## 2 21.6 25.54408   glm Training Normal
## 3 34.7 32.44926   glm Training Normal
## 4 36.2 30.05964   glm Training Normal
## 5 28.7 26.79588   glm Training Normal
## 6 22.9 23.91666   glm Training Normal
metricas_predicciones <- predicciones %>% mutate(acierto = (obs-pred)^2) %>%
  group_by(object, dataType) %>% summarise(RMSE = sqrt(mean(acierto)))
## `summarise()` has grouped output by 'object'. You can override using the
## `.groups` argument.
# RMSE on the test (evaluation) set and on the training set
metricas_predicciones %>% spread(key = dataType, value = RMSE) %>%
  arrange(desc(Test))
## # A tibble: 6 × 3
## # Groups:   object [6]
##   object       Test Training
##   <chr>       <dbl>    <dbl>
## 1 NI_Ident     5.83     6.21
## 2 Gamma_Ident  5.55     5.73
## 3 Normal       5.54     5.00
## 4 NI_Log       5.23     4.94
## 5 Gamma_Log    4.97     4.49
## 6 Gamma_Recip  4.69     4.22
ggplot(data = metricas_predicciones, aes(x = reorder(object, RMSE), y = RMSE,
                                         color = dataType, label = round(RMSE, 3))) +
      geom_point(size=14) + scale_color_manual(values = c("orangered2", "gray50")) +
      geom_text(color="white", size=3) + scale_y_continuous(limits=c(2, 10)) +
      # Reference RMSE
      geom_hline(yintercept = 0.73, linetype = "dashed") +
      annotate(geom = "text", y = 0.73, x = 8.5, label = "Reference RMSE") +
      coord_flip() + labs(title="RMSE de entrenamiento y test", x="Modelo") +
      theme_bw() + theme(legend.position = "bottom")
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_text).

# Final Model
modeloGR
## Generalized Linear Model 
## 
## 381 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 341, 344, 343, 343, 344, 343, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.196792  0.8060329  3.070947
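
As a closing check, the chosen model's test-set error can be recomputed directly (a sketch consistent with the extractPrediction results above):

# Test-set RMSE of the selected Gamma (inverse link) model
pred_final <- predict(modeloGR, newdata = datos_test)
sqrt(mean((datos_test$medv - pred_final)^2))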

### KNN

# =================================================================
#  Building the model with K-NN (K-nearest neighbors)
# =================================================================

library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# install.packages('kknn')
library(kknn)
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
# ===============================================================================
# Hyperparameter selection ("models")
# ===============================================================================

# Finding the best value of K (using caret with repeated 10-fold CV)
particiones  <- 10 # number of folds
repeticiones <- 5  # number of repeats

# Metric used to select the optimal hyperparameters
metrica <- "RMSE"

# Hyperparameter grid definition
hiperparametros <- expand.grid(kmax = seq(from=1, to=50, by=2),
                               distance = 2,
                               kernel = c("optimal", "epanechnikov","gaussian")
)
hiperparametros
##    kmax distance       kernel
## 1     1        2      optimal
## 2     3        2      optimal
## 3     5        2      optimal
## 4     7        2      optimal
## 5     9        2      optimal
## 6    11        2      optimal
## 7    13        2      optimal
## 8    15        2      optimal
## 9    17        2      optimal
## 10   19        2      optimal
## 11   21        2      optimal
## 12   23        2      optimal
## 13   25        2      optimal
## 14   27        2      optimal
## 15   29        2      optimal
## 16   31        2      optimal
## 17   33        2      optimal
## 18   35        2      optimal
## 19   37        2      optimal
## 20   39        2      optimal
## 21   41        2      optimal
## 22   43        2      optimal
## 23   45        2      optimal
## 24   47        2      optimal
## 25   49        2      optimal
## 26    1        2 epanechnikov
## 27    3        2 epanechnikov
## 28    5        2 epanechnikov
## 29    7        2 epanechnikov
## 30    9        2 epanechnikov
## 31   11        2 epanechnikov
## 32   13        2 epanechnikov
## 33   15        2 epanechnikov
## 34   17        2 epanechnikov
## 35   19        2 epanechnikov
## 36   21        2 epanechnikov
## 37   23        2 epanechnikov
## 38   25        2 epanechnikov
## 39   27        2 epanechnikov
## 40   29        2 epanechnikov
## 41   31        2 epanechnikov
## 42   33        2 epanechnikov
## 43   35        2 epanechnikov
## 44   37        2 epanechnikov
## 45   39        2 epanechnikov
## 46   41        2 epanechnikov
## 47   43        2 epanechnikov
## 48   45        2 epanechnikov
## 49   47        2 epanechnikov
## 50   49        2 epanechnikov
## 51    1        2     gaussian
## 52    3        2     gaussian
## 53    5        2     gaussian
## 54    7        2     gaussian
## 55    9        2     gaussian
## 56   11        2     gaussian
## 57   13        2     gaussian
## 58   15        2     gaussian
## 59   17        2     gaussian
## 60   19        2     gaussian
## 61   21        2     gaussian
## 62   23        2     gaussian
## 63   25        2     gaussian
## 64   27        2     gaussian
## 65   29        2     gaussian
## 66   31        2     gaussian
## 67   33        2     gaussian
## 68   35        2     gaussian
## 69   37        2     gaussian
## 70   39        2     gaussian
## 71   41        2     gaussian
## 72   43        2     gaussian
## 73   45        2     gaussian
## 74   47        2     gaussian
## 75   49        2     gaussian
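The kernels only differ in how they weight the (at most kmax) nearest neighbors before averaging their responses. A minimal illustration of the idea (not kknn's internal code), with neighbor distances already scaled to (0, 1]:

u <- c(0.2, 0.5, 0.9)                 # scaled distances of three neighbors
w_epanechnikov <- 0.75 * (1 - u^2)    # parabolic: closer neighbors weigh more
w_gaussian     <- dnorm(u)            # smooth, never exactly zero
w_epanechnikov / sum(w_epanechnikov)  # normalized weights for the weighted mean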
library(caret)
control_train <- trainControl(method = "repeatedcv", number = particiones,
                              repeats = repeticiones, returnResamp = "final",
                              verboseIter = FALSE)


set.seed(555)
# Note: tuning here uses the full data set (506 rows), unlike the GLMs above,
# which were fitted on datos_train (381 rows)
modeloKNN <- train(formul, data = data, method = "kknn", tuneGrid = hiperparametros,
                   metric = metrica, trControl = control_train)
modeloKNN
## k-Nearest Neighbors 
## 
## 506 samples
##   4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 455, 455, 456, 456, 456, 455, ... 
## Resampling results across tuning parameters:
## 
##   kmax  kernel        RMSE      Rsquared   MAE     
##    1    optimal       4.753915  0.7438044  3.141004
##    1    epanechnikov  4.753915  0.7438044  3.141004
##    1    gaussian      4.753915  0.7438044  3.141004
##    3    optimal       4.133026  0.7961658  2.752106
##    3    epanechnikov  4.054021  0.8026834  2.737437
##    3    gaussian      4.017134  0.8063702  2.719312
##    5    optimal       3.955092  0.8120900  2.633554
##    5    epanechnikov  3.972119  0.8101330  2.643992
##    5    gaussian      3.989421  0.8084163  2.649733
##    7    optimal       3.953769  0.8122371  2.619902
##    7    epanechnikov  3.934419  0.8138345  2.621045
##    7    gaussian      4.002049  0.8071211  2.653968
##    9    optimal       3.967592  0.8108495  2.624955
##    9    epanechnikov  3.934739  0.8137436  2.616566
##    9    gaussian      4.002049  0.8071211  2.653968
##   11    optimal       3.967592  0.8108495  2.624955
##   11    epanechnikov  3.935197  0.8136466  2.616783
##   11    gaussian      4.002049  0.8071211  2.653968
##   13    optimal       3.967592  0.8108495  2.624955
##   13    epanechnikov  3.935197  0.8136466  2.616783
##   13    gaussian      4.006633  0.8068273  2.654531
##   15    optimal       3.967592  0.8108495  2.624955
##   15    epanechnikov  3.935197  0.8136466  2.616783
##   15    gaussian      4.006633  0.8068273  2.654531
##   17    optimal       3.967592  0.8108495  2.624955
##   17    epanechnikov  3.935197  0.8136466  2.616783
##   17    gaussian      4.006633  0.8068273  2.654531
##   19    optimal       3.967592  0.8108495  2.624955
##   19    epanechnikov  3.935197  0.8136466  2.616783
##   19    gaussian      4.006633  0.8068273  2.654531
##   21    optimal       3.967592  0.8108495  2.624955
##   21    epanechnikov  3.935197  0.8136466  2.616783
##   21    gaussian      4.006633  0.8068273  2.654531
##   23    optimal       3.967592  0.8108495  2.624955
##   23    epanechnikov  3.935197  0.8136466  2.616783
##   23    gaussian      4.006633  0.8068273  2.654531
##   25    optimal       3.967592  0.8108495  2.624955
##   25    epanechnikov  3.935197  0.8136466  2.616783
##   25    gaussian      4.006633  0.8068273  2.654531
##   27    optimal       3.967592  0.8108495  2.624955
##   27    epanechnikov  3.935197  0.8136466  2.616783
##   27    gaussian      4.006633  0.8068273  2.654531
##   29    optimal       3.967592  0.8108495  2.624955
##   29    epanechnikov  3.935197  0.8136466  2.616783
##   29    gaussian      4.006633  0.8068273  2.654531
##   31    optimal       3.967592  0.8108495  2.624955
##   31    epanechnikov  3.935197  0.8136466  2.616783
##   31    gaussian      4.006633  0.8068273  2.654531
##   33    optimal       3.967592  0.8108495  2.624955
##   33    epanechnikov  3.935197  0.8136466  2.616783
##   33    gaussian      4.006633  0.8068273  2.654531
##   35    optimal       3.967592  0.8108495  2.624955
##   35    epanechnikov  3.935197  0.8136466  2.616783
##   35    gaussian      4.006633  0.8068273  2.654531
##   37    optimal       3.967592  0.8108495  2.624955
##   37    epanechnikov  3.935197  0.8136466  2.616783
##   37    gaussian      4.006633  0.8068273  2.654531
##   39    optimal       3.967592  0.8108495  2.624955
##   39    epanechnikov  3.935197  0.8136466  2.616783
##   39    gaussian      4.006633  0.8068273  2.654531
##   41    optimal       3.967592  0.8108495  2.624955
##   41    epanechnikov  3.935197  0.8136466  2.616783
##   41    gaussian      4.006633  0.8068273  2.654531
##   43    optimal       3.967592  0.8108495  2.624955
##   43    epanechnikov  3.935197  0.8136466  2.616783
##   43    gaussian      4.006633  0.8068273  2.654531
##   45    optimal       3.967592  0.8108495  2.624955
##   45    epanechnikov  3.935197  0.8136466  2.616783
##   45    gaussian      4.006633  0.8068273  2.654531
##   47    optimal       3.967592  0.8108495  2.624955
##   47    epanechnikov  3.935197  0.8136466  2.616783
##   47    gaussian      4.006633  0.8068273  2.654531
##   49    optimal       3.967592  0.8108495  2.624955
##   49    epanechnikov  3.935197  0.8136466  2.616783
##   49    gaussian      4.006633  0.8068273  2.654531
## 
## Tuning parameter 'distance' was held constant at a value of 2
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were kmax = 7, distance = 2 and kernel
##  = epanechnikov.
ggplot(modeloKNN, highlight=TRUE) + scale_x_discrete(breaks=hiperparametros$kmax) +
       labs(title="Evolution of the KNN model's RMSE", x="K") + theme_bw()

## KNN: final fit


library(kknn)
library(rpart)

# Final fit on the train/test split (note: the CV above selected kmax = 7; k = 3 is used here)
modeloKNN <- kknn(formula=formul, train=datos_train, test=datos_test, kernel="epanechnikov", k=3)
modeloKNN
## 
## Call:
## kknn(formula = formul, train = datos_train, test = datos_test,     k = 3, kernel = "epanechnikov")
## 
## Response: "continuous"
# Note: kknn() predicts for the data passed as `test`, so predict(modeloKNN)
# returns one value per test row; comparing them against datos_train$medv
# recycles values, hence the warning and a meaningless "training" RMSE
Metrics::rmse(actual = datos_train$medv, predicted = predict(modeloKNN))
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## [1] 10.6983
Metrics::rmse(actual = datos_test$medv, predicted = predict(modeloKNN))
## [1] 4.835782
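Because kknn() only produces predictions for the data passed as test, a valid training RMSE requires a second fit with datos_train in that role (a sketch; note that each point is then among its own neighbors, so this estimate is optimistic):

modeloKNN_train <- kknn(formula = formul, train = datos_train, test = datos_train,
                        kernel = "epanechnikov", k = 3)
Metrics::rmse(actual = datos_train$medv, predicted = predict(modeloKNN_train))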
## rpart (regression tree)
library("partykit")
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
# Grow the full (unpruned) tree: cp = 0 disables the complexity penalty,
# minbucket = 3 requires at least 3 observations per terminal node
arbol_completo <- rpart(formul, data=datos_train, cp=0, minbucket=3)
arbol_completo
## n= 381 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##    1) root 381 3.496634e+04 22.702100  
##      2) rm< 6.92 318 1.221547e+04 19.647800  
##        4) lstat>=14.785 121 2.468612e+03 14.459500  
##          8) dis< 2.0643 65 8.228822e+02 11.847690  
##           16) ptratio>=19.6 54 6.320637e+02 11.125930  
##             32) lstat>=21.42 31 2.733671e+02  9.609677  
##               64) lstat< 34.195 28 1.767886e+02  9.257143  
##                128) lstat>=25.735 13 2.500308e+01  7.623077  
##                  256) dis< 1.5981 6 9.408333e+00  6.716667 *
##                  257) dis>=1.5981 7 6.440000e+00  8.400000 *
##                129) lstat< 25.735 15 8.698933e+01 10.673330  
##                  258) lstat< 23.135 6 3.595500e+01  9.650000 *
##                  259) lstat>=23.135 9 4.056222e+01 11.355560  
##                    518) dis>=1.68365 4 2.086000e+01  9.900000 *
##                    519) dis< 1.68365 5 4.448000e+00 12.520000 *
##               65) lstat>=34.195 3 6.062000e+01 12.900000 *
##             33) lstat< 21.42 23 1.913687e+02 13.169570  
##               66) dis< 1.7852 13 1.012477e+02 12.069230  
##                132) lstat< 21.15 10 6.612900e+01 11.310000  
##                  264) lstat>=19.57 7 5.684857e+01 10.685710 *
##                  265) lstat< 19.57 3 1.866667e-01 12.766670 *
##                133) lstat>=21.15 3 1.014000e+01 14.600000 *
##               67) dis>=1.7852 10 5.392000e+01 14.600000  
##                134) dis>=1.97915 3 1.108667e+01 11.866670 *
##                135) dis< 1.97915 7 1.081429e+01 15.771430 *
##           17) ptratio< 19.6 11 2.458909e+01 15.390910  
##             34) dis< 1.4887 3 7.466667e-01 13.933330 *
##             35) dis>=1.4887 8 1.507875e+01 15.937500 *
##          9) dis>=2.0643 56 6.876655e+02 17.491070  
##           18) ptratio>=19.7 34 2.693497e+02 15.802940  
##             36) lstat>=18.885 8 1.431875e+01 13.762500 *
##             37) lstat< 18.885 26 2.114754e+02 16.430770  
##               74) dis>=2.16915 22 1.440345e+02 15.845450  
##                148) dis< 2.44985 9 4.090222e+01 14.455560  
##                  296) lstat< 16.855 3 3.486667e+00 12.866670 *
##                  297) lstat>=16.855 6 2.605500e+01 15.250000 *
##                149) dis>=2.44985 13 7.370923e+01 16.807690  
##                  298) lstat>=16.355 8 3.863875e+01 15.637500 *
##                  299) lstat< 16.355 5 6.588000e+00 18.680000 *
##               75) dis< 2.16915 4 1.845000e+01 19.650000 *
##           19) ptratio< 19.7 22 1.716800e+02 20.100000  
##             38) dis>=6.33335 3 9.206667e+00 17.466670 *
##             39) dis< 6.33335 19 1.383853e+02 20.515790  
##               78) ptratio>=17.6 14 8.446357e+01 19.821430  
##                156) dis>=4.86055 3 1.256000e+01 16.800000 *
##                157) dis< 4.86055 11 3.704727e+01 20.645450  
##                  314) lstat< 15.985 4 5.340000e+00 19.000000 *
##                  315) lstat>=15.985 7 1.468857e+01 21.585710 *
##               79) ptratio< 17.6 5 2.827200e+01 22.460000 *
##        5) lstat< 14.785 197 4.489165e+03 22.834520  
##         10) lstat>=7.81 137 2.120775e+03 21.268610  
##           20) rm< 6.0985 71 4.260237e+02 19.928170  
##             40) ptratio>=20.6 8 5.991500e+01 17.525000 *
##             41) ptratio< 20.6 63 3.140400e+02 20.233330  
##               82) rm< 5.822 20 1.205255e+02 19.135000  
##                164) lstat< 13.185 11 7.138545e+01 18.336360  
##                  328) lstat>=12.065 4 3.580000e+00 16.600000 *
##                  329) lstat< 12.065 7 4.885429e+01 19.328570 *
##                165) lstat>=13.185 9 3.354889e+01 20.111110  
##                  330) rm>=5.6805 5 7.368000e+00 18.880000 *
##                  331) rm< 5.6805 4 9.130000e+00 21.650000 *
##               83) rm>=5.822 43 1.581660e+02 20.744190  
##                166) ptratio>=16.85 31 9.691097e+01 20.364520  
##                  332) dis>=6.56935 3 2.846667e+00 18.033330 *
##                  333) dis< 6.56935 28 7.601429e+01 20.614290  
##                    666) ptratio>=19.85 8 4.438750e+00 19.862500 *
##                    667) ptratio< 19.85 20 6.524550e+01 20.915000  
##                     1334) dis< 3.6126 9 3.666222e+01 20.155560  
##                       2668) dis>=2.43995 6 7.680000e+00 19.100000 *
##                       2669) dis< 2.43995 3 8.926667e+00 22.266670 *
##                     1335) dis>=3.6126 11 1.914545e+01 21.536360  
##                       2670) rm< 5.923 6 4.073333e+00 20.766670 *
##                       2671) rm>=5.923 5 7.252000e+00 22.460000 *
##                167) ptratio< 16.85 12 4.524250e+01 21.725000  
##                  334) lstat>=12.94 3 1.768667e+01 20.133330 *
##                  335) lstat< 12.94 9 1.742222e+01 22.255560  
##                    670) ptratio< 16.5 6 1.411500e+01 21.850000 *
##                    671) ptratio>=16.5 3 3.466667e-01 23.066670 *
##           21) rm>=6.0985 66 1.429943e+03 22.710610  
##             42) lstat>=10.1 37 1.225611e+02 20.867570  
##               84) rm>=6.4035 8 3.914000e+01 19.300000 *
##               85) rm< 6.4035 29 5.834000e+01 21.300000  
##                170) rm< 6.318 21 2.700952e+01 20.838100  
##                  340) rm>=6.25 5 3.788000e+00 19.820000 *
##                  341) rm< 6.25 16 1.641937e+01 21.156250  
##                    682) lstat>=12.7 9 6.328889e+00 20.711110  
##                     1364) dis>=3.03135 3 1.006667e+00 20.233330 *
##                     1365) dis< 3.03135 6 4.295000e+00 20.950000 *
##                    683) lstat< 12.7 7 6.014286e+00 21.728570 *
##                171) rm>=6.318 8 1.508875e+01 22.512500 *
##             43) lstat< 10.1 29 1.021348e+03 25.062070  
##               86) lstat< 9.44 19 1.731253e+02 23.084210  
##                172) rm< 6.5915 16 7.506937e+01 22.106250  
##                  344) dis>=5.68815 5 3.310800e+01 20.620000 *
##                  345) dis< 5.68815 11 2.589636e+01 22.781820  
##                    690) ptratio>=17.2 8 7.028750e+00 22.087500 *
##                    691) ptratio< 17.2 3 4.726667e+00 24.633330 *
##                173) rm>=6.5915 3 1.140000e+00 28.300000 *
##               87) lstat>=9.44 10 6.326760e+02 28.820000  
##                174) rm>=6.3025 6 2.749333e+01 25.333330 *
##                175) rm< 6.3025 4 4.228300e+02 34.050000 *
##         11) lstat< 7.81 60 1.265414e+03 26.410000  
##           22) lstat>=4.52 52 4.415698e+02 25.251920  
##             44) rm< 6.7225 45 2.156711e+02 24.555560  
##               88) dis>=6.12605 15 3.372400e+01 23.180000  
##                176) rm< 6.364 7 7.820000e+00 22.100000 *
##                177) rm>=6.364 8 1.059500e+01 24.125000 *
##               89) dis< 6.12605 30 1.393737e+02 25.243330  
##                178) rm< 6.4225 18 3.402000e+01 24.433330  
##                  356) dis>=2.1583 15 1.599333e+01 24.066670  
##                    712) rm< 6.168 3 3.800000e-01 22.600000 *
##                    713) rm>=6.168 12 7.546667e+00 24.433330  
##                     1426) ptratio>=19.1 5 4.120000e+00 23.900000 *
##                     1427) ptratio< 19.1 7 9.885714e-01 24.814290 *
##                  357) dis< 2.1583 3 5.926667e+00 26.266670 *
##                179) rm>=6.4225 12 7.582917e+01 26.458330  
##                  358) lstat< 5.145 4 1.428750e+01 24.825000 *
##                  359) lstat>=5.145 8 4.553500e+01 27.275000 *
##             45) rm>=6.7225 7 6.379429e+01 29.728570 *
##           23) lstat< 4.52 8 3.007987e+02 33.937500 *
##      3) rm>=6.92 63 4.810337e+03 38.119050  
##        6) rm< 7.435 33 9.230406e+02 31.775760  
##         12) ptratio>=19.65 5 1.396400e+02 22.300000 *
##         13) ptratio< 19.65 28 2.542811e+02 33.467860  
##           26) lstat>=9.65 3 1.152667e+01 29.466670 *
##           27) lstat< 9.65 25 1.889624e+02 33.948000  
##             54) dis>=3.02235 21 1.126524e+02 33.352380  
##              108) dis< 4.35735 5 1.065200e+01 31.340000 *
##              109) dis>=4.35735 16 7.542437e+01 33.981250  
##                218) dis>=7.7406 4 2.707000e+01 31.750000 *
##                219) dis< 7.7406 12 2.180250e+01 34.725000  
##                  438) rm>=7.213 3 2.066667e-01 33.066670 *
##                  439) rm< 7.213 9 1.059556e+01 35.277780  
##                    878) dis< 5.77695 5 3.080000e+00 34.600000 *
##                    879) dis>=5.77695 4 2.347500e+00 36.125000 *
##             55) dis< 3.02235 4 2.974750e+01 37.075000 *
##        7) rm>=7.435 30 1.098850e+03 45.096670  
##         14) ptratio>=18.3 3 2.238200e+02 33.300000 *
##         15) ptratio< 18.3 27 4.111585e+02 46.407410  
##           30) ptratio>=14.8 13 1.946277e+02 44.369230  
##             60) lstat< 3.665 5 4.076800e+01 42.620000 *
##             61) lstat>=3.665 8 1.289988e+02 45.462500 *
##           31) ptratio< 14.8 14 1.123800e+02 48.300000  
##             62) rm< 7.706 4 3.784750e+01 44.725000 *
##             63) rm>=7.706 10 2.961000e+00 49.730000  
##              126) lstat>=3.755 4 1.867500e+00 49.325000 *
##              127) lstat< 3.755 6 0.000000e+00 50.000000 *
xerr <- arbol_completo$cptable[,"xerror"]      # cross-validated error for each CP value
minxerr <- which.min(xerr)
mincp <- arbol_completo$cptable[minxerr, "CP"] # stopping criterion: CP = 0.001538394
arbol_prune <- prune(arbol_completo, cp=mincp)
plot(as.party(arbol_prune), tp_args = list(id=FALSE))
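
The choice of CP can also be inspected graphically: rpart's plotcp() draws the cross-validated relative error against the complexity parameter, which makes the minimum used above easy to see (a sketch):

plotcp(arbol_completo)             # X-val relative error vs. cp
arbol_completo$cptable[minxerr, ]  # row with the minimum cross-validated error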

# Training RMSE: predict() with no newdata returns training-set predictions
Metrics::rmse(actual = datos_train$medv, predicted = predict(arbol_prune))
## [1] 3.077051
# Note: the same call is reused below against datos_test$medv, so the "test"
# RMSE is meaningless (the lengths differ, hence the recycling warning); the
# corrected computation is shown after the output
Metrics::rmse(actual = datos_test$medv, predicted = predict(arbol_prune))
## Warning in actual - predicted: longer object length is not a multiple of
## shorter object length
## [1] 11.21912
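For reference, the corrected test computation passes the test set explicitly (a sketch; its output is not reproduced here):

pred_test_arbol <- predict(arbol_prune, newdata = datos_test)
Metrics::rmse(actual = datos_test$medv, predicted = pred_test_arbol)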