library(tidyverse)
library(GGally)
library(mlbench)
library(plotly)
library(caret)
library(car)
library(psych)
library(NeuralNetTools)
library(nnet)
library(gam)
library(HoRM)
library(robust)
library(Amelia)
library(leaps)
library(bestglm) 
library(Metrics)

1 INTRODUCTION


In this practical assignment we analyze the “Boston Housing” dataset. A description of it can be found at “https://www.kaggle.com/c/boston-housing”.

2 DATA DESCRIPTION


The BostonHousing dataset has 506 rows and 14 columns. It contains the original data from the 1970 Boston census (Harrison and Rubinfeld 1979), later corrected with additional spatial information.


The data are included in the mlbench package or can be downloaded. The variables are the following, with medv being the target (dependent) variable:


crim - per capita crime rate by town
zn - proportion of residential land zoned for lots over 25,000 sq. ft.
indus - proportion of non-retail business acres per town
chas - Charles River dummy variable (= 1 if the tract bounds the river, 0 otherwise)
nox - nitric oxides concentration (parts per 10 million)
rm - average number of rooms per dwelling
age - proportion of owner-occupied units built prior to 1940
dis - weighted distances to five Boston employment centres
rad - index of accessibility to radial highways
tax - full-value property tax rate per USD 10,000
ptratio - pupil-teacher ratio by town
b - 1000(B - 0.63)^2, where B is the proportion of Black residents by town
lstat - percentage of lower-status population
medv - median value of owner-occupied homes in USD 1000s

data("BostonHousing")
db<-BostonHousing
plot<-ggpairs(db)
plot

dim(db)
## [1] 506  14
str(db)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(db)
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio            b              lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
missmap(db,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE) # no missing data

hist(db$medv)
g1<-ggplot(db)+geom_point(aes(crim,medv))
g1

g2<-ggplot(db)+geom_point(aes(nox,medv))
g2

g3<-ggplot(db)+geom_point(aes(zn,medv))
g3

g4<-ggplot(db)+geom_point(aes(indus,medv))
g4

g5<-ggplot(db)+geom_point(aes(rm,medv))
g5

g6<-ggplot(db)+geom_point(aes(age,medv))
g6

g7<-ggplot(db)+geom_point(aes(dis,medv))
g7

g8<-ggplot(db)+geom_point(aes(rad,medv))
g8

g9<-ggplot(db)+geom_point(aes(tax,medv))
g9

g10<-ggplot(db)+geom_point(aes(ptratio,medv))
g10

g11<-ggplot(db)+geom_point(aes(b,medv))
g11

g12<-ggplot(db)+geom_point(aes(lstat,medv))
g12

ggcorr(db, label= TRUE, nbreaks = 5, low= "Lightblue", mid= "white", high = "red")


Graphically, the strongest observed correlation is between tax and rad (0.9), followed by the correlation between indus and nox (0.8).
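As a numerical check of this, a minimal sketch that ranks the pairwise correlations (chas is excluded because cor() requires numeric columns):

num_vars<-db[,sapply(db,is.numeric)]
cors<-cor(num_vars)
cors[upper.tri(cors,diag=TRUE)]<-NA # keep each pair only once
cor_pairs<-na.omit(as.data.frame(as.table(cors)))
head(cor_pairs[order(-abs(cor_pairs$Freq)),]) # tax-rad and indus-nox should rank highest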

3 MODELS TO PREDICT AND EXPLAIN THE VARIABLE “medv”

3.1 LINEAR MODEL

ajuste<-lm(medv~.,data=db)
summary(ajuste)#R2 0.73
## 
## Call:
## lm(formula = medv ~ ., data = db)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas1        2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## b            9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
plot(ajuste$residuals,ajuste$fitted.values)# cloud of points around 0, some outliers?

plot(ajuste$residuals)

plot(db$medv,ajuste$fitted.values)
abline(0,1)

hist(ajuste$residuals)# looks approximately normal

qqnorm(ajuste$residuals)
qqline(ajuste$residuals)

plot(ajuste)

# check for outliers

leverage<-hatvalues(ajuste)
sort(leverage, decreasing = T)
##         381         419         406         411         366         156 
## 0.305959491 0.190100964 0.156432506 0.124706994 0.098514926 0.085276660 
##         491         368         493         365         492         153 
## 0.082067417 0.080430187 0.077112512 0.076722562 0.076458117 0.076138593 
##         490         354         489         415         215         127 
## 0.075708855 0.075573889 0.074208614 0.073871754 0.073404490 0.069996876 
##         143         124         369         157         164         284 
## 0.069303219 0.067954436 0.066330935 0.065890523 0.064252939 0.062857754 
##         155         121         125         123         126         122 
## 0.061301310 0.060987916 0.060920971 0.060617543 0.060372067 0.060134916 
##         428         163         254         356         355         146 
## 0.059619949 0.057418966 0.056841611 0.056269029 0.056075388 0.055989416 
##         413         148         145           9         371         370 
## 0.053925766 0.053777718 0.052889397 0.052871770 0.052760173 0.051978828 
##         161         210         160          55         405          49 
## 0.051974602 0.051720530 0.051266874 0.050784987 0.050739934 0.050580987 
##         149         399         147         212         343         359 
## 0.050439258 0.050131441 0.050016101 0.049856941 0.049849957 0.049706326 
##         222         205         204         152         103         375 
## 0.049331516 0.049090143 0.047908395 0.047867887 0.047675351 0.047599105 
##         144         358         373         258         425         364 
## 0.047381682 0.047097925 0.046792287 0.046075731 0.045401752 0.045141575 
##         352         142         154         439         293         266 
## 0.045135912 0.044478866 0.043746740 0.043410052 0.042993928 0.042782982 
##         357         151         274         150         291         292 
## 0.042765274 0.042752049 0.042372525 0.042179627 0.042170667 0.042137328 
##         275         353         424         417         427          58 
## 0.042070055 0.041585853 0.041346282 0.041077154 0.040341252 0.040270651 
##         211         263         167         283         451         219 
## 0.040242218 0.039911950 0.039896169 0.039850819 0.039791607 0.039726954 
##         270         278         213         201         330         416 
## 0.039714145 0.039434704 0.039229328 0.038782535 0.038610652 0.038473284 
##         217          65         458         200         457         221 
## 0.038395737 0.038204058 0.038005215 0.037963012 0.037724909 0.037269036 
##         467         455         420         277         220         407 
## 0.037260235 0.037243033 0.037171433 0.037075157 0.037032514 0.036707923 
##         412         162         374         329         209          56 
## 0.036705538 0.036291759 0.036201827 0.036164021 0.035909691 0.035833255 
##          11         268         159         438         226         223 
## 0.035330406 0.035237786 0.035234370 0.035181328 0.035175233 0.034783685 
##          33         237         363         260         298         235 
## 0.034772293 0.034639249 0.034365432 0.034030422 0.033875100 0.033779410 
##         331         198         257          41         202         199 
## 0.033719147 0.033271863 0.033157771 0.032775236 0.032657130 0.032493897 
##         259         294         203         414         426         446 
## 0.032457726 0.032368460 0.032353742 0.032288498 0.032261733 0.032051005 
##         229         432         158         456          75         166 
## 0.031963683 0.031940964 0.031887200 0.031847710 0.031822870 0.031803470 
##         437         141         267         253          40         296 
## 0.031504845 0.031501661 0.031263328 0.031161037 0.031118132 0.031080374 
##         197         264         350         388         496         347 
## 0.031057358 0.031037605 0.030842588 0.030822545 0.030783114 0.030626347 
##          57          62           8         262         265         287 
## 0.030612054 0.030408437 0.030193780 0.030104110 0.030022416 0.029735066 
##         346         351         248         433         196         245 
## 0.029721737 0.029545142 0.029497466 0.029434970 0.029417703 0.029326637 
##         484         168          10         297          67         285 
## 0.029227887 0.029089095 0.029055845 0.029041105 0.029007492 0.028820607 
##         255         361         400         387         261         431 
## 0.028782421 0.028643711 0.028588027 0.028455654 0.028445865 0.028420983 
##         430         256         367          94         348         483 
## 0.028411405 0.027919162 0.027733811 0.027526004 0.027427601 0.027365910 
##         172          42          98          99         133         311 
## 0.027194736 0.026869222 0.026828430 0.026750386 0.026600078 0.026489204 
##         295         165         385         131         246         454 
## 0.026488182 0.026358885 0.026169753 0.026110767 0.026104563 0.026078381 
##         169         171         504         307         132          66 
## 0.026041901 0.025902097 0.025754662 0.025743695 0.025563862 0.025523047 
##          71         138          74         129         233         436 
## 0.025476846 0.025445554 0.025380506 0.025375486 0.025289855 0.025279877 
##         472         401         188         185         482         184 
## 0.025278382 0.025246964 0.025230690 0.025059306 0.024833412 0.024587898 
##         136         170         135         376         386         269 
## 0.024567988 0.024509168 0.024493895 0.024445552 0.024411638 0.024396733 
##         225          73         418         140         485         308 
## 0.024307029 0.024280861 0.024192171 0.024090471 0.024080167 0.024032978 
##         434         139         429         360          19          12 
## 0.024006459 0.023976004 0.023860144 0.023689843 0.023666499 0.023624074 
##         301         379          95         181         173         334 
## 0.023620412 0.023478104 0.023425791 0.023371734 0.023359198 0.023344367 
##         505          44         234          43         189         349 
## 0.023162527 0.023146444 0.023112374 0.022864841 0.022749508 0.022728381 
##          35         335         389         227          17         409 
## 0.022688400 0.022602398 0.022596101 0.022419682 0.022364850 0.022260658 
##         183         435         306         134         187         404 
## 0.022237447 0.022126215 0.022084616 0.022008104 0.021979308 0.021949344 
##         299          48         481         128         137         486 
## 0.021929749 0.021927769 0.021842046 0.021787861 0.021568396 0.021565400 
##         300         410         230         130          93         466 
## 0.021553506 0.021537718 0.021410083 0.021326962 0.020863265 0.020850220 
##         326          63         336          64         190         476 
## 0.020811798 0.020792204 0.020733554 0.020628003 0.020612504 0.020525894 
##         506         372         106         305         281         488 
## 0.020508325 0.020500750 0.020458367 0.020322084 0.020167636 0.020098264 
##         107         408          61         378         495         362 
## 0.019979657 0.019895974 0.019888577 0.019842347 0.019758476 0.019708326 
##         464         441         502          72          31         503 
## 0.019641264 0.019535733 0.019494992 0.019487509 0.019357802 0.019196742 
##         377         474         206         109         176         470 
## 0.019021473 0.018742787 0.018599895 0.018576116 0.018573909 0.018549962 
##         475          24          21         272         191         182 
## 0.018454630 0.018349943 0.018291606 0.018257713 0.018255002 0.018236258 
##         280         193          59         393         468          32 
## 0.018187588 0.018166038 0.018144941 0.018057787 0.017986886 0.017914973 
##         473          13         175         339         250         108 
## 0.017558517 0.017515956 0.017493155 0.017478416 0.017455755 0.017406656 
##         477         471         100         252         194          29 
## 0.017347496 0.017284046 0.017201538 0.017184940 0.017061589 0.016973372 
##           1         111         105         382         110         192 
## 0.016924788 0.016913019 0.016758090 0.016626899 0.016584787 0.016542322 
##         487          80         384         462          89         478 
## 0.016526408 0.016464872 0.016442195 0.016433136 0.016304202 0.016238530 
##         338         443          50           5         337          28 
## 0.016220162 0.016214412 0.016212339 0.016109512 0.016096392 0.016079911 
##         463          26         440         444         180         442 
## 0.016062520 0.016030728 0.015992441 0.015909769 0.015847614 0.015842840 
##          23         102          34           4         342         101 
## 0.015840873 0.015802714 0.015726017 0.015696418 0.015582982 0.015551153 
##         174         452          25          39          27         380 
## 0.015527300 0.015502794 0.015485697 0.015403852 0.015355867 0.015341865 
##         461         340         460         251          30          52 
## 0.015336055 0.015332114 0.015322098 0.015274908 0.015270683 0.015228053 
##         465         397         289         244         247         383 
## 0.015212785 0.015162426 0.015117256 0.015114103 0.015097999 0.015009813 
##         390         494         195         113         448         279 
## 0.014994429 0.014980418 0.014954135 0.014896418 0.014891803 0.014885192 
##         114         186         104         286          77           6 
## 0.014864058 0.014863311 0.014860553 0.014854146 0.014849574 0.014814788 
##         249          97         345         423         218         469 
## 0.014743171 0.014738668 0.014635746 0.014614515 0.014588384 0.014585919 
##          15          47         453         282         341          78 
## 0.014572238 0.014537164 0.014519597 0.014456354 0.014349909 0.014294676 
##          79         497          76          96         480          16 
## 0.014270285 0.014200634 0.014157446 0.014019282 0.013988039 0.013983437 
##          68           7         391         178         402         327 
## 0.013976068 0.013973681 0.013905019 0.013892764 0.013873574 0.013821103 
##          14         288         177          54         290         344 
## 0.013784130 0.013783235 0.013767856 0.013718117 0.013702610 0.013696161 
##          60         392         303         242         116         449 
## 0.013689553 0.013637163 0.013621520 0.013555501 0.013514623 0.013474300 
##          22         304          46         276          69         398 
## 0.013276488 0.013256003 0.013194028 0.013188283 0.013089071 0.013081086 
##         115          92         447         396         403         214 
## 0.013063499 0.013035960 0.013017456 0.013012848 0.012991087 0.012963082 
##         479         119          51          53         232         118 
## 0.012944738 0.012891868 0.012829756 0.012785420 0.012782823 0.012655996 
##         120         395          90         394         179         332 
## 0.012637141 0.012593789 0.012535015 0.012500174 0.012349145 0.012308399 
##          20         238          91         312         243         459 
## 0.012274329 0.012209535 0.012161298 0.011975197 0.011937432 0.011930106 
##           3          18         228         112          70          38 
## 0.011822955 0.011773758 0.011743897 0.011672010 0.011642797 0.011618819 
##         445         450         422         500         333         271 
## 0.011580795 0.011540876 0.011346375 0.011308995 0.011268940 0.011130278 
##           2         309         421         241         224          45 
## 0.011109178 0.011096064 0.011067840 0.010992539 0.010923043 0.010910486 
##         117         499          87         239          88         208 
## 0.010472896 0.010389131 0.010380584 0.010341193 0.010047724 0.009912134 
##          82          36         231         328         325         324 
## 0.009874485 0.009838493 0.009822593 0.009785332 0.009774782 0.009738437 
##          37         302         216         236         273         313 
## 0.009686272 0.009066012 0.009056657 0.008943379 0.008937216 0.008582156 
##         498         501         314         323         317          86 
## 0.008555365 0.008554815 0.008122787 0.008118747 0.007991405 0.007884738 
##          85         316         315         322          83         321 
## 0.007791846 0.007737680 0.007673505 0.007291172 0.007220090 0.007199388 
##          81         240         207         310          84         318 
## 0.007145677 0.007006450 0.006929322 0.006886914 0.006498251 0.005888659 
##         320         319 
## 0.005268682 0.004524537
plot(db$medv,leverage)

# observations 381, 406 and 419 look atypical, since they have the highest leverage values. LATER WE WILL CHECK WHETHER A ROBUST MODEL OFFERS ANY ADVANTAGE

# collinearity
vif(ajuste)# collinearity between rad and tax? VIF > 5
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945 
##      rad      tax  ptratio        b    lstat 
## 7.484496 9.008554 1.799084 1.348521 2.941491


According to the multiple linear model fit, the variables age and indus were NOT significant.
In addition, the vif() function shows that the strongest collinearity is between rad and tax, which matches the graphical analysis. Let us fit a linear model WITHOUT the variables age, indus and tax.

db2<-db[,-c(3,7,10)]
ajuste2<-lm(medv~.,data=db2)
summary(ajuste2)# the adjusted R2 does not improve significantly
## 
## Call:
## lm(formula = medv ~ ., data = db2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2609  -2.9888  -0.5083   1.8041  26.2482 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.712342   5.102742   6.803 2.97e-11 ***
## crim         -0.104843   0.033132  -3.164 0.001650 ** 
## zn            0.036634   0.013412   2.731 0.006532 ** 
## chas1         2.967868   0.860830   3.448 0.000614 ***
## nox         -20.314416   3.472292  -5.850 8.92e-09 ***
## rm            3.977104   0.407731   9.754  < 2e-16 ***
## dis          -1.429370   0.186922  -7.647 1.08e-13 ***
## rad           0.128761   0.040788   3.157 0.001692 ** 
## ptratio      -1.014914   0.129006  -7.867 2.30e-14 ***
## b             0.009700   0.002701   3.591 0.000363 ***
## lstat        -0.528147   0.047930 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.79 on 495 degrees of freedom
## Multiple R-squared:  0.7342, Adjusted R-squared:  0.7288 
## F-statistic: 136.7 on 10 and 495 DF,  p-value: < 2.2e-16
plot(ajuste2$residuals,ajuste2$fitted.values)# cloud of points around 0, some outliers?

plot(ajuste2$residuals)

plot(db2$medv,ajuste2$fitted.values)
abline(0,1)

hist(ajuste2$residuals)# looks approximately normal

qqnorm(ajuste2$residuals)
qqline(ajuste2$residuals)

plot(ajuste2)# THERE DOES NOT SEEM TO BE A LARGE DIFFERENCE IN FAVOUR OF THIS MODEL


We do not obtain a significant increase in the adjusted R2, so for the moment we keep the classical linear model. Let us look again at the potential outliers using the leverage values computed above.

plot(db$medv,leverage)

plot(ajuste$residuals)


Observations 381, 406 and 419 look atypical, since they have the highest leverage values. Let us check whether they show up as influential points and whether we gain anything by fitting a robust model.
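To check whether these high-leverage points are actually influential, a minimal sketch using Cook's distance on the fitted model (the usual 4/n cut-off is assumed as a rule of thumb):

cooks<-cooks.distance(ajuste)
head(sort(cooks,decreasing=T)) # observations with the largest Cook's distance
plot(cooks,type='h')
abline(h=4/nrow(db),lty=2) # 4/n rule-of-thumb threshold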

3.2 ROBUST MODEL

robusto<-lmRob(medv~., data=db)
## 00:00:00 left
summary(robusto)# much lower R2; let's see what happens if we fit the model with fewer variables
## 
## Call:
## lmRob(formula = medv ~ ., data = db)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.2737  -1.8816  -0.2092   2.2086  36.6468 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.413175   5.390138   1.190 0.234700    
## crim        -0.115853   0.085542  -1.354 0.176247    
## zn           0.029100   0.013568   2.145 0.032462 *  
## indus       -0.013139   0.057156  -0.230 0.818274    
## chas1        1.223951   0.852206   1.436 0.151577    
## nox         -5.948090   3.563890  -1.669 0.095756 .  
## rm           6.418135   0.482066  13.314  < 2e-16 ***
## age         -0.043296   0.012545  -3.451 0.000606 ***
## dis         -0.985929   0.193109  -5.106 4.72e-07 ***
## rad          0.155002   0.067996   2.280 0.023060 *  
## tax         -0.012315   0.003438  -3.582 0.000375 ***
## ptratio     -0.703856   0.120831  -5.825 1.03e-08 ***
## b            0.012306   0.002651   4.642 4.43e-06 ***
## lstat       -0.207954   0.056699  -3.668 0.000271 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.48 on 492 degrees of freedom
## Multiple R-Squared: 0.5755 
## 
## Test for Bias:
##             statistic   p-value
## M-estimate     -32.23 1.0000000
## LS-estimate     38.34 0.0004604
pred_rob<-predict(robusto)
plot(db$medv,pred_rob)
abline(0,1)

robusto2<-lmRob(medv~., data=db2)
summary(robusto2)# not much improvement, so we keep the linear model
## 
## Call:
## lmRob(formula = medv ~ ., data = db2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0563  -2.0066  -0.1088   2.2459  34.4643 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.839401   4.769500   1.224 0.221413    
## crim         -0.100231   0.032921  -3.045 0.002454 ** 
## zn            0.017316   0.012068   1.435 0.151960    
## chas1         1.886284   0.784820   2.403 0.016608 *  
## nox         -11.173988   2.935326  -3.807 0.000158 ***
## rm            6.480611   0.450549  14.384  < 2e-16 ***
## dis          -0.695383   0.164243  -4.234 2.74e-05 ***
## rad          -0.009479   0.035365  -0.268 0.788789    
## ptratio      -0.844913   0.109491  -7.717 6.63e-14 ***
## b             0.011592   0.002313   5.011 7.55e-07 ***
## lstat        -0.338140   0.044381  -7.619 1.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.689 on 495 degrees of freedom
## Multiple R-Squared: 0.5669 
## 
## Test for Bias:
##             statistic   p-value
## M-estimate     -46.11 1.000e+00
## LS-estimate     64.66 1.248e-09
# Boxplot of the robust-model residuals to identify outliers

boxplot(robusto$residuals)

plot(db$medv,robusto$fitted.values) # similar in both models

boxplot(robusto$residuals,ajuste$residuals)# they do not differ much

plot(robusto$residuals,robusto$fitted.values) # very similar to the linear model

plot(robusto) # the 3 atypical values (369, 372, 373) stand out; however, their leverage values are not that high, so they might be non-influential outliers. For now we keep the linear model because of its higher R2


According to the robust model fit with all the variables, crim, indus, chas1 and nox were NOT significant.
Three atypical values stand out: 369, 372 and 373. However, their leverage values are not especially high, so they could be non-influential outliers. In any case we still keep the linear model with all the variables, since it explains more of the variability.

4 WE EVALUATE THE PREDICTIVE PERFORMANCE OF THE 2 MODELS WITH MSE (ECM), RMSE, MAE (AME) AND MAPE (AMEp)


Linear model

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-lm(medv~.,data = db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_lineal<-mean(ecm) #23.72
RMSE_lineal<-sqrt(mean(ecm)) #4.87 root mean squared error
AME_lineal<-mean(ame)#3.38

# alternatively, with trainControl from caret

errores_lineal<-train(medv~.,data=db, method="lm", trControl=trainControl(method = "LOOCV"))
print(errores_lineal)
## Linear Regression 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 505, 505, 505, 505, 505, 505, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4.870908  0.7191505  3.382797
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# or with the Metrics package
AMEp_lineal<-mape(db$medv,ajuste$fitted.values)#0.16, mean absolute percentage error


Robust model


With the robust model we obtain a fit with a lower MAE, but a MAPE very similar to that of the linear model.
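That comparison was obtained with the same leave-one-out loop used for the linear model; a minimal sketch of the robust-model version (object names such as ECM_robusto are illustrative, and lmRob is assumed to converge on every subsample):

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg_rob<-lmRob(medv~.,data=db[-i,]) # robust fit without observation i
  ecm[i]<-(predict(reg_rob,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg_rob,db[i,])-db$medv[i])
}

ECM_robusto<-mean(ecm)
RMSE_robusto<-sqrt(mean(ecm))
AME_robusto<-mean(ame)
AMEp_robusto<-mape(db$medv,robusto$fitted.values) # in-sample MAPE, as for the linear model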

5 VARIABLE SELECTION

set.seed(1234)
Xmodelo<-model.matrix(ajuste)
Y<-db$medv
Xy<-data.frame(Xmodelo[,-1],Y)
res.bestglm <-bestglm(Xy = Xy,
                      IC = "CV",
                      method = "exhaustive")

res.bestglm$Subsets 
##     (Intercept)  crim    zn indus chas1   nox    rm   age   dis   rad   tax
## 0          TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 1          TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2          TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## 3          TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## 4          TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE
## 5*         TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## 6          TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## 7          TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## 8          TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## 9          TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## 10         TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## 11         TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## 12         TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## 13         TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##     ptratio     b lstat logLikelihood       CV
## 0     FALSE FALSE FALSE    -1122.2572 85.50373
## 1     FALSE FALSE  TRUE     -923.5046 39.90643
## 2     FALSE FALSE  TRUE     -864.7883 32.85251
## 3      TRUE FALSE  TRUE     -835.0657 29.60324
## 4      TRUE FALSE  TRUE     -825.6966 29.18787
## 5*     TRUE FALSE  TRUE     -810.7364 27.78703
## 6      TRUE FALSE  TRUE     -803.9866 27.96785
## 7      TRUE  TRUE  TRUE     -798.2363 27.95317
## 8      TRUE  TRUE  TRUE     -794.1546 27.84265
## 9      TRUE  TRUE  TRUE     -790.8362 28.70348
## 10     TRUE  TRUE  TRUE     -786.0154 27.80005
## 11     TRUE  TRUE  TRUE     -780.8803 28.33435
## 12     TRUE  TRUE  TRUE     -780.8228 28.39083
## 13     TRUE  TRUE  TRUE     -780.8214 29.35995


The best model is model 5 (lowest cross-validation error, CV = 27.79). In this model the variables crim, chas, zn, indus, age, rad, tax and b are excluded.
Let us check whether fitting a linear model without those variables improves the metrics.

db3<-db[,-c(1,2,3,4,7,9,10,12)]
ajuste3<-lm(medv~.,data=db3)
summary(ajuste3)#0.70 adjusted R2, no significant improvement
## 
## Call:
## lm(formula = medv ~ ., data = db3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.7765  -3.0186  -0.6481   1.9752  27.7625 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.49920    4.61295   8.129 3.43e-15 ***
## nox         -17.99657    3.26095  -5.519 5.49e-08 ***
## rm            4.16331    0.41203  10.104  < 2e-16 ***
## dis          -1.18466    0.16842  -7.034 6.64e-12 ***
## ptratio      -1.04577    0.11352  -9.212  < 2e-16 ***
## lstat        -0.58108    0.04794 -12.122  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.994 on 500 degrees of freedom
## Multiple R-squared:  0.7081, Adjusted R-squared:  0.7052 
## F-statistic: 242.6 on 5 and 500 DF,  p-value: < 2.2e-16
errores_lineal3<-train(medv~.,data=db3, method="lm", trControl=trainControl(method = "LOOCV"))
print(errores_lineal3)
## Linear Regression 
## 
## 506 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation 
## Summary of sample sizes: 505, 505, 505, 505, 505, 505, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   5.063549  0.6963521  3.548789
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
AMEp_lineal3<-mape(db$medv,ajuste3$fitted.values)


We do not see a large reduction in the errors, nor an improvement in the adjusted R2, so we still keep the linear model with all the variables.

5.1 RIDGE REGRESSION


Ridge shrinks the coefficients towards (but not to) zero to avoid overfitting, while keeping all the variables.

train<-db
# Linear model
set.seed(1234)
lm<-train(medv~.,
          train,
          method="lm")
lm$results
##   intercept    RMSE  Rsquared      MAE    RMSESD RsquaredSD     MAESD
## 1      TRUE 4.92983 0.7180031 3.450138 0.3904757 0.04027113 0.2519472
summary(lm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas1        2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## b            9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
plot(lm$finalModel)

# ridge regression
set.seed(1234)
ridge<-train(medv~.,
          train,
          method="glmnet",
          tuneGrid=expand.grid(alpha=0,
                               lambda= seq(0.001, 1, length=5)))
plot(ridge)

ridge
## glmnet 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results across tuning parameters:
## 
##   lambda   RMSE      Rsquared   MAE     
##   0.00100  4.935964  0.7181718  3.377426
##   0.25075  4.935964  0.7181718  3.377426
##   0.50050  4.935964  0.7181718  3.377426
##   0.75025  4.940556  0.7180547  3.376129
##   1.00000  4.956581  0.7172357  3.376619
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.5005.
plot(ridge$finalModel, xvar= "lambda", label= T)

plot(ridge$finalModel, xvar= "dev", label= T)

plot(varImp(ridge, scale = T))


The best lambda value for our ridge regression is about 0.5 (lambda = 0.5005). In the “RMSE vs Regularization Parameter” plot we can see that for lambda values above 0.5 the RMSE increases.

In the “Coefficients vs Log Lambda” plot we see how increasing lambda shrinks the coefficient values; unlike lasso, ridge shrinks coefficients towards zero without setting them exactly to zero, so no variables are removed.
In the “Coefficients vs Fraction Deviance Explained” plot, the point where the coefficients begin to grow corresponds to roughly 20% of the explained variability. Towards the right of the plot, beyond 60%, the coefficients grow sharply; a model with coefficients of that size would be expected to overfit.
Finally, the “Variable Importance” plot shows that the most important variables for the ridge model are nox, rm, chas1, dis, ptratio and lstat.
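To see the shrinkage numerically, a short sketch extracting the coefficients of the final glmnet fit at the selected lambda (0.5005, reported above):

coef(ridge$finalModel,s=ridge$bestTune$lambda) # all 13 predictors kept, coefficients shrunk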

5.2 LASSO REGRESSION


Lasso shrinks the regression coefficients, some of them all the way to zero. For this reason it can be used for variable selection.

set.seed(1234)
lasso<-train(medv~.,
          train,
          method="glmnet",
          tuneGrid=expand.grid(alpha=1,
                               lambda= seq(0.001, 1, length=5)))
plot(lasso)

lasso
## glmnet 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results across tuning parameters:
## 
##   lambda   RMSE      Rsquared   MAE     
##   0.00100  4.928988  0.7180141  3.443731
##   0.25075  5.104895  0.6992171  3.479355
##   0.50050  5.271761  0.6825255  3.616864
##   0.75025  5.372117  0.6756955  3.700316
##   1.00000  5.466256  0.6714291  3.778905
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.001.
plot(lasso$finalModel, xvar="lambda", label= T)

plot(lasso$finalModel, xvar="dev", label= T)

plot(varImp(lasso, scale=F))


In the “RMSE vs Regularization Parameter” plot we can see that as lambda increases, the RMSE increases.

In the “Coefficients vs Log Lambda” plot we see how increasing lambda shrinks the coefficient values.
Again, in the “Coefficients vs Fraction Deviance Explained” plot, the point where the coefficients begin to grow corresponds to roughly 20% of the explained variability. We also see that about 60% of the variability appears to be explained by 3 variables: rm (6), ptratio (11) and lstat (13).

Finally, as with ridge, the “Variable Importance” plot shows that the most important variables for the lasso model are nox, rm, chas1, dis, ptratio and lstat.
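At the selected lambda (0.001) the penalty is almost zero, so no coefficient is dropped; a short sketch with a stronger penalty (s = 0.5, an arbitrary value chosen only for illustration) shows lasso setting some coefficients exactly to zero:

coef(lasso$finalModel,s=lasso$bestTune$lambda) # almost unpenalized: all coefficients non-zero
coef(lasso$finalModel,s=0.5) # stronger penalty: the weakest predictors are shrunk exactly to zero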

model_list<-list(ModeloLineal= lm, ModeloRidge= ridge, ModeloLasso= lasso)
res<-resamples(model_list)
summary(res)
## 
## Call:
## summary.resamples(object = res)
## 
## Models: ModeloLineal, ModeloRidge, ModeloLasso 
## Number of resamples: 25 
## 
## MAE 
##                  Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## ModeloLineal 3.110125 3.330665 3.405184 3.450138 3.518714 4.332531    0
## ModeloRidge  3.071106 3.273877 3.331695 3.377426 3.448563 4.114743    0
## ModeloLasso  3.103903 3.329793 3.398868 3.443731 3.513852 4.322139    0
## 
## RMSE 
##                  Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## ModeloLineal 4.269726 4.658180 4.859985 4.929830 5.127862 5.899336    0
## ModeloRidge  4.256486 4.640930 4.854830 4.935964 5.120946 5.750630    0
## ModeloLasso  4.267664 4.655568 4.861354 4.928988 5.126009 5.891671    0
## 
## Rsquared 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## ModeloLineal 0.6447961 0.6906079 0.7175997 0.7180031 0.7555193 0.7711211    0
## ModeloRidge  0.6417932 0.6882782 0.7250665 0.7181718 0.7528197 0.7808316    0
## ModeloLasso  0.6437923 0.6903470 0.7181786 0.7180141 0.7560319 0.7714477    0
bwplot(res)


Comparing the 3 models (linear, ridge and lasso), the lowest average RMSE was obtained with the lasso model.
However, if we look at Rsquared, which measures the variability of the response explained by the independent variables in the model, the highest value was obtained with the ridge model.
Graphically (boxplots), it is very hard to tell which model is best.
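Because the boxplots overlap so much, a sketch using caret's paired resample differences (computed over the same 25 bootstrap resamples) can help quantify whether the gaps between models are meaningful:

dif<-diff(res) # pairwise differences between the three models
summary(dif) # paired tests on RMSE, Rsquared and MAE differences
dotplot(dif)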

5.3 GENERALIZED ADDITIVE MODELS (GAM)


We fit a generalized additive model to evaluate non-linear effects of the variables on the property values, excluding the categorical variable (chas).

ajus.gam <- gam::gam(medv~s(crim)+s(zn)+s(indus)+s(nox)+s(rm)+s(age)+s(dis)+s(rad)+s(tax)+s(ptratio)+s(b)+s(lstat),data=db)
summary(ajus.gam)
## 
## Call: gam::gam(formula = medv ~ s(crim) + s(zn) + s(indus) + s(nox) + 
##     s(rm) + s(age) + s(dis) + s(rad) + s(tax) + s(ptratio) + 
##     s(b) + s(lstat), data = db)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.0856  -1.8141  -0.1376   1.5072  24.4029 
## 
## (Dispersion Parameter for gaussian family taken to be 12.9079)
## 
##     Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 5898.885 on 456.9997 degrees of freedom
## AIC: 2778.693 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(crim)      1  9658.1  9658.1 748.236 < 2.2e-16 ***
## s(zn)        1  2107.7  2107.7 163.291 < 2.2e-16 ***
## s(indus)     1  2983.4  2983.4 231.131 < 2.2e-16 ***
## s(nox)       1   193.1   193.1  14.964 0.0001255 ***
## s(rm)        1 10742.4 10742.4 832.237 < 2.2e-16 ***
## s(age)       1   223.6   223.6  17.321 3.774e-05 ***
## s(dis)       1  1230.4  1230.4  95.322 < 2.2e-16 ***
## s(rad)       1   149.4   149.4  11.576 0.0007268 ***
## s(tax)       1   197.4   197.4  15.295 0.0001059 ***
## s(ptratio)   1   941.3   941.3  72.922 < 2.2e-16 ***
## s(b)         1   367.4   367.4  28.462 1.507e-07 ***
## s(lstat)     1  3205.4  3205.4 248.330 < 2.2e-16 ***
## Residuals  457  5898.9    12.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F     Pr(F)    
## (Intercept)                             
## s(crim)           3  6.185 0.0003986 ***
## s(zn)             3  1.840 0.1389846    
## s(indus)          3  1.835 0.1400254    
## s(nox)            3  5.151 0.0016422 ** 
## s(rm)             3 43.896 < 2.2e-16 ***
## s(age)            3  1.202 0.3084023    
## s(dis)            3  8.595 1.463e-05 ***
## s(rad)            3  2.519 0.0574729 .  
## s(tax)            3  3.838 0.0098216 ** 
## s(ptratio)        3  1.864 0.1347940    
## s(b)              3  3.401 0.0177139 *  
## s(lstat)          3 24.829 6.661e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pred1<-predict(ajus.gam,newdata=db)
plot(db$medv,pred1)
abline(0,1)

deviance_explicada_gam<-with(summary(ajus.gam), 1 - deviance/null.deviance)#0.86

# Errors

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-gam::gam(medv~s(crim)+s(zn)+s(indus)+s(nox)+s(rm)+s(age)+s(dis)+s(rad)+s(tax)+s(ptratio)+s(b)+s(lstat),data=db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_gam<-mean(ecm)#14.46
RMSE_gam<-sqrt(mean(ecm))#3.80
AME_gam<-mean(ame) #2.54 

# with Metrics
AMEp_gam<-mape(db$medv,ajus.gam$fitted.values)#0.11, decreased compared with lm


Seven variables show significant non-linear effects on medv: crim, nox, rm, dis, tax, b and lstat. In addition, the LOOCV MAE and MAPE decreased relative to the same metrics for the linear model, with 0.86 of the variability explained.
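To visualise these non-linear effects, the smooth terms of the fitted GAM can be plotted directly (one panel per s() term, with standard-error bands):

par(mfrow=c(3,4))
plot(ajus.gam,se=TRUE)
par(mfrow=c(1,1))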

6 PROJECTION PURSUIT REGRESSION


Now, to highlight the projections (linear combinations of predictors) that explain most of the variability of the property values (medv), as well as potential interactions, we fit a PPR model.

ajus.ppr<-ppr(medv~., data=db, nterms=3, max.terms=3, trace=TRUE)
summary(ajus.ppr)
## Call:
## ppr(formula = medv ~ ., data = db, nterms = 3, max.terms = 3, 
##     trace = TRUE)
## 
## Goodness of fit:
## 3 terms 
## 6450.42 
## 
## Projection direction vectors ('alpha'):
##         term 1        term 2        term 3       
## crim    -0.0122571909 -0.0916661467 -0.0137582119
## zn       0.0034600458  0.0002062988  0.0010534628
## indus    0.0025406952 -0.0392291042 -0.0056625920
## chas0   -0.0563795353  0.0073259020 -0.2370857319
## chas1   -0.0100221353 -0.0185486340  0.2571501889
## nox     -0.3886291182 -0.4427070987 -0.9115089898
## rm       0.8098936553 -0.8492344197 -0.1808002043
## age     -0.0016413625 -0.0100467408  0.0071509957
## dis     -0.2267340948 -0.2121355974  0.0613418252
## rad      0.0935064387  0.0763908334 -0.0251383900
## tax     -0.0022896976 -0.0049369906  0.0007246877
## ptratio -0.2006450217 -0.1209878475  0.0836028747
## b        0.0072518587 -0.0023510832 -0.0024094483
## lstat   -0.2985316731  0.0828612574 -0.0496251380
## 
## Coefficients of ridge terms ('beta'):
##   term 1   term 2   term 3 
## 8.397303 1.593870 1.378651
plot(ajus.ppr)

pred2 <- predict(ajus.ppr)
plot(db$medv, pred2)
abline(0,1,col='green')
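
The LOOCV errors for the PPR fit were computed with the same leave-one-out loop used before; a sketch of that computation (object names such as ECM_ppr are illustrative):

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-ppr(medv~.,data=db[-i,],nterms=3,max.terms=3)
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_ppr<-mean(ecm)
RMSE_ppr<-sqrt(mean(ecm))
AME_ppr<-mean(ame)
AMEp_ppr<-mape(db$medv,ajus.ppr$fitted.values)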


Most of the variability in the data is explained by the first term, driven by nox and lstat (negative loadings) and rm (positive loading); rm shows an opposite sign of similar magnitude in the second term, which could indicate that for lower property values the number of rooms does not decrease much.
The plot shows that the MSE drops markedly when 3 terms are used, and the AME and AMEp are similar to the GAM fit. Let us see whether lm and GAM improve further when interactions are incorporated.
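
The AME and AMEp for PPR are not computed explicitly above; a sketch like the following (not run in this document) would obtain them with the same leave-one-out scheme used for the other models.

# LOOCV errors for the PPR fit (sketch; ECM_ppr, RMSE_ppr, AME_ppr, AMEp_ppr are hypothetical names)
ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-ppr(medv~.,data=db[-i,],nterms=3,max.terms=3)
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}
ECM_ppr<-mean(ecm)
RMSE_ppr<-sqrt(mean(ecm))
AME_ppr<-mean(ame)
AMEp_ppr<-mape(db$medv,ajus.ppr$fitted.values)# in-sample, as for the other models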

7 GAM AND LM WITH INTERACTION

# GAM with interaction
ajus.gam.interac <- gam::gam(medv~s(crim)+s(zn)+s(indus)+s(nox)+s(rm)+s(age)+s(dis)+s(rad)+s(tax)+s(ptratio)+s(b)+s(lstat)+s(nox*lstat),data=db)
summary(ajus.gam.interac)
## 
## Call: gam::gam(formula = medv ~ s(crim) + s(zn) + s(indus) + s(nox) + 
##     s(rm) + s(age) + s(dis) + s(rad) + s(tax) + s(ptratio) + 
##     s(b) + s(lstat) + s(nox * lstat), data = db)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.6701  -1.6262  -0.1073   1.5659  23.3878 
## 
## (Dispersion Parameter for gaussian family taken to be 12.1607)
## 
##     Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 5508.816 on 452.9998 degrees of freedom
## AIC: 2752.076 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df  Sum Sq Mean Sq  F value    Pr(>F)    
## s(crim)          1  9195.1  9195.1 756.1276 < 2.2e-16 ***
## s(zn)            1  1062.3  1062.3  87.3524 < 2.2e-16 ***
## s(indus)         1  3091.6  3091.6 254.2257 < 2.2e-16 ***
## s(nox)           1   444.0   444.0  36.5098 3.173e-09 ***
## s(rm)            1 10610.7 10610.7 872.5371 < 2.2e-16 ***
## s(age)           1    52.1    52.1   4.2804 0.0391217 *  
## s(dis)           1  1335.8  1335.8 109.8486 < 2.2e-16 ***
## s(rad)           1   159.8   159.8  13.1397 0.0003219 ***
## s(tax)           1   180.8   180.8  14.8681 0.0001320 ***
## s(ptratio)       1   731.0   731.0  60.1127 5.962e-14 ***
## s(b)             1   362.4   362.4  29.7982 7.912e-08 ***
## s(lstat)         1  3820.7  3820.7 314.1851 < 2.2e-16 ***
## s(nox * lstat)   1  1075.2  1075.2  88.4140 < 2.2e-16 ***
## Residuals      453  5508.8    12.2                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F     Pr(F)    
## (Intercept)                                
## s(crim)              3  3.712  0.011660 *  
## s(zn)                3  2.257  0.081048 .  
## s(indus)             3  1.891  0.130254    
## s(nox)               3  3.269  0.021157 *  
## s(rm)                3 38.913 < 2.2e-16 ***
## s(age)               3  1.196  0.310778    
## s(dis)               3  8.103 2.877e-05 ***
## s(rad)               3  2.800  0.039615 *  
## s(tax)               3  3.942  0.008536 ** 
## s(ptratio)           3  1.430  0.233262    
## s(b)                 3  2.918  0.033872 *  
## s(lstat)             3 13.823 1.220e-08 ***
## s(nox * lstat)       3 15.990 6.754e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Errors (leave-one-out cross-validation)

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-gam::gam(medv~s(crim)+s(zn)+s(indus)+s(nox)+s(rm)+s(age)+s(dis)+s(rad)+s(tax)+s(ptratio)+s(b)+s(lstat)+s(nox*lstat),data=db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_gam_inter<-mean(ecm)# 14.02 with nox*lstat
RMSE_gam_inter<-sqrt(mean(ecm))# 3.74
AME_gam_inter<-mean(ame) # 2.45; the errors improved relative to GAM without interaction and to PPR

# with Metrics
AMEp_gam_interac<-mape(db$medv,ajus.gam.interac$fitted.values) # 0.11, same as GAM and PPR

pred_inter<-predict(ajus.gam.interac)
plot(db$medv,pred_inter)
abline(0,1)# the observed-vs-predicted plot improved

deviance_explicada_interac<-with(summary(ajus.gam.interac), 1 - deviance/null.deviance) # 0.87, slightly better than GAM without interaction

# plot of nox vs lstat to understand this interaction a little better

gp13<-ggplot(db,aes(x=lstat,y=nox))+
  geom_point(col="blue")
gp13
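
To see why the nox*lstat term helps, the same scatter can also be coloured by the response (a small sketch; gp13b is a hypothetical object name).

gp13b<-ggplot(db,aes(x=lstat,y=nox,colour=medv))+
  geom_point()
gp13b# point colour shows how medv varies over the joint range of lstat and nox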

# lm with interaction

lineal.interac<-lm(medv~crim+zn+indus+nox+chas+rm+age+dis+rad+tax+ptratio+b+lstat+rm*lstat,data=db)
summary(lineal.interac)# the rm*lstat interaction was significant, R2 of 0.80, an improvement over lm without interaction but not as large as the GAM with the nox*lstat interaction; the nox*lstat interaction was not significant in the linear model
## 
## Call:
## lm(formula = medv ~ crim + zn + indus + nox + chas + rm + age + 
##     dis + rad + tax + ptratio + b + lstat + rm * lstat, data = db)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.5738  -2.3319  -0.3584   1.8149  27.9558 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.073638   5.038175   1.206 0.228582    
## crim         -0.157100   0.028808  -5.453 7.85e-08 ***
## zn            0.027199   0.012020   2.263 0.024083 *  
## indus         0.052272   0.053475   0.978 0.328798    
## nox         -15.051627   3.324807  -4.527 7.51e-06 ***
## chas1         2.051584   0.750060   2.735 0.006459 ** 
## rm            7.958907   0.488520  16.292  < 2e-16 ***
## age           0.013466   0.011518   1.169 0.242918    
## dis          -1.120269   0.175498  -6.383 4.02e-10 ***
## rad           0.320355   0.057641   5.558 4.49e-08 ***
## tax          -0.011968   0.003267  -3.664 0.000276 ***
## ptratio      -0.721302   0.115093  -6.267 8.06e-10 ***
## b             0.003985   0.002371   1.681 0.093385 .  
## lstat         1.844883   0.191833   9.617  < 2e-16 ***
## rm:lstat     -0.418259   0.032955 -12.692  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.122 on 491 degrees of freedom
## Multiple R-squared:  0.8047, Adjusted R-squared:  0.7991 
## F-statistic: 144.5 on 14 and 491 DF,  p-value: < 2.2e-16
hist(lineal.interac$residuals) 

qqnorm(lineal.interac$residuals)
qqline(lineal.interac$residuals)

plot(lineal.interac$residuals)

plot(lineal.interac$residuals,lineal.interac$fitted.values)

plot(db$medv,lineal.interac$fitted.values)
abline(0,1)

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-lm(medv~crim+zn+indus+nox+chas+rm+age+dis+rad+tax+ptratio+b+lstat+rm*lstat,data = db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_lineal.interac<-mean(ecm) #18.04
RMSE_lineal.interac<-sqrt(mean(ecm)) #4.24
AME_lineal.interac<-mean(ame)#2.86
AMEp_lineal.interac<-mape(db$medv,lineal.interac$fitted.values)# 0.13; the errors improved relative to lm without interaction, but do not beat GAM with interaction
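
A formal check that the rm*lstat term adds to the plain additive linear model can be done with a nested F-test (a sketch; lineal_base is refit here and is not an object from this script).

lineal_base<-lm(medv~.,data=db)# additive model with all predictors, nested within lineal.interac
anova(lineal_base,lineal.interac)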


# Linear model with the variables that were important for ridge and lasso, plus b, because we are interested in studying this variable

lineal_selec<-lm(medv~b+nox+rm+chas+dis+ptratio+lstat+rm*lstat,data=db)
summary(lineal_selec)# adjusted R2 of 0.78, but it does not beat GAM with the nox*lstat interaction
## 
## Call:
## lm(formula = medv ~ b + nox + rm + chas + dis + ptratio + lstat + 
##     rm * lstat, data = db)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.4728  -2.3716  -0.3362   1.7577  30.2179 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.609545   4.986077   0.122 0.902751    
## b             0.004736   0.002365   2.002 0.045788 *  
## nox         -12.120534   2.891198  -4.192 3.27e-05 ***
## rm            8.221198   0.485310  16.940  < 2e-16 ***
## chas1         2.598006   0.773042   3.361 0.000837 ***
## dis          -0.980602   0.146022  -6.715 5.14e-11 ***
## ptratio      -0.677280   0.101680  -6.661 7.23e-11 ***
## lstat         1.742538   0.194612   8.954  < 2e-16 ***
## rm:lstat     -0.401815   0.033483 -12.000  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.303 on 497 degrees of freedom
## Multiple R-squared:  0.7846, Adjusted R-squared:  0.7811 
## F-statistic: 226.3 on 8 and 497 DF,  p-value: < 2.2e-16
hist(lineal_selec$residuals) # looks fine

qqnorm(lineal_selec$residuals)
qqline(lineal_selec$residuals)

plot(lineal_selec$residuals)

plot(db$medv,lineal_selec$fitted.values)# remains the same
abline(0,1)

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-lm(medv~b+nox+rm+chas+dis+ptratio+lstat+rm*lstat,data = db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_lineal.selec<-mean(ecm) #19.37
RMSE_lineal.selec<-sqrt(mean(ecm)) #4.40
AME_lineal.selec<-mean(ame)#2.93
AMEp_lineal.selec<-mape(db$medv,lineal_selec$fitted.values)# 0.14; does not beat the full linear model with interaction nor the GAM model

# GAM with the variables important for lasso and ridge, with interaction
ajus.gam4 <- gam::gam(medv~s(nox)+s(rm)+s(dis)+s(ptratio)+s(b)+s(lstat)+s(nox*lstat),data=db)
summary(ajus.gam4)
## 
## Call: gam::gam(formula = medv ~ s(nox) + s(rm) + s(dis) + s(ptratio) + 
##     s(b) + s(lstat) + s(nox * lstat), data = db)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.6941  -1.8216  -0.1427   1.6689  25.1910 
## 
## (Dispersion Parameter for gaussian family taken to be 14.3977)
## 
##     Null Deviance: 42716.3 on 505 degrees of freedom
## Residual Deviance: 6867.711 on 476.9996 degrees of freedom
## AIC: 2815.64 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## s(nox)           1  9136.6  9136.6 634.585 < 2.2e-16 ***
## s(rm)            1 12521.1 12521.1 869.656 < 2.2e-16 ***
## s(dis)           1   754.2   754.2  52.386 1.835e-12 ***
## s(ptratio)       1  1328.1  1328.1  92.241 < 2.2e-16 ***
## s(b)             1   832.6   832.6  57.826 1.535e-13 ***
## s(lstat)         1  4799.7  4799.7 333.368 < 2.2e-16 ***
## s(nox * lstat)   1  1302.7  1302.7  90.482 < 2.2e-16 ***
## Residuals      477  6867.7    14.4                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F     Pr(F)    
## (Intercept)                                
## s(nox)               3  3.079  0.027254 *  
## s(rm)                3 36.668 < 2.2e-16 ***
## s(dis)               3  3.848  0.009662 ** 
## s(ptratio)           3  1.799  0.146570    
## s(b)                 3  2.417  0.065651 .  
## s(lstat)             3 17.568 7.822e-11 ***
## s(nox * lstat)       3 10.990 5.463e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
deviance_explicada_gam4<-with(summary(ajus.gam4), 1 - deviance/null.deviance)#0.84

pred_gam4<-predict(ajus.gam4)
plot(db$medv,pred_gam4)
abline(0,1)# no change

ecm<-c()
ame<-c()
for(i in 1:nrow(db))
{
  reg<-gam::gam(medv~s(nox)+s(rm)+s(dis)+s(ptratio)+s(b)+s(lstat)+s(nox*lstat),data=db[-i,])
  ecm[i]<-(predict(reg,db[i,])-db$medv[i])^2
  ame[i]<-abs(predict(reg,db[i,])-db$medv[i])
}

ECM_gam4<-mean(ecm)# ~15.9 with nox*lstat (consistent with the RMSE below)
RMSE_gam4<-sqrt(mean(ecm))# 3.99
AME_gam4<-mean(ame) # 2.60; slightly worse than both full GAM fits (2.54 and 2.45 above)
AMEp_gam4<-mape(db$medv,ajus.gam4$fitted.values)# 0.12


Among the models with interactions, the one with the lowest average prediction error, and the one that best explains the variability in the data, is the GAM with the nox*lstat interaction. Fitting a linear model with the rm*lstat interaction using only the variables most important for ridge and lasso improves the adjusted R2 relative to the linear model without interaction, but its errors do not improve on the full linear model with interaction; the corresponding GAM fit with the nox*lstat interaction did not improve predictive performance either.
We will now build a single-hidden-layer neural network with 5 neurons, first with all the variables and then with those that were important for the ridge and lasso regressions.

8 ANN WITH ALL THE VARIABLES AND WITH THE VARIABLES IMPORTANT ACCORDING TO LASSO AND RIDGE

ajus.nnet <- nnet(medv~.,data=db, size=5, decay=0.5, 
                  maxit=3000, linout=1, trace=TRUE)
## # weights:  76
## initial  value 288026.627791 
## iter  10 value 29964.501572
## iter  20 value 24848.419409
## iter  30 value 22023.433269
## iter  40 value 20191.818518
## iter  50 value 18664.583677
## iter  60 value 18181.202342
## iter  70 value 15925.572382
## iter  80 value 15677.943205
## iter  90 value 14961.556218
## iter 100 value 14243.326201
## iter 110 value 13819.685653
## iter 120 value 13120.103172
## iter 130 value 12908.655257
## iter 140 value 12541.953748
## iter 150 value 11842.701694
## iter 160 value 11530.867721
## iter 170 value 10923.597087
## iter 180 value 10255.138690
## iter 190 value 9732.738250
## iter 200 value 8662.015279
## iter 210 value 8290.874766
## iter 220 value 7906.241642
## iter 230 value 7519.830665
## iter 240 value 6744.513790
## iter 250 value 6163.236195
## iter 260 value 5951.581974
## iter 270 value 5793.877766
## iter 280 value 5606.429365
## iter 290 value 5419.761968
## iter 300 value 5221.966268
## iter 310 value 5117.460280
## iter 320 value 4988.722928
## iter 330 value 4913.736954
## iter 340 value 4881.490832
## iter 350 value 4744.725874
## iter 360 value 4715.760124
## iter 370 value 4667.615277
## iter 380 value 4566.591056
## iter 390 value 4438.957242
## iter 400 value 4371.127087
## iter 410 value 4362.749233
## iter 420 value 4361.454234
## iter 430 value 4360.401799
## iter 440 value 4359.900322
## final  value 4359.895953 
## converged
plotnet(ajus.nnet)

summary(ajus.nnet)
## a 13-5-1 network with 76 weights
## options were - linear output units  decay=0.5
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##   -0.28   -2.85    0.27    0.14    2.34   -0.28   -2.26   -0.01   -2.37    0.81 
## i10->h1 i11->h1 i12->h1 i13->h1 
##   -0.04   -1.13    0.19   -0.97 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##    0.46   -0.17    0.13   -0.33    0.50   -7.75   -0.88    0.11   -5.01   -1.72 
## i10->h2 i11->h2 i12->h2 i13->h2 
##   -0.06    4.98   -0.01   -0.23 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##    1.08   -0.25   -0.13    2.01    0.70    1.11    3.58   -0.28    0.61    0.95 
## i10->h3 i11->h3 i12->h3 i13->h3 
##   -0.13    1.71    0.12   -2.64 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4  i9->h4 
##    2.30   -0.80    0.08    0.44    3.24   -1.57    2.19   -0.04   -1.12    1.01 
## i10->h4 i11->h4 i12->h4 i13->h4 
##   -0.01   -1.05    0.00   -0.58 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5  i9->h5 
##   -8.05    1.36    0.00   -0.09   -1.01   -5.70    2.41   -0.01   -0.23    0.14 
## i10->h5 i11->h5 i12->h5 i13->h5 
##    0.00   -0.07   -0.01   -0.11 
##  b->o h1->o h2->o h3->o h4->o h5->o 
## -8.26  4.13 20.33  4.24 10.60 18.40
pred3 <- predict(ajus.nnet)
plot(db$medv, pred3, cex=0.5, pch=1)
abline(0,1,col='green')## the observed-vs-predicted plot improves
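
The network above uses a fixed size of 5 and decay of 0.5. As a side note, these choices could be cross-validated with the caret package loaded at the beginning, which can also standardise the predictors, something nnet is sensitive to (a sketch under that assumption, not the procedure followed in this document).

set.seed(123)
cv_nnet<-train(medv~.,data=db,method="nnet",
               preProcess=c("center","scale"),
               trControl=trainControl(method="cv",number=10),
               tuneGrid=expand.grid(size=c(3,5,7),decay=c(0.1,0.5,1)),
               linout=TRUE,maxit=3000,trace=FALSE)
cv_nnet$bestTune# chosen size and decay
cv_nnet$results# cross-validated RMSE and MAE for each combination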

# Neural network with fewer variables

ajus.nnet2 <- nnet(medv~b+nox+rm+chas+dis+ptratio+lstat,data=db,size=5, decay=0.5, 
                  maxit=3000, linout=1, trace=TRUE)
## # weights:  46
## initial  value 312683.990521 
## iter  10 value 39126.663150
## iter  20 value 38010.484921
## iter  30 value 23824.345249
## iter  40 value 19241.144989
## iter  50 value 17651.739123
## iter  60 value 16908.755146
## iter  70 value 15836.918123
## iter  80 value 13841.002954
## iter  90 value 11994.695740
## iter 100 value 11271.563764
## iter 110 value 10152.027129
## iter 120 value 9760.299538
## iter 130 value 9457.257795
## iter 140 value 9042.327294
## iter 150 value 8347.823865
## iter 160 value 7632.409028
## iter 170 value 6938.434860
## iter 180 value 6550.162263
## iter 190 value 6403.358000
## iter 200 value 6134.655888
## iter 210 value 5519.572033
## iter 220 value 5358.310259
## iter 230 value 5229.985974
## iter 240 value 5115.316379
## iter 250 value 4952.953825
## iter 260 value 4900.770395
## iter 270 value 4888.587826
## final  value 4888.562118 
## converged
plotnet(ajus.nnet2)

summary(ajus.nnet2)
## a 7-5-1 network with 46 weights
## options were - linear output units  decay=0.5
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 
##   0.22   0.15  -0.07  -5.46  -0.73  -0.37  -0.81   1.60 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 
##  -5.04  -0.01  -1.52   1.18   0.96  -0.55   0.26  -0.18 
##  b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 
##  13.28   0.00  -4.20  -0.42   0.12   0.25  -0.39  -0.06 
##  b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 
##   2.62   0.05  -1.07  -0.29   0.25 -11.26   0.40  -1.04 
##  b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 
##  -8.39   0.00  -2.73   2.22  -0.84   0.10  -0.33  -0.16 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##  -2.63   7.95  11.88  14.06  23.31  19.59
pred4 <- predict(ajus.nnet2)
plot(db$medv, pred4, cex=0.5, pch=1)# same as the network with all the variables
abline(0,1,col='blue')


Neither the neural network with all the variables nor the one restricted to the variables most important according to lasso and ridge improves the prediction metrics relative to the GAM with interaction.
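
That comparison can be made explicit with the same in-sample AMEp metric used for the other models (a sketch; the AMEp_nnet names are hypothetical).

AMEp_nnet<-mape(db$medv,as.numeric(ajus.nnet$fitted.values))
AMEp_nnet2<-mape(db$medv,as.numeric(ajus.nnet2$fitted.values))
c(AMEp_nnet=AMEp_nnet,AMEp_nnet2=AMEp_nnet2,AMEp_gam_interac=AMEp_gam_interac)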


According to AME and AMEp, the model with the best predictive ability was the generalized additive model with all the variables and the interaction between nox and lstat, while the model with the greatest explanatory power appears to be the linear model with all the variables and the rm*lstat interaction; both interactions were brought to light by the PPR model.
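
To make this ranking explicit, the error metrics already stored in this script can be gathered into a single table (a sketch; comparacion is a hypothetical object name).

comparacion<-rbind(
  gam=c(ECM_gam,RMSE_gam,AME_gam,AMEp_gam),
  gam_interac=c(ECM_gam_inter,RMSE_gam_inter,AME_gam_inter,AMEp_gam_interac),
  gam_selec=c(ECM_gam4,RMSE_gam4,AME_gam4,AMEp_gam4),
  lineal_interac=c(ECM_lineal.interac,RMSE_lineal.interac,AME_lineal.interac,AMEp_lineal.interac),
  lineal_selec=c(ECM_lineal.selec,RMSE_lineal.selec,AME_lineal.selec,AMEp_lineal.selec)
)
colnames(comparacion)<-c("ECM","RMSE","AME","AMEp")
round(comparacion,2)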


The variables that best explain the phenomenon are: crim, zn, nox, chas, rm, dis, rad, tax, ptratio and lstat.