sma <- read.table("sma.txt", row.names = 1)

names(sma) <- c("Area", "PobT", "PCiudad", "PSenior", "Medicos", "CHosp",
"Grad", "Laboral", "Ingresos", "Delitos", "Region")

# Variable de respuesta
sma$TDelitos <- sma$Delitos/sma$PobT

sma = sma[,-c(2, 10)]
summary(sma$TDelitos)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.69   45.97   56.55   55.87   64.38   93.58
summary(sma)
##       Area          PCiudad          PSenior          Medicos     
##  Min.   :   47   Min.   :  0.00   Min.   : 3.900   Min.   :  140  
##  1st Qu.: 1384   1st Qu.: 29.70   1st Qu.: 8.300   1st Qu.:  436  
##  Median : 1803   Median : 39.50   Median : 9.700   Median :  760  
##  Mean   : 2580   Mean   : 41.78   Mean   : 9.821   Mean   : 1778  
##  3rd Qu.: 2866   3rd Qu.: 51.10   3rd Qu.:10.700   3rd Qu.: 1874  
##  Max.   :27293   Max.   :100.00   Max.   :21.800   Max.   :25627  
##      CHosp            Grad          Laboral          Ingresos    
##  Min.   :  481   Min.   :30.30   Min.   :  66.9   Min.   :  769  
##  1st Qu.: 2159   1st Qu.:50.00   1st Qu.: 145.6   1st Qu.: 1987  
##  Median : 3352   Median :53.90   Median : 221.1   Median : 3007  
##  Mean   : 6134   Mean   :54.54   Mean   : 431.7   Mean   : 6488  
##  3rd Qu.: 6323   3rd Qu.:59.90   3rd Qu.: 422.6   3rd Qu.: 5909  
##  Max.   :69678   Max.   :72.80   Max.   :4083.9   Max.   :72100  
##      Region         TDelitos    
##  Min.   :1.000   Min.   :15.69  
##  1st Qu.:2.000   1st Qu.:45.97  
##  Median :3.000   Median :56.55  
##  Mean   :2.567   Mean   :55.87  
##  3rd Qu.:3.000   3rd Qu.:64.38  
##  Max.   :4.000   Max.   :93.58
Region = c("Noreste", "Norte-Centro", "Sur", "Oeste")
for (i in 1:4) {
  sma$Region[sma$Region == i] = Region[i]
}
sma$Region = as.factor(sma$Region)
class(sma$Region)
## [1] "factor"
levels(sma$Region)
## [1] "Noreste"      "Norte-Centro" "Oeste"        "Sur"
pairs(sma, panel = function(x, y) {
points(x, y)
lines(lowess(x, y), col = "red")
})

mod1 = lm(TDelitos~.+I((PSenior - mean(PSenior))^2), sma)
summary(mod1)
## 
## Call:
## lm(formula = TDelitos ~ . + I((PSenior - mean(PSenior))^2), data = sma)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.696  -8.072  -0.543   7.280  23.029 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    35.1138252 11.5241607   3.047 0.002808 ** 
## Area                            0.0008503  0.0003748   2.269 0.024973 *  
## PCiudad                         0.0248962  0.0581531   0.428 0.669287    
## PSenior                        -0.8863371  0.5421968  -1.635 0.104566    
## Medicos                         0.0025356  0.0017545   1.445 0.150840    
## CHosp                          -0.0002094  0.0005689  -0.368 0.713454    
## Grad                            0.2407081  0.1510107   1.594 0.113407    
## Laboral                        -0.0075189  0.0207472  -0.362 0.717645    
## Ingresos                        0.0001671  0.0014921   0.112 0.910992    
## RegionNorte-Centro              8.7394604  3.0477233   2.868 0.004839 ** 
## RegionOeste                    20.0983752  4.0391164   4.976 2.05e-06 ***
## RegionSur                      12.4365699  3.1354531   3.966 0.000121 ***
## I((PSenior - mean(PSenior))^2)  0.2176201  0.0677492   3.212 0.001667 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.44 on 128 degrees of freedom
## Multiple R-squared:  0.5213, Adjusted R-squared:  0.4764 
## F-statistic: 11.61 on 12 and 128 DF,  p-value: 1.561e-15
library(car)
vif(mod1)
##                                      GVIF Df GVIF^(1/(2*Df))
## Area                             1.399663  1        1.183073
## PCiudad                          1.348402  1        1.161207
## PSenior                          2.400697  1        1.549418
## Medicos                         35.830812  1        5.985884
## CHosp                           30.498116  1        5.522510
## Grad                             1.829614  1        1.352632
## Laboral                        192.710496  1       13.882021
## Ingresos                       279.523684  1       16.718962
## Region                           4.006517  3        1.260263
## I((PSenior - mean(PSenior))^2)   2.235769  1        1.495249
mod2 = lm(TDelitos~.-PCiudad - CHosp - Ingresos +
            I((PSenior - mean(PSenior))^2), sma)
summary(mod2)
## 
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos + I((PSenior - 
##     mean(PSenior))^2), data = sma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.1439  -7.7037  -0.4264   6.8788  22.7389 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    35.9813961 11.0475301   3.257  0.00143 ** 
## Area                            0.0008276  0.0003616   2.289  0.02369 *  
## PSenior                        -0.9399100  0.5255217  -1.789  0.07600 .  
## Medicos                         0.0024288  0.0012718   1.910  0.05835 .  
## Grad                            0.2419097  0.1492296   1.621  0.10741    
## Laboral                        -0.0071072  0.0065071  -1.092  0.27674    
## RegionNorte-Centro              9.2602567  2.8450429   3.255  0.00144 ** 
## RegionOeste                    21.0202618  3.5457296   5.928 2.56e-08 ***
## RegionSur                      12.9356438  2.8677576   4.511 1.42e-05 ***
## I((PSenior - mean(PSenior))^2)  0.2194098  0.0664821   3.300  0.00124 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.34 on 131 degrees of freedom
## Multiple R-squared:  0.5202, Adjusted R-squared:  0.4873 
## F-statistic: 15.78 on 9 and 131 DF,  p-value: < 2.2e-16
vif(mod2)
##                                     GVIF Df GVIF^(1/(2*Df))
## Area                            1.329868  1        1.153199
## PSenior                         2.303195  1        1.517628
## Medicos                        19.226866  1        4.384845
## Grad                            1.824652  1        1.350797
## Laboral                        19.359170  1        4.399906
## Region                          2.828878  3        1.189239
## I((PSenior - mean(PSenior))^2)  2.198643  1        1.482782
mod3 = lm(TDelitos~.-PCiudad - CHosp - Ingresos - Medicos +
            I((PSenior - mean(PSenior))^2), sma)
summary(mod3)
## 
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos - Medicos + 
##     I((PSenior - mean(PSenior))^2), data = sma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2489  -8.0939  -0.2504   7.4177  22.2070 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.7703039 11.0277601   2.972  0.00352 ** 
## Area                            0.0006872  0.0003575   1.922  0.05676 .  
## PSenior                        -0.8042126  0.5258908  -1.529  0.12860    
## Grad                            0.2697990  0.1499951   1.799  0.07435 .  
## Laboral                         0.0049778  0.0015310   3.251  0.00146 ** 
## RegionNorte-Centro              8.3921019  2.8365097   2.959  0.00366 ** 
## RegionOeste                    21.2729808  3.5786105   5.944 2.33e-08 ***
## RegionSur                      13.1145993  2.8948226   4.530 1.30e-05 ***
## I((PSenior - mean(PSenior))^2)  0.2069837  0.0668231   3.097  0.00238 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.44 on 132 degrees of freedom
## Multiple R-squared:  0.5069, Adjusted R-squared:  0.477 
## F-statistic: 16.96 on 8 and 132 DF,  p-value: < 2.2e-16
vif(mod3)
##                                    GVIF Df GVIF^(1/(2*Df))
## Area                           1.274930  1        1.129128
## PSenior                        2.261088  1        1.503692
## Grad                           1.807178  1        1.344313
## Laboral                        1.050570  1        1.024973
## Region                         2.650750  3        1.176417
## I((PSenior - mean(PSenior))^2) 2.177583  1        1.475664
mod4 = lm(TDelitos~.- CHosp - Ingresos - Medicos +
            I((PSenior - mean(PSenior))^2), sma)
summary(mod4)
## 
## Call:
## lm(formula = TDelitos ~ . - CHosp - Ingresos - Medicos + I((PSenior - 
##     mean(PSenior))^2), data = sma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.4205  -7.9832  -0.6127   7.4084  22.4202 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    30.9224685 11.2723663   2.743  0.00694 ** 
## Area                            0.0007137  0.0003595   1.985  0.04919 *  
## PCiudad                         0.0452509  0.0555421   0.815  0.41671    
## PSenior                        -0.7434898  0.5318106  -1.398  0.16447    
## Grad                            0.2669570  0.1502270   1.777  0.07789 .  
## Laboral                         0.0049301  0.0015340   3.214  0.00165 ** 
## RegionNorte-Centro              7.8472164  2.9178135   2.689  0.00809 ** 
## RegionOeste                    20.5565882  3.6894929   5.572 1.38e-07 ***
## RegionSur                      12.4004142  3.0281745   4.095 7.34e-05 ***
## I((PSenior - mean(PSenior))^2)  0.2075967  0.0669126   3.103  0.00235 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.45 on 131 degrees of freedom
## Multiple R-squared:  0.5094, Adjusted R-squared:  0.4757 
## F-statistic: 15.11 on 9 and 131 DF,  p-value: < 2.2e-16
library(leaps)
?leaps
leaps(x = model.matrix(mod4)[,-1],
      y = sma$TDelitos,
      method = c("Cp"),
      nbest = 3,
      names = colnames(model.matrix(mod4)[,-1]))
## $which
##    Area PCiudad PSenior  Grad Laboral RegionNorte-Centro RegionOeste RegionSur
## 1 FALSE   FALSE   FALSE FALSE   FALSE              FALSE        TRUE     FALSE
## 1 FALSE   FALSE   FALSE  TRUE   FALSE              FALSE       FALSE     FALSE
## 1  TRUE   FALSE   FALSE FALSE   FALSE              FALSE       FALSE     FALSE
## 2 FALSE   FALSE   FALSE FALSE   FALSE              FALSE        TRUE      TRUE
## 2 FALSE   FALSE   FALSE FALSE   FALSE              FALSE        TRUE     FALSE
## 2 FALSE   FALSE   FALSE FALSE    TRUE              FALSE        TRUE     FALSE
## 3 FALSE   FALSE   FALSE FALSE   FALSE               TRUE        TRUE      TRUE
## 3 FALSE   FALSE   FALSE  TRUE   FALSE              FALSE        TRUE      TRUE
## 3 FALSE   FALSE   FALSE FALSE    TRUE              FALSE        TRUE      TRUE
## 4 FALSE   FALSE   FALSE FALSE    TRUE               TRUE        TRUE      TRUE
## 4 FALSE   FALSE   FALSE FALSE   FALSE               TRUE        TRUE      TRUE
## 4 FALSE   FALSE   FALSE  TRUE    TRUE              FALSE        TRUE      TRUE
## 5 FALSE   FALSE   FALSE FALSE    TRUE               TRUE        TRUE      TRUE
## 5 FALSE   FALSE   FALSE  TRUE    TRUE               TRUE        TRUE      TRUE
## 5 FALSE   FALSE   FALSE  TRUE    TRUE              FALSE        TRUE      TRUE
## 6 FALSE   FALSE   FALSE  TRUE    TRUE               TRUE        TRUE      TRUE
## 6 FALSE   FALSE    TRUE FALSE    TRUE               TRUE        TRUE      TRUE
## 6  TRUE   FALSE   FALSE FALSE    TRUE               TRUE        TRUE      TRUE
## 7  TRUE   FALSE   FALSE  TRUE    TRUE               TRUE        TRUE      TRUE
## 7  TRUE   FALSE    TRUE FALSE    TRUE               TRUE        TRUE      TRUE
## 7 FALSE   FALSE    TRUE  TRUE    TRUE               TRUE        TRUE      TRUE
## 8  TRUE   FALSE    TRUE  TRUE    TRUE               TRUE        TRUE      TRUE
## 8  TRUE    TRUE   FALSE  TRUE    TRUE               TRUE        TRUE      TRUE
## 8  TRUE    TRUE    TRUE FALSE    TRUE               TRUE        TRUE      TRUE
## 9  TRUE    TRUE    TRUE  TRUE    TRUE               TRUE        TRUE      TRUE
##   I((PSenior - mean(PSenior))^2)
## 1                          FALSE
## 1                          FALSE
## 1                          FALSE
## 2                          FALSE
## 2                           TRUE
## 2                          FALSE
## 3                          FALSE
## 3                          FALSE
## 3                          FALSE
## 4                          FALSE
## 4                           TRUE
## 4                          FALSE
## 5                           TRUE
## 5                          FALSE
## 5                           TRUE
## 6                           TRUE
## 6                           TRUE
## 6                           TRUE
## 7                           TRUE
## 7                           TRUE
## 7                           TRUE
## 8                           TRUE
## 8                           TRUE
## 8                           TRUE
## 9                           TRUE
## 
## $label
##  [1] "(Intercept)"                    "Area"                          
##  [3] "PCiudad"                        "PSenior"                       
##  [5] "Grad"                           "Laboral"                       
##  [7] "RegionNorte-Centro"             "RegionOeste"                   
##  [9] "RegionSur"                      "I((PSenior - mean(PSenior))^2)"
## 
## $size
##  [1]  2  2  2  3  3  3  4  4  4  5  5  5  6  6  6  7  7  7  8  8  8  9  9  9 10
## 
## $Cp
##  [1] 65.326647 92.283376 98.436590 45.465939 51.793428 59.031659 31.868909
##  [8] 34.457106 35.160139 20.356975 24.277675 25.200920 12.807201 15.741094
## [15] 20.354106  9.837679 11.443900 12.503340  8.996373  9.890909 10.348559
## [22]  8.663759  9.954502 11.157814 10.000000
CP = leaps(x = model.matrix(mod4)[,-1],
      y = sma$TDelitos,
      method = c("Cp"),
      nbest = 3,
      names = colnames(model.matrix(mod4)[,-1]))
CP$which[which.min(CP$Cp),]
##                           Area                        PCiudad 
##                           TRUE                          FALSE 
##                        PSenior                           Grad 
##                           TRUE                           TRUE 
##                        Laboral             RegionNorte-Centro 
##                           TRUE                           TRUE 
##                    RegionOeste                      RegionSur 
##                           TRUE                           TRUE 
## I((PSenior - mean(PSenior))^2) 
##                           TRUE
c(AIC(mod3), AIC(mod4))
## [1] 1072.266 1073.554
c(AIC(mod3, k = log(nrow(sma))), AIC(mod4, k = log(nrow(sma))))
## [1] 1101.754 1105.990
X4 = model.matrix(mod4)
H4 = X4 %*% solve(crossprod(X4)) %*% t(X4)
di4 = residuals(mod4)/(1 - diag(H4))
PRESS4 = sum(di4^2)

PRESS4
## [1] 16995.86
X3 = model.matrix(mod3)
H3 = X3 %*% solve(crossprod(X3)) %*% t(X3)
di3 = residuals(mod3)/(1 - diag(H3))
PRESS3 = sum(di3^2)

PRESS3
## [1] 16821.07
library(MASS)