sma <- read.table("sma.txt", row.names = 1)
names(sma) <- c("Area", "PobT", "PCiudad", "PSenior", "Medicos", "CHosp",
"Grad", "Laboral", "Ingresos", "Delitos", "Region")
# Variable de respuesta
sma$TDelitos <- sma$Delitos/sma$PobT
sma = sma[,-c(2, 10)]
summary(sma$TDelitos)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.69 45.97 56.55 55.87 64.38 93.58
summary(sma)
## Area PCiudad PSenior Medicos
## Min. : 47 Min. : 0.00 Min. : 3.900 Min. : 140
## 1st Qu.: 1384 1st Qu.: 29.70 1st Qu.: 8.300 1st Qu.: 436
## Median : 1803 Median : 39.50 Median : 9.700 Median : 760
## Mean : 2580 Mean : 41.78 Mean : 9.821 Mean : 1778
## 3rd Qu.: 2866 3rd Qu.: 51.10 3rd Qu.:10.700 3rd Qu.: 1874
## Max. :27293 Max. :100.00 Max. :21.800 Max. :25627
## CHosp Grad Laboral Ingresos
## Min. : 481 Min. :30.30 Min. : 66.9 Min. : 769
## 1st Qu.: 2159 1st Qu.:50.00 1st Qu.: 145.6 1st Qu.: 1987
## Median : 3352 Median :53.90 Median : 221.1 Median : 3007
## Mean : 6134 Mean :54.54 Mean : 431.7 Mean : 6488
## 3rd Qu.: 6323 3rd Qu.:59.90 3rd Qu.: 422.6 3rd Qu.: 5909
## Max. :69678 Max. :72.80 Max. :4083.9 Max. :72100
## Region TDelitos
## Min. :1.000 Min. :15.69
## 1st Qu.:2.000 1st Qu.:45.97
## Median :3.000 Median :56.55
## Mean :2.567 Mean :55.87
## 3rd Qu.:3.000 3rd Qu.:64.38
## Max. :4.000 Max. :93.58
Region = c("Noreste", "Norte-Centro", "Sur", "Oeste")
for (i in 1:4) {
sma$Region[sma$Region == i] = Region[i]
}
sma$Region = as.factor(sma$Region)
class(sma$Region)
## [1] "factor"
levels(sma$Region)
## [1] "Noreste" "Norte-Centro" "Oeste" "Sur"
pairs(sma, panel = function(x, y) {
points(x, y)
lines(lowess(x, y), col = "red")
})

mod1 = lm(TDelitos~.+I((PSenior - mean(PSenior))^2), sma)
summary(mod1)
##
## Call:
## lm(formula = TDelitos ~ . + I((PSenior - mean(PSenior))^2), data = sma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.696 -8.072 -0.543 7.280 23.029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.1138252 11.5241607 3.047 0.002808 **
## Area 0.0008503 0.0003748 2.269 0.024973 *
## PCiudad 0.0248962 0.0581531 0.428 0.669287
## PSenior -0.8863371 0.5421968 -1.635 0.104566
## Medicos 0.0025356 0.0017545 1.445 0.150840
## CHosp -0.0002094 0.0005689 -0.368 0.713454
## Grad 0.2407081 0.1510107 1.594 0.113407
## Laboral -0.0075189 0.0207472 -0.362 0.717645
## Ingresos 0.0001671 0.0014921 0.112 0.910992
## RegionNorte-Centro 8.7394604 3.0477233 2.868 0.004839 **
## RegionOeste 20.0983752 4.0391164 4.976 2.05e-06 ***
## RegionSur 12.4365699 3.1354531 3.966 0.000121 ***
## I((PSenior - mean(PSenior))^2) 0.2176201 0.0677492 3.212 0.001667 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.44 on 128 degrees of freedom
## Multiple R-squared: 0.5213, Adjusted R-squared: 0.4764
## F-statistic: 11.61 on 12 and 128 DF, p-value: 1.561e-15
library(car)
vif(mod1)
## GVIF Df GVIF^(1/(2*Df))
## Area 1.399663 1 1.183073
## PCiudad 1.348402 1 1.161207
## PSenior 2.400697 1 1.549418
## Medicos 35.830812 1 5.985884
## CHosp 30.498116 1 5.522510
## Grad 1.829614 1 1.352632
## Laboral 192.710496 1 13.882021
## Ingresos 279.523684 1 16.718962
## Region 4.006517 3 1.260263
## I((PSenior - mean(PSenior))^2) 2.235769 1 1.495249
mod2 = lm(TDelitos~.-PCiudad - CHosp - Ingresos +
I((PSenior - mean(PSenior))^2), sma)
summary(mod2)
##
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos + I((PSenior -
## mean(PSenior))^2), data = sma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.1439 -7.7037 -0.4264 6.8788 22.7389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.9813961 11.0475301 3.257 0.00143 **
## Area 0.0008276 0.0003616 2.289 0.02369 *
## PSenior -0.9399100 0.5255217 -1.789 0.07600 .
## Medicos 0.0024288 0.0012718 1.910 0.05835 .
## Grad 0.2419097 0.1492296 1.621 0.10741
## Laboral -0.0071072 0.0065071 -1.092 0.27674
## RegionNorte-Centro 9.2602567 2.8450429 3.255 0.00144 **
## RegionOeste 21.0202618 3.5457296 5.928 2.56e-08 ***
## RegionSur 12.9356438 2.8677576 4.511 1.42e-05 ***
## I((PSenior - mean(PSenior))^2) 0.2194098 0.0664821 3.300 0.00124 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.34 on 131 degrees of freedom
## Multiple R-squared: 0.5202, Adjusted R-squared: 0.4873
## F-statistic: 15.78 on 9 and 131 DF, p-value: < 2.2e-16
vif(mod2)
## GVIF Df GVIF^(1/(2*Df))
## Area 1.329868 1 1.153199
## PSenior 2.303195 1 1.517628
## Medicos 19.226866 1 4.384845
## Grad 1.824652 1 1.350797
## Laboral 19.359170 1 4.399906
## Region 2.828878 3 1.189239
## I((PSenior - mean(PSenior))^2) 2.198643 1 1.482782
mod3 = lm(TDelitos~.-PCiudad - CHosp - Ingresos - Medicos +
I((PSenior - mean(PSenior))^2), sma)
summary(mod3)
##
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos - Medicos +
## I((PSenior - mean(PSenior))^2), data = sma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2489 -8.0939 -0.2504 7.4177 22.2070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.7703039 11.0277601 2.972 0.00352 **
## Area 0.0006872 0.0003575 1.922 0.05676 .
## PSenior -0.8042126 0.5258908 -1.529 0.12860
## Grad 0.2697990 0.1499951 1.799 0.07435 .
## Laboral 0.0049778 0.0015310 3.251 0.00146 **
## RegionNorte-Centro 8.3921019 2.8365097 2.959 0.00366 **
## RegionOeste 21.2729808 3.5786105 5.944 2.33e-08 ***
## RegionSur 13.1145993 2.8948226 4.530 1.30e-05 ***
## I((PSenior - mean(PSenior))^2) 0.2069837 0.0668231 3.097 0.00238 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.44 on 132 degrees of freedom
## Multiple R-squared: 0.5069, Adjusted R-squared: 0.477
## F-statistic: 16.96 on 8 and 132 DF, p-value: < 2.2e-16
vif(mod3)
## GVIF Df GVIF^(1/(2*Df))
## Area 1.274930 1 1.129128
## PSenior 2.261088 1 1.503692
## Grad 1.807178 1 1.344313
## Laboral 1.050570 1 1.024973
## Region 2.650750 3 1.176417
## I((PSenior - mean(PSenior))^2) 2.177583 1 1.475664
mod4 = lm(TDelitos~.- CHosp - Ingresos - Medicos +
I((PSenior - mean(PSenior))^2), sma)
summary(mod4)
##
## Call:
## lm(formula = TDelitos ~ . - CHosp - Ingresos - Medicos + I((PSenior -
## mean(PSenior))^2), data = sma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.4205 -7.9832 -0.6127 7.4084 22.4202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.9224685 11.2723663 2.743 0.00694 **
## Area 0.0007137 0.0003595 1.985 0.04919 *
## PCiudad 0.0452509 0.0555421 0.815 0.41671
## PSenior -0.7434898 0.5318106 -1.398 0.16447
## Grad 0.2669570 0.1502270 1.777 0.07789 .
## Laboral 0.0049301 0.0015340 3.214 0.00165 **
## RegionNorte-Centro 7.8472164 2.9178135 2.689 0.00809 **
## RegionOeste 20.5565882 3.6894929 5.572 1.38e-07 ***
## RegionSur 12.4004142 3.0281745 4.095 7.34e-05 ***
## I((PSenior - mean(PSenior))^2) 0.2075967 0.0669126 3.103 0.00235 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.45 on 131 degrees of freedom
## Multiple R-squared: 0.5094, Adjusted R-squared: 0.4757
## F-statistic: 15.11 on 9 and 131 DF, p-value: < 2.2e-16
library(leaps)
?leaps
leaps(x = model.matrix(mod4)[,-1],
y = sma$TDelitos,
method = c("Cp"),
nbest = 3,
names = colnames(model.matrix(mod4)[,-1]))
## $which
## Area PCiudad PSenior Grad Laboral RegionNorte-Centro RegionOeste RegionSur
## 1 FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 1 FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 2 FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## 3 FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## 3 FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## 4 FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## 4 FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## 4 FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE
## 5 FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## 5 FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## 5 FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE
## 6 FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## 6 FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
## 6 TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## 7 TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## 7 TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
## 7 FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## 8 TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## 8 TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## 8 TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## 9 TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## I((PSenior - mean(PSenior))^2)
## 1 FALSE
## 1 FALSE
## 1 FALSE
## 2 FALSE
## 2 TRUE
## 2 FALSE
## 3 FALSE
## 3 FALSE
## 3 FALSE
## 4 FALSE
## 4 TRUE
## 4 FALSE
## 5 TRUE
## 5 FALSE
## 5 TRUE
## 6 TRUE
## 6 TRUE
## 6 TRUE
## 7 TRUE
## 7 TRUE
## 7 TRUE
## 8 TRUE
## 8 TRUE
## 8 TRUE
## 9 TRUE
##
## $label
## [1] "(Intercept)" "Area"
## [3] "PCiudad" "PSenior"
## [5] "Grad" "Laboral"
## [7] "RegionNorte-Centro" "RegionOeste"
## [9] "RegionSur" "I((PSenior - mean(PSenior))^2)"
##
## $size
## [1] 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8 9 9 9 10
##
## $Cp
## [1] 65.326647 92.283376 98.436590 45.465939 51.793428 59.031659 31.868909
## [8] 34.457106 35.160139 20.356975 24.277675 25.200920 12.807201 15.741094
## [15] 20.354106 9.837679 11.443900 12.503340 8.996373 9.890909 10.348559
## [22] 8.663759 9.954502 11.157814 10.000000
CP = leaps(x = model.matrix(mod4)[,-1],
y = sma$TDelitos,
method = c("Cp"),
nbest = 3,
names = colnames(model.matrix(mod4)[,-1]))
CP$which[which.min(CP$Cp),]
## Area PCiudad
## TRUE FALSE
## PSenior Grad
## TRUE TRUE
## Laboral RegionNorte-Centro
## TRUE TRUE
## RegionOeste RegionSur
## TRUE TRUE
## I((PSenior - mean(PSenior))^2)
## TRUE
c(AIC(mod3), AIC(mod4))
## [1] 1072.266 1073.554
c(AIC(mod3, k = log(nrow(sma))), AIC(mod4, k = log(nrow(sma))))
## [1] 1101.754 1105.990
X4 = model.matrix(mod4)
H4 = X4 %*% solve(crossprod(X4)) %*% t(X4)
di4 = residuals(mod4)/(1 - diag(H4))
PRESS4 = sum(di4^2)
PRESS4
## [1] 16995.86
X3 = model.matrix(mod3)
H3 = X3 %*% solve(crossprod(X3)) %*% t(X3)
di3 = residuals(mod3)/(1 - diag(H3))
PRESS3 = sum(di3^2)
PRESS3
## [1] 16821.07
library(MASS)