sma <- read.table("sma.txt", row.names = 1)
names(sma) <- c("Area", "PobT", "PCiudad", "PSenior", "Medicos", "CHosp",
"Grad", "Laboral", "Ingresos", "Delitos", "Region")
# Variable de respuesta
sma$TDelitos <- sma$Delitos/sma$PobT
sma = sma[,-c(2, 10)]
summary(sma$TDelitos)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.69 45.97 56.55 55.87 64.38 93.58
summary(sma)
## Area PCiudad PSenior Medicos
## Min. : 47 Min. : 0.00 Min. : 3.900 Min. : 140
## 1st Qu.: 1384 1st Qu.: 29.70 1st Qu.: 8.300 1st Qu.: 436
## Median : 1803 Median : 39.50 Median : 9.700 Median : 760
## Mean : 2580 Mean : 41.78 Mean : 9.821 Mean : 1778
## 3rd Qu.: 2866 3rd Qu.: 51.10 3rd Qu.:10.700 3rd Qu.: 1874
## Max. :27293 Max. :100.00 Max. :21.800 Max. :25627
## CHosp Grad Laboral Ingresos
## Min. : 481 Min. :30.30 Min. : 66.9 Min. : 769
## 1st Qu.: 2159 1st Qu.:50.00 1st Qu.: 145.6 1st Qu.: 1987
## Median : 3352 Median :53.90 Median : 221.1 Median : 3007
## Mean : 6134 Mean :54.54 Mean : 431.7 Mean : 6488
## 3rd Qu.: 6323 3rd Qu.:59.90 3rd Qu.: 422.6 3rd Qu.: 5909
## Max. :69678 Max. :72.80 Max. :4083.9 Max. :72100
## Region TDelitos
## Min. :1.000 Min. :15.69
## 1st Qu.:2.000 1st Qu.:45.97
## Median :3.000 Median :56.55
## Mean :2.567 Mean :55.87
## 3rd Qu.:3.000 3rd Qu.:64.38
## Max. :4.000 Max. :93.58
Region = c("Noreste", "Norte-Centro", "Sur", "Oeste")
for (i in 1:4) {
sma$Region[sma$Region == i] = Region[i]
}
sma$Region = as.factor(sma$Region)
class(sma$Region)
## [1] "factor"
levels(sma$Region)
## [1] "Noreste" "Norte-Centro" "Oeste" "Sur"
pairs(sma, panel = function(x, y) {
points(x, y)
lines(lowess(x, y), col = "red")
})

library(PoSI)
## Warning: package 'PoSI' was built under R version 4.1.3
# Este era el modelo sin problemas de multicolinealidad.
mod3 = lm(TDelitos~.-PCiudad - CHosp - Ingresos - Medicos +
I((PSenior - mean(PSenior))^2), sma)
summary(mod3)
##
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos - Medicos +
## I((PSenior - mean(PSenior))^2), data = sma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2489 -8.0939 -0.2504 7.4177 22.2070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.7703039 11.0277601 2.972 0.00352 **
## Area 0.0006872 0.0003575 1.922 0.05676 .
## PSenior -0.8042126 0.5258908 -1.529 0.12860
## Grad 0.2697990 0.1499951 1.799 0.07435 .
## Laboral 0.0049778 0.0015310 3.251 0.00146 **
## RegionNorte-Centro 8.3921019 2.8365097 2.959 0.00366 **
## RegionOeste 21.2729808 3.5786105 5.944 2.33e-08 ***
## RegionSur 13.1145993 2.8948226 4.530 1.30e-05 ***
## I((PSenior - mean(PSenior))^2) 0.2069837 0.0668231 3.097 0.00238 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.44 on 132 degrees of freedom
## Multiple R-squared: 0.5069, Adjusted R-squared: 0.477
## F-statistic: 16.96 on 8 and 132 DF, p-value: < 2.2e-16
library(car)
## Warning: package 'car' was built under R version 4.1.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.3
vif(mod3)
## GVIF Df GVIF^(1/(2*Df))
## Area 1.274930 1 1.129128
## PSenior 2.261088 1 1.503692
## Grad 1.807178 1 1.344313
## Laboral 1.050570 1 1.024973
## Region 2.650750 3 1.176417
## I((PSenior - mean(PSenior))^2) 2.177583 1 1.475664
Xmod3 = model.matrix(mod3)
set.seed(42) # The story behind random set.seed(42) in Machine Learning.
posi_mod3 = PoSI(Xmod3)
## Removed column(s) 1 due to NAs after standardizing'
## Number of contrasts/adjusted predictors to process: 1024
## Number of bundles: 8
## Done with bundle 1 / 8 model sz = 1
## Done with bundle 2 / 8 model sz = 2
## Done with bundle 3 / 8 model sz = 3
## Done with bundle 4 / 8 model sz = 4
## Done with bundle 5 / 8 model sz = 5
## Done with bundle 6 / 8 model sz = 6
## Done with bundle 7 / 8 model sz = 7
## Done with bundle 8 / 8 model sz = 8
## p = 8 , d = 8 processed 1024 tests in 255 models. Times in seconds:
## user system elapsed
## 0.04 0.01 0.06
summary(posi_mod3)
## K.PoSI K.Bonferroni K.Scheffe
## 95% 3.184 4.061 3.938
## 99% 3.696 4.422 4.482
# Realziar esto manualmente.
coef(mod3)-summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2]
## (Intercept) Area
## -2.3420841591 -0.0004512130
## PSenior Grad
## -2.4786487522 -0.2077852713
## Laboral RegionNorte-Centro
## 0.0001031478 -0.6393448645
## RegionOeste RegionSur
## 9.8786851203 3.8974841797
## I((PSenior - mean(PSenior))^2)
## -0.0057809061
coef(mod3)+summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2]
## (Intercept) Area
## 67.882691873 0.001825649
## PSenior Grad
## 0.870223620 0.747383317
## Laboral RegionNorte-Centro
## 0.009852412 17.423548713
## RegionOeste RegionSur
## 32.667276555 22.331714340
## I((PSenior - mean(PSenior))^2)
## 0.419748384
IC_95_posimod3 = cbind(
coef(mod3)-summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2],
coef(mod3)+summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2])
print(IC_95_posimod3)
## [,1] [,2]
## (Intercept) -2.3420841591 67.882691873
## Area -0.0004512130 0.001825649
## PSenior -2.4786487522 0.870223620
## Grad -0.2077852713 0.747383317
## Laboral 0.0001031478 0.009852412
## RegionNorte-Centro -0.6393448645 17.423548713
## RegionOeste 9.8786851203 32.667276555
## RegionSur 3.8974841797 22.331714340
## I((PSenior - mean(PSenior))^2) -0.0057809061 0.419748384
confint(mod3)
## 2.5 % 97.5 %
## (Intercept) 1.095630e+01 54.584303251
## Area -2.004625e-05 0.001394482
## PSenior -1.844476e+00 0.236051341
## Grad -2.690605e-02 0.566504091
## Laboral 1.949355e-03 0.008006204
## RegionNorte-Centro 2.781205e+00 14.002998418
## RegionOeste 1.419414e+01 28.351826052
## RegionSur 7.388354e+00 18.840844444
## I((PSenior - mean(PSenior))^2) 7.480111e-02 0.339166371
# Source: https://github.com/davidruegamer/coinflibs
# install and load package
library("devtools")
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.1.3
install_github("davidruegamer/coinflibs")
## WARNING: Rtools is required to build R packages, but is not currently installed.
##
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## Skipping install of 'coinflibs' from a github remote, the SHA1 (5a7a3d26) has not changed since last install.
## Use `force = TRUE` to force installation
library("coinflibs")
library(MASS)
# use the cpus data
data("cpus")
# Fit initial model
cpus$perf <- log10(cpus$perf)
cpus$cach <- as.factor(cpus$cach)
mod <- lm(perf ~ .-name, data = cpus)
# use the stepAIC function to find the best model in a backward
# stepwise search
cpus.lm <- stepAIC(mod, trace = FALSE, direction = "backward", steps = 3)
# check model selection
cpus.lm$anova$Step
## [1] "" "- chmin"
# recalculate all visited models in the first step
allvisited <- lapply(attr(mod$terms, "term.labels"),
function(x) update(mod, as.formula(paste0("perf ~ .-", x))))
# combine the visited models and the final model
lom1 <- c(allvisited, list(mod))
# perform F-test at level
alpha = 0.001
# and check for non-significant variables
coefTable <- anova(cpus.lm)
drop <- rownames(coefTable)[alpha < coefTable[-nrow(coefTable),5]]
# drop non-significant variable
cpus.lm2 <- update(cpus.lm, as.formula(paste0(".~.-",drop)))
# create list of models, which are examined during the significance search
lom2 <- list(cpus.lm, cpus.lm2)
# now compute selective inference, which adjust for the AIC-based search as
# well as significance hunting
selinf( # supply all lists of visited models, where the best model in the
# first list is interpreted as the final model
lom2, # list given by signficance hunting
lom1, # list given by AIC-based search
response = cpus$perf,
what = c("Ftest", "aic"), # specify what type of selection was done
# for each supplied list
sd = summary(cpus.lm2)$sigma
)
## varname teststat lower upper pval
## 1 (Intercept) 1.288494e+00 1.220220e+00 1.355823e+00 0.000000e+00
## 2 syct -1.957808e-04 -3.031630e-04 -7.895361e-05 1.022812e-03
## 3 mmin 4.485151e-05 3.080390e-05 5.889389e-05 4.514890e-09
## 4 mmax 2.851264e-05 2.235414e-05 3.467114e-05 0.000000e+00
## 5 cach 2.345530e+00 NA NA 1.023764e-28
## 6 chmax 2.769125e-03 1.036697e-03 4.177813e-03 2.102618e-03
## 7 estperf -1.839245e-03 -2.475817e-03 -1.202317e-03 8.114622e-08