MEA I - Taller 8

sma <- read.table("sma.txt", row.names = 1)

names(sma) <- c("Area", "PobT", "PCiudad", "PSenior", "Medicos", "CHosp",
"Grad", "Laboral", "Ingresos", "Delitos", "Region")

# Variable de respuesta
sma$TDelitos <- sma$Delitos/sma$PobT

sma = sma[,-c(2, 10)]

summary(sma$TDelitos)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.69   45.97   56.55   55.87   64.38   93.58

summary(sma)

##       Area          PCiudad          PSenior          Medicos     
##  Min.   :   47   Min.   :  0.00   Min.   : 3.900   Min.   :  140  
##  1st Qu.: 1384   1st Qu.: 29.70   1st Qu.: 8.300   1st Qu.:  436  
##  Median : 1803   Median : 39.50   Median : 9.700   Median :  760  
##  Mean   : 2580   Mean   : 41.78   Mean   : 9.821   Mean   : 1778  
##  3rd Qu.: 2866   3rd Qu.: 51.10   3rd Qu.:10.700   3rd Qu.: 1874  
##  Max.   :27293   Max.   :100.00   Max.   :21.800   Max.   :25627  
##      CHosp            Grad          Laboral          Ingresos    
##  Min.   :  481   Min.   :30.30   Min.   :  66.9   Min.   :  769  
##  1st Qu.: 2159   1st Qu.:50.00   1st Qu.: 145.6   1st Qu.: 1987  
##  Median : 3352   Median :53.90   Median : 221.1   Median : 3007  
##  Mean   : 6134   Mean   :54.54   Mean   : 431.7   Mean   : 6488  
##  3rd Qu.: 6323   3rd Qu.:59.90   3rd Qu.: 422.6   3rd Qu.: 5909  
##  Max.   :69678   Max.   :72.80   Max.   :4083.9   Max.   :72100  
##      Region         TDelitos    
##  Min.   :1.000   Min.   :15.69  
##  1st Qu.:2.000   1st Qu.:45.97  
##  Median :3.000   Median :56.55  
##  Mean   :2.567   Mean   :55.87  
##  3rd Qu.:3.000   3rd Qu.:64.38  
##  Max.   :4.000   Max.   :93.58

Region = c("Noreste", "Norte-Centro", "Sur", "Oeste")
for (i in 1:4) {
  sma$Region[sma$Region == i] = Region[i]
}

sma$Region = as.factor(sma$Region)
class(sma$Region)

## [1] "factor"

levels(sma$Region)

## [1] "Noreste"      "Norte-Centro" "Oeste"        "Sur"

pairs(sma, panel = function(x, y) {
points(x, y)
lines(lowess(x, y), col = "red")
})

library(PoSI)

## Warning: package 'PoSI' was built under R version 4.1.3

# Este era el modelo sin problemas de multicolinealidad.
mod3 = lm(TDelitos~.-PCiudad - CHosp - Ingresos - Medicos +
            I((PSenior - mean(PSenior))^2), sma)

summary(mod3)

## 
## Call:
## lm(formula = TDelitos ~ . - PCiudad - CHosp - Ingresos - Medicos + 
##     I((PSenior - mean(PSenior))^2), data = sma)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2489  -8.0939  -0.2504   7.4177  22.2070 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    32.7703039 11.0277601   2.972  0.00352 ** 
## Area                            0.0006872  0.0003575   1.922  0.05676 .  
## PSenior                        -0.8042126  0.5258908  -1.529  0.12860    
## Grad                            0.2697990  0.1499951   1.799  0.07435 .  
## Laboral                         0.0049778  0.0015310   3.251  0.00146 ** 
## RegionNorte-Centro              8.3921019  2.8365097   2.959  0.00366 ** 
## RegionOeste                    21.2729808  3.5786105   5.944 2.33e-08 ***
## RegionSur                      13.1145993  2.8948226   4.530 1.30e-05 ***
## I((PSenior - mean(PSenior))^2)  0.2069837  0.0668231   3.097  0.00238 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.44 on 132 degrees of freedom
## Multiple R-squared:  0.5069, Adjusted R-squared:  0.477 
## F-statistic: 16.96 on 8 and 132 DF,  p-value: < 2.2e-16

library(car)

## Warning: package 'car' was built under R version 4.1.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.1.3

vif(mod3)

##                                    GVIF Df GVIF^(1/(2*Df))
## Area                           1.274930  1        1.129128
## PSenior                        2.261088  1        1.503692
## Grad                           1.807178  1        1.344313
## Laboral                        1.050570  1        1.024973
## Region                         2.650750  3        1.176417
## I((PSenior - mean(PSenior))^2) 2.177583  1        1.475664

Xmod3 = model.matrix(mod3)

set.seed(42)  # The story behind random set.seed(42) in Machine Learning.
posi_mod3 = PoSI(Xmod3)

## Removed column(s) 1 due to NAs after standardizing'
## Number of contrasts/adjusted predictors to process: 1024 
## Number of bundles: 8 
##                          Done with bundle 1 / 8    model sz = 1 
##                          Done with bundle 2 / 8    model sz = 2 
##                          Done with bundle 3 / 8    model sz = 3 
##                          Done with bundle 4 / 8    model sz = 4 
##                          Done with bundle 5 / 8    model sz = 5 
##                          Done with bundle 6 / 8    model sz = 6 
##                          Done with bundle 7 / 8    model sz = 7 
##                          Done with bundle 8 / 8    model sz = 8 
## p = 8 , d = 8   processed 1024 tests in 255 models.  Times in seconds:
##    user  system elapsed 
##    0.04    0.01    0.06

summary(posi_mod3)

##     K.PoSI K.Bonferroni K.Scheffe
## 95%  3.184        4.061     3.938
## 99%  3.696        4.422     4.482

# Realziar esto manualmente.
coef(mod3)-summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2]

##                    (Intercept)                           Area 
##                  -2.3420841591                  -0.0004512130 
##                        PSenior                           Grad 
##                  -2.4786487522                  -0.2077852713 
##                        Laboral             RegionNorte-Centro 
##                   0.0001031478                  -0.6393448645 
##                    RegionOeste                      RegionSur 
##                   9.8786851203                   3.8974841797 
## I((PSenior - mean(PSenior))^2) 
##                  -0.0057809061

coef(mod3)+summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2]

##                    (Intercept)                           Area 
##                   67.882691873                    0.001825649 
##                        PSenior                           Grad 
##                    0.870223620                    0.747383317 
##                        Laboral             RegionNorte-Centro 
##                    0.009852412                   17.423548713 
##                    RegionOeste                      RegionSur 
##                   32.667276555                   22.331714340 
## I((PSenior - mean(PSenior))^2) 
##                    0.419748384

IC_95_posimod3 = cbind(
coef(mod3)-summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2],
coef(mod3)+summary(posi_mod3)[1,1]*summary(mod3)$coefficients[,2])
print(IC_95_posimod3)

##                                         [,1]         [,2]
## (Intercept)                    -2.3420841591 67.882691873
## Area                           -0.0004512130  0.001825649
## PSenior                        -2.4786487522  0.870223620
## Grad                           -0.2077852713  0.747383317
## Laboral                         0.0001031478  0.009852412
## RegionNorte-Centro             -0.6393448645 17.423548713
## RegionOeste                     9.8786851203 32.667276555
## RegionSur                       3.8974841797 22.331714340
## I((PSenior - mean(PSenior))^2) -0.0057809061  0.419748384

confint(mod3)

##                                        2.5 %       97.5 %
## (Intercept)                     1.095630e+01 54.584303251
## Area                           -2.004625e-05  0.001394482
## PSenior                        -1.844476e+00  0.236051341
## Grad                           -2.690605e-02  0.566504091
## Laboral                         1.949355e-03  0.008006204
## RegionNorte-Centro              2.781205e+00 14.002998418
## RegionOeste                     1.419414e+01 28.351826052
## RegionSur                       7.388354e+00 18.840844444
## I((PSenior - mean(PSenior))^2)  7.480111e-02  0.339166371

# Source: https://github.com/davidruegamer/coinflibs
# install and load package
library("devtools")

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.1.3

install_github("davidruegamer/coinflibs")

## WARNING: Rtools is required to build R packages, but is not currently installed.
## 
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.

## Skipping install of 'coinflibs' from a github remote, the SHA1 (5a7a3d26) has not changed since last install.
##   Use `force = TRUE` to force installation

library("coinflibs")

library(MASS)
# use the cpus data
data("cpus")
# Fit initial model
cpus$perf <- log10(cpus$perf)
cpus$cach <- as.factor(cpus$cach)
mod <- lm(perf ~ .-name, data = cpus)

# use the stepAIC function to find the best model in a backward 
# stepwise search
cpus.lm <- stepAIC(mod, trace = FALSE, direction = "backward", steps = 3)
# check model selection
cpus.lm$anova$Step

## [1] ""        "- chmin"

# recalculate all visited models in the first step
allvisited <- lapply(attr(mod$terms, "term.labels"), 
                     function(x) update(mod, as.formula(paste0("perf ~ .-", x))))
# combine the visited models and the final model                     
lom1 <- c(allvisited, list(mod))

# perform F-test at level 
alpha = 0.001
# and check for non-significant variables
coefTable <- anova(cpus.lm)
drop <- rownames(coefTable)[alpha < coefTable[-nrow(coefTable),5]]

# drop non-significant variable
cpus.lm2 <- update(cpus.lm, as.formula(paste0(".~.-",drop)))

# create list of models, which are examined during the significance search
lom2 <- list(cpus.lm, cpus.lm2)

# now compute selective inference, which adjust for the AIC-based search as
# well as significance hunting
selinf( # supply all lists of visited models, where the best model in the 
        # first list is interpreted as the final model
       lom2, # list given by signficance hunting
       lom1, # list given by AIC-based search
       response = cpus$perf,
       what = c("Ftest", "aic"), # specify what type of selection was done 
                                 # for each supplied list
       sd = summary(cpus.lm2)$sigma
      )

##       varname      teststat         lower         upper         pval
## 1 (Intercept)  1.288494e+00  1.220220e+00  1.355823e+00 0.000000e+00
## 2        syct -1.957808e-04 -3.031630e-04 -7.895361e-05 1.022812e-03
## 3        mmin  4.485151e-05  3.080390e-05  5.889389e-05 4.514890e-09
## 4        mmax  2.851264e-05  2.235414e-05  3.467114e-05 0.000000e+00
## 5        cach  2.345530e+00            NA            NA 1.023764e-28
## 6       chmax  2.769125e-03  1.036697e-03  4.177813e-03 2.102618e-03
## 7     estperf -1.839245e-03 -2.475817e-03 -1.202317e-03 8.114622e-08

MEA I - Taller 8

Nelson Abad

19/12/2022