library(sae)
library(ggplot2)
library(tidyverse)
library(DT)
library(kableExtra)
library(rgdal)
library(broom)
library(ggrepel)
La data incomedata corresponde a un conjunto de datos asrtificiales generados por Molina y Marhuenda (2015). Son datos sobre ingresos y otras variables relacionadas para ilustrar el uso de modelos a nivel de Ɣrea en el paquete sae, con el objetivo de estimar la pobreza en las provincias espaƱolas. Las variables que se tienen son:
data("incomedata")
data("sizeprov")
data("sizeprovedu")
DT::datatable(
head(incomedata, 10),
fillContainer = TRUE
)
attach(incomedata)
summary(income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1582 7006 10796 12233 15867 74626
ggplot(incomedata, aes(x = income)) + geom_histogram(col = 'black', fill = 'red', alpha = 0.4, bins = 30)
\[ F_{\alpha i} = \dfrac{1}{N_{i}} \displaystyle \sum_{i=1}^{N_{i}} \left( \dfrac{z - E_{ij}}{z} \right)^{\alpha} I\left(E_{ij} < z \right), \quad i = 1, \ldots, m; \alpha = 0,1. \]
donde \(E_{ij}\) son los ingresos y \(z\) el umbral de pobreza.
La lĆnea de pobreza o umbral de pobreza se fija en el 60% de la mediana de la distribución de los ingresos por unidad de consumo adjudicados a las personas. Se clasifica como pobre a todo individuo que tenga unos ingresos por unidad de consumo inferiores al umbral.
z <- 0.6*median(income)
z
## [1] 6477.484
ggplot(incomedata, aes(x = income)) + geom_histogram(col = 'black', fill = 'red', alpha = 0.4, bins = 30) + geom_vline(xintercept = z, col = "blue")
Se define la variable poor como la variable que indicara si el individuo es pobre (estĆ” por debajo del umbral de pobreza) o no. (1: pobre, 0: no lo es)
n <- dim(incomedata)[1]
poor <- numeric(n)
poor[income < z] <- 1
summary(as.factor(poor))
## 0 1
## 13440 3759
\[ \begin{split} \text{NIVEL 1: (Modelo de muestreo) } & Y_{i}|\theta_{i} \overset{ind}{\sim} N(\theta_{i}, D_{i}) \\ & Y_{i} = \theta_{i} + \varepsilon_{i}, \quad \text{con } \varepsilon_{i} \sim N(0, D_{i}) \end{split} \] \[ \begin{split} \text{NIVEL 2: (Modelo de enlace) } & \theta_{i} \overset{ind}{\sim} N(x_{i}^{T}\beta, A) \\ & \theta_{i} = x_{i}^{T}\beta + u_{i}, \quad \text{con } u_{i} \sim N(0,A) \end{split} \]
se combinan los componentes de ambos modelos y con esto se obtiene el modelo mixto lineal
\[ Y_{i}|\theta_{i} = x_{i}^{T}\beta + u_{i} + \varepsilon_{i}, \quad \varepsilon_{i} \sim N(0,D_{i}). \]
A partir de este modelo se construye el estimador EBLUP que corresponde al mejor predictor lineal empĆrico imparcial para \(\theta_{i}\),
\[ \theta_{i}^{\text{ EBLUP}} = (1- \widehat{B}_{i})Y_{i} + \widehat{B}_{i}x_{i}^{T}\tilde{\beta}, \quad \text{para } i = 1, \ldots, m. \]
DT::datatable(
head(sizeprov, 52),
fillContainer = TRUE
)
provdirect <- direct(y = poor, dom = provlab, sweight = weight, domsize = sizeprov[,c(1,3)])
provdirect %>%
kbl(table.attr = 'class="table"style="red"') %>%
kable_paper("hover",
full_width = F)
| Domain | SampSize | Direct | SD | CV | |
|---|---|---|---|---|---|
| 1 | Alava | 96 | 0.2550373 | 0.0484665 | 19.003670 |
| 2 | Albacete | 173 | 0.1405924 | 0.0304219 | 21.638397 |
| 3 | Alicante | 539 | 0.2054832 | 0.0216579 | 10.539977 |
| 4 | Almeria | 198 | 0.2649583 | 0.0408154 | 15.404468 |
| 5 | Avila | 58 | 0.0551220 | 0.0255543 | 46.359465 |
| 6 | Badajoz | 494 | 0.2103349 | 0.0232652 | 11.061047 |
| 7 | Baleares | 634 | 0.0999979 | 0.0153652 | 15.365488 |
| 8 | Barcelona | 1420 | 0.2895514 | 0.0157928 | 5.454215 |
| 9 | Burgos | 168 | 0.1999314 | 0.0437977 | 21.906360 |
| 10 | Caceres | 282 | 0.2675139 | 0.0311860 | 11.657729 |
| 11 | Cadiz | 398 | 0.1470535 | 0.0218145 | 14.834373 |
| 12 | Castellon | 118 | 0.1575128 | 0.0330163 | 20.961042 |
| 51 | Ceuta | 235 | 0.1972480 | 0.0334119 | 16.939048 |
| 13 | CiudadReal | 250 | 0.2035740 | 0.0325444 | 15.986538 |
| 14 | Cordoba | 224 | 0.2961564 | 0.0391757 | 13.228058 |
| 15 | CorunaLa | 495 | 0.2509315 | 0.0246038 | 9.805002 |
| 16 | Cuenca | 92 | 0.2308336 | 0.0542909 | 23.519486 |
| 17 | Gerona | 142 | 0.1777339 | 0.0366708 | 20.632432 |
| 18 | Granada | 208 | 0.3114513 | 0.0400184 | 12.849014 |
| 19 | Guadalajara | 89 | 0.1751581 | 0.0421583 | 24.068705 |
| 20 | Guipuzcoa | 285 | 0.2297464 | 0.0314953 | 13.708718 |
| 21 | Huelva | 122 | 0.1258345 | 0.0320255 | 25.450474 |
| 22 | Huesca | 115 | 0.2410761 | 0.0485635 | 20.144476 |
| 23 | Jaen | 232 | 0.3050704 | 0.0408482 | 13.389765 |
| 24 | Leon | 218 | 0.1851418 | 0.0298886 | 16.143618 |
| 25 | Lerida | 130 | 0.1555959 | 0.0387245 | 24.887854 |
| 27 | Lugo | 173 | 0.3702810 | 0.0565433 | 15.270368 |
| 28 | Madrid | 944 | 0.1784623 | 0.0162075 | 9.081729 |
| 29 | Malaga | 379 | 0.2247678 | 0.0271512 | 12.079680 |
| 52 | Melilla | 180 | 0.1910912 | 0.0344102 | 18.007191 |
| 30 | Murcia | 885 | 0.1754409 | 0.0164503 | 9.376573 |
| 31 | Navarra | 564 | 0.1550957 | 0.0178774 | 11.526700 |
| 32 | Orense | 129 | 0.2180633 | 0.0413454 | 18.960275 |
| 33 | Oviedo | 803 | 0.2575646 | 0.0208584 | 8.098320 |
| 34 | Palencia | 72 | 0.3016607 | 0.0717978 | 23.800849 |
| 35 | PalmasLas | 472 | 0.1626612 | 0.0227480 | 13.984868 |
| 36 | Pontevedra | 448 | 0.1823451 | 0.0239835 | 13.152826 |
| 26 | RiojaLa | 510 | 0.2468096 | 0.0240024 | 9.725052 |
| 37 | Salamanca | 164 | 0.1610451 | 0.0299824 | 18.617410 |
| 39 | Santander | 434 | 0.3206366 | 0.0314131 | 9.797102 |
| 40 | Segovia | 58 | 0.2226200 | 0.0563996 | 25.334492 |
| 41 | Sevilla | 482 | 0.1969960 | 0.0209045 | 10.611619 |
| 42 | Soria | 20 | 0.0254121 | 0.0254065 | 99.978151 |
| 43 | Tarragona | 134 | 0.3011744 | 0.0472588 | 15.691490 |
| 38 | Tenerife | 381 | 0.1806629 | 0.0203815 | 11.281514 |
| 44 | Teruel | 72 | 0.2736424 | 0.0672344 | 24.570172 |
| 45 | Toledo | 275 | 0.1255338 | 0.0213199 | 16.983409 |
| 46 | Valencia | 714 | 0.2115989 | 0.0206702 | 9.768584 |
| 47 | Valladolid | 299 | 0.1828620 | 0.0309429 | 16.921429 |
| 48 | Vizcaya | 524 | 0.2154970 | 0.0221091 | 10.259600 |
| 49 | Zamora | 104 | 0.3002744 | 0.0602530 | 20.065987 |
| 50 | Zaragoza | 564 | 0.0978732 | 0.0154954 | 15.832155 |
D <- length(unique(prov))
D
## [1] 52
table(provlab)
## provlab
## Alava Albacete Alicante Almeria Avila Badajoz
## 96 173 539 198 58 494
## Baleares Barcelona Burgos Caceres Cadiz Castellon
## 634 1420 168 282 398 118
## Ceuta CiudadReal Cordoba CorunaLa Cuenca Gerona
## 235 250 224 495 92 142
## Granada Guadalajara Guipuzcoa Huelva Huesca Jaen
## 208 89 285 122 115 232
## Leon Lerida Lugo Madrid Malaga Melilla
## 218 130 173 944 379 180
## Murcia Navarra Orense Oviedo Palencia PalmasLas
## 885 564 129 803 72 472
## Pontevedra RiojaLa Salamanca Santander Segovia Sevilla
## 448 510 164 434 58 482
## Soria Tarragona Tenerife Teruel Toledo Valencia
## 20 134 381 72 275 714
## Valladolid Vizcaya Zamora Zaragoza
## 299 524 104 564
Se deben calcular las proporciones para cada variable explicativa, para construir la matriz \(X\). Esto se hace a nivel de Ɣrea, utilizando el tamaƱo poblacional de las provincias (\(N_{d}\): tamaƱo poblacional de la provincia \(d\)).
data("incomedata")
data("sizeprov")
data("sizeprovedu")
data("sizeprovnat")
data("sizeprovlab")
data("sizeprovage")
Nd <- sizeprov[order(sizeprov[, 1]), - c(1, 2)]
Ndedu <- as.matrix(sizeprovedu[order(sizeprovedu[, 1]), - c(1, 2)])
DT::datatable(
head(Ndedu, 52),
fillContainer = TRUE
)
Ndnat <- as.matrix(sizeprovnat[order(sizeprovnat[, 1]), - c(1, 2)])
DT::datatable(
head(Ndnat, 52),
fillContainer = TRUE
)
Ndlab <- as.matrix(sizeprovlab[order(sizeprovlab[, 1]), - c(1, 2)])
DT::datatable(
head(Ndlab, 52),
fillContainer = TRUE
)
Ndage <- as.matrix(sizeprovage[order(sizeprovage[, 1]), - c(1, 2)])
DT::datatable(
head(Ndage, 52),
fillContainer = TRUE
)
Pdedu <- Ndedu/Nd
DT::datatable(
head(Pdedu, 52),
fillContainer = TRUE
)
Pdnat <- Ndnat/Nd
DT::datatable(
head(Pdnat, 52),
fillContainer = TRUE
)
Pdlab <- Ndlab/Nd
DT::datatable(
head(Pdlab, 52),
fillContainer = TRUE
)
Pdage <- Ndage/Nd
DT::datatable(
head(Pdage, 52),
fillContainer = TRUE
)
Se contruye la matriz \(X\) con las variables:
y se le agrega una columna de unos. AdemÔs, como son solamente variables categóricas se quita al menos una categoria para que no exista problema de colinealidad.
X <- cbind(const = rep(1, D), nat1 = Pdnat[,1], Pdage[, 2:5], Pdedu[, c(1,3)], Pdlab[, c(1,3)])
DT::datatable(
head(X, 52),
fillContainer = TRUE
)
eblup <- eblupFH(provdirect$Direct ~ X-1, vardir = provdirect$SD^2)
eblup
## $eblup
## [,1]
## 1 0.24844006
## 2 0.15466819
## 3 0.19978331
## 4 0.24160435
## 5 0.06821718
## 6 0.20909559
## 7 0.10473882
## 8 0.28374921
## 9 0.20410140
## 10 0.24973973
## 11 0.15415430
## 12 0.16297410
## 13 0.20395114
## 14 0.20155398
## 15 0.26357242
## 16 0.24761853
## 17 0.22973061
## 18 0.17556795
## 19 0.28049070
## 20 0.17633353
## 21 0.22839781
## 22 0.14901495
## 23 0.23855201
## 24 0.26939204
## 25 0.19231723
## 26 0.16598618
## 27 0.31582785
## 28 0.18047605
## 29 0.22404947
## 30 0.19036842
## 31 0.17597701
## 32 0.15765597
## 33 0.21026400
## 34 0.25416542
## 35 0.26309319
## 36 0.16065812
## 37 0.18868833
## 38 0.24248959
## 39 0.16536763
## 40 0.30294812
## 41 0.18833801
## 42 0.19713772
## 43 0.05147238
## 44 0.25983011
## 45 0.18400601
## 46 0.27117815
## 47 0.13289931
## 48 0.20941381
## 49 0.19705037
## 50 0.21504994
## 51 0.27462565
## 52 0.10324752
##
## $fit
## $fit$method
## [1] "REML"
##
## $fit$convergence
## [1] TRUE
##
## $fit$iterations
## [1] 3
##
## $fit$estcoef
## beta std.error tvalue pvalue
## Xconst 6398.4080074 3625.2095193 1.7649761 0.07756779
## Xnat1 0.2788105 0.2937627 0.9491014 0.34256904
## Xage2 -6397.2927579 3625.0551569 -1.7647436 0.07760687
## Xage3 -6398.5128689 3625.1998728 -1.7650097 0.07756214
## Xage4 -6397.4147932 3625.2829961 -1.7646663 0.07761986
## Xage5 -6398.7935262 3625.2639228 -1.7650559 0.07755437
## Xeduc0 -3057.8018284 3949.2372267 -0.7742766 0.43876726
## Xeduc2 -0.3166085 0.3851901 -0.8219538 0.41110320
## Xlabor0 -3341.0417925 3972.4102670 -0.8410616 0.40031342
## Xlabor2 -0.6771505 0.9106376 -0.7436005 0.45711822
##
## $fit$refvar
## [1] 0.003706203
##
## $fit$goodness
## loglike AIC BIC KIC
## 69.76170 -117.52340 -96.05972 -106.52340
Calculando el peso que se le estĆ” dando al estimador directo, sabemos que el estimador \(\theta_{i}^{\text{ EBLUP}}\) estĆ” dado por
\[ \theta_{i}^{\text{ EBLUP}} = (1- \widehat{B}_{i})Y_{i} + \widehat{B}_{i}x_{i}^{T}\tilde{\beta}, \quad \text{para } i = 1, \ldots, m. \]
donde
\[ \widehat{B}_{i} = \dfrac{\widehat{D}_{i}}{\widehat{D}_{i} + \widehat{A}} \]
por lo que tenemos que
\[ \begin{split} 1- \widehat{B}_{i} & = 1 - \dfrac{\widehat{D}_{i}}{\widehat{D}_{i} + \widehat{A}} \\ & = \dfrac{\widehat{D}_{i} + \widehat{A} - \widehat{D}_{i}}{\widehat{D}_{i} + \widehat{A}} \\ & = \dfrac{\widehat{A}}{\widehat{D}_{i} + \widehat{A}} \end{split} \]
gamma <- eblup$fit$refvar/(eblup$fit$refvar + provdirect$SD^{2})
gamma
## [1] 0.6120695 0.8001824 0.8876565 0.6898982 0.8501980 0.8725662 0.9401139
## [8] 0.9369475 0.6589461 0.7921315 0.8862118 0.7727243 0.7685136 0.7777410
## [15] 0.7071636 0.8595984 0.5570136 0.7337631 0.6982721 0.6758798 0.7888633
## [22] 0.7832487 0.6111190 0.6895544 0.8057785 0.7119388 0.5368706 0.9338147
## [29] 0.8340931 0.7578741 0.9319522 0.9206116 0.6843510 0.8949423 0.4182540
## [36] 0.8774835 0.8656495 0.8654668 0.8047954 0.7897325 0.5381351 0.8945267
## [43] 0.8516690 0.6239827 0.8992127 0.4505108 0.8907554 0.8966344 0.7946977
## [50] 0.8834775 0.5051642 0.9391562
summary(gamma)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4183 0.6898 0.7934 0.7722 0.8842 0.9401
proveblupFH <- eblup$eblup
dresult <- data.frame(provdirect$Domain, provdirect$Direct, provdirect$SD^2, proveblupFH, eblup$fit$refvar)
colnames(dresult) <- c("Domain", "Direct", "D_i", "eblup", "A")
DT::datatable(
head(dresult, 52),
fillContainer = TRUE
)
shapefile_provincias <- readOGR("/Users/fabi/Desktop/CONSULTORĆA II/Avance/Provincias_ETRS89_30N", "Provincias_ETRS89_30N")
## Warning in OGRSpatialRef(dsn, layer, morphFromESRI = morphFromESRI, dumpSRS =
## dumpSRS, : Discarded datum European_Terrestrial_Reference_System_1989 in Proj4
## definition: +proj=utm +zone=30 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m
## +no_defs
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/fabi/Desktop/CONSULTORIĢA II/Avance/Provincias_ETRS89_30N", layer: "Provincias_ETRS89_30N"
## with 52 features
## It has 5 fields
data_provincias <- tidy(shapefile_provincias)
## Regions defined for each Polygons
DT::datatable(
head(data_provincias, 52),
fillContainer = TRUE
)
nombres_provincias <- data.frame(shapefile_provincias$Texto)
DT::datatable(
head(nombres_provincias, 52),
fillContainer = TRUE
)
nombres_provincias$id <- as.character(seq(0, nrow(nombres_provincias)-1))
DT::datatable(
head(nombres_provincias, 52),
fillContainer = TRUE
)
pobreza <- cbind(as.character(seq(0, nrow(nombres_provincias)-1)), shapefile_provincias$Texto, proveblupFH)
colnames(pobreza) <- c("id", "Provincias", "pobreza")
pobreza <- as.data.frame(pobreza)
pobreza$pobreza <- as.numeric(pobreza$pobreza)
pobreza$id <- as.character(pobreza$id)
pobreza_grafico <- data_provincias %>%
left_join(pobreza, by= "id")
pobreza_grafico %>%
ggplot(aes(x=long, y= lat, group = group)) +
geom_polygon(aes(fill= pobreza), color= "white", size = 0.6) +
labs( title = "Incidencia de pobreza",
subtitle = "Provincias de EspaƱa",
caption = "",
fill = "") +
theme_minimal() +
theme(
axis.line = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
plot.background = element_rect(fill = "snow", color = NA),
panel.background = element_rect(fill= "snow", color = NA),
plot.title = element_text(size = 16, hjust = 0),
plot.subtitle = element_text(size = 12, hjust = 0),
plot.caption = element_text(size = 8, hjust = 1),
legend.title = element_text(color = "grey40", size = 8),
legend.text = element_text(color = "grey40", size = 7, hjust = 0),
legend.position = c(0.93, 0.3),
plot.margin = unit(c(0.5,2,0.5,1), "cm")) +
scale_fill_gradient(low = "yellow", high = "red")