Carga de librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(tidycensus)
library(knitr)
library(e1071)
library(broom)
library(ggfortify)
options(tigris_use_cache = TRUE)

Clave api

#census_api_key(key = "tu clave api")
## To install your API key for use in future sessions, run this function with `install = TRUE`.

Universo

Universo <- tidycensus::get_acs(
  geography = "county",
  variables = "B19013_001",
  shift_geo = TRUE,
  geometry = TRUE
)
## Getting data from the 2018-2022 5-year ACS
## Warning: The `shift_geo` argument is deprecated and will be removed in a future
## release. We recommend using `tigris::shift_geometry()` instead.
## Using feature geometry obtained from the albersusa package
## Please note: Alaska and Hawaii are being shifted and are not to scale.
## old-style crs object detected; please recreate object with a recent sf::st_crs()
head(Universo)
## Simple feature collection with 6 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1085367 ymin: -1419116 xmax: 1390214 ymax: -1045061
## Projected CRS: +proj=laea +lat_0=45 +lon_0=-100 +x_0=0 +y_0=0 +a=6370997 +b=6370997 +units=m +no_defs
##   GEOID                     NAME   variable estimate  moe
## 1 01001  Autauga County, Alabama B19013_001    68315 4941
## 2 01009   Blount County, Alabama B19013_001    57440 3308
## 3 01017 Chambers County, Alabama B19013_001    48805 5967
## 4 01021  Chilton County, Alabama B19013_001    62471 3466
## 5 01033  Colbert County, Alabama B19013_001    56149 3200
## 6 01045     Dale County, Alabama B19013_001    52813 3967
##                         geometry
## 1 MULTIPOLYGON (((1269841 -13...
## 2 MULTIPOLYGON (((1240383 -11...
## 3 MULTIPOLYGON (((1382944 -12...
## 4 MULTIPOLYGON (((1257515 -12...
## 5 MULTIPOLYGON (((1085910 -10...
## 6 MULTIPOLYGON (((1382203 -13...

Universo

ggplot(data = Universo) + 
  geom_sf(aes(fill = estimate), color = NA) + 
  coord_sf(datum = NA) + 
  theme_minimal() + 
  scale_fill_viridis_c()

Confiabilidad del intervalo de confianza

# Definición de un objeto tibble con la información necesaria
Universe <- tibble(
    county = Universo$NAME,
    median_income = Universo$estimate
  )
# Definición del número de muestras y el tamaño de cada una de ellas
  n_samples <- 100
  sample_size <- 30
  reliability <- 0.95
# Obtención de muestras aleatorias y cálculo de intervalos de confianza
  samples <- replicate(n_samples, {
    sample_data <- sample_n(Universe, sample_size)
    var_sample <- var(sample_data$median_income, na.rm=TRUE)
    ci <- c((sample_size-1)*var_sample/qchisq(p=(1 + reliability)/2,df=sample_size-1),(sample_size-1)*var_sample/qchisq(p=1 - (1 + reliability)/2,df=sample_size-1))
    tibble(
      variance = var_sample,
      lower_ci = ci[1],
      upper_ci = ci[2]
    )
  }, simplify = FALSE) %>%
    bind_rows()
# Determinación de los intervalos que contienen el parámetro poblacional
  true_var <- (length(Universe$median_income)-1)/length(Universe$median_income)*var(x=Universe$median_income, na.rm=TRUE)
  
  samples <- samples %>%
    mutate(contains_true_var = lower_ci <= true_var & upper_ci >= true_var)
# Gráfico de los intervalos de confianza
  ggplot(samples, aes(x = 1:n_samples, y = variance)) +
    geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = contains_true_var), width = 0.2) +
    geom_point(aes(color = contains_true_var)) +
    geom_hline(yintercept = true_var, linetype = "dashed", color = "red") +
    labs(title = "Intervalos de Confianza para la Varianza del Ingreso Mediano por Condado",
         x = "Muestra",
         y = "Varianza del Ingreso Mediano") +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "green")) +
  theme_minimal()

Conclusión

relibability_table <- samples %>% 
  count(contains_true_var)
relibability_table
## # A tibble: 2 × 2
##   contains_true_var     n
##   <lgl>             <int>
## 1 FALSE                20
## 2 TRUE                 80

Se encuentra que para la simulación realizada el 80% de los intervalos contenía al verdadero parámetro poblaciónal \(\mu_x\) \((1-\alpha)\) mientras que en el 20% de los casos no lo contenía \((\alpha)\).

Verdadero parámetro poblacional

Media

\[ \sigma_x^2 = \frac{1}{N} \sum\limits_{i=1}^{N}\left({x}_{i}-\mu_x\right)^2 \]

sigma2 <- (length(Universo$estimate)-1)/length(Universo$estimate)*var(Universo$estimate, na.rm = TRUE)
sigma2
## [1] 281267819