library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(tidycensus)
library(knitr)
library(e1071)
library(broom)
library(ggfortify)
options(tigris_use_cache = TRUE)
#census_api_key(key = "tu clave api")
## To install your API key for use in future sessions, run this function with `install = TRUE`.
Universo <- tidycensus::get_acs(
geography = "county",
variables = "B19013_001",
shift_geo = TRUE,
geometry = TRUE
)
## Getting data from the 2018-2022 5-year ACS
## Warning: The `shift_geo` argument is deprecated and will be removed in a future
## release. We recommend using `tigris::shift_geometry()` instead.
## Using feature geometry obtained from the albersusa package
## Please note: Alaska and Hawaii are being shifted and are not to scale.
## old-style crs object detected; please recreate object with a recent sf::st_crs()
head(Universo)
## Simple feature collection with 6 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1085367 ymin: -1419116 xmax: 1390214 ymax: -1045061
## Projected CRS: +proj=laea +lat_0=45 +lon_0=-100 +x_0=0 +y_0=0 +a=6370997 +b=6370997 +units=m +no_defs
## GEOID NAME variable estimate moe
## 1 01001 Autauga County, Alabama B19013_001 68315 4941
## 2 01009 Blount County, Alabama B19013_001 57440 3308
## 3 01017 Chambers County, Alabama B19013_001 48805 5967
## 4 01021 Chilton County, Alabama B19013_001 62471 3466
## 5 01033 Colbert County, Alabama B19013_001 56149 3200
## 6 01045 Dale County, Alabama B19013_001 52813 3967
## geometry
## 1 MULTIPOLYGON (((1269841 -13...
## 2 MULTIPOLYGON (((1240383 -11...
## 3 MULTIPOLYGON (((1382944 -12...
## 4 MULTIPOLYGON (((1257515 -12...
## 5 MULTIPOLYGON (((1085910 -10...
## 6 MULTIPOLYGON (((1382203 -13...
ggplot(data = Universo) +
geom_sf(aes(fill = estimate), color = NA) +
coord_sf(datum = NA) +
theme_minimal() +
scale_fill_viridis_c()
# Definición de un objeto tibble con la información necesaria
Universe <- tibble(
county = Universo$NAME,
median_income = Universo$estimate
)
# Definición del número de muestras y el tamaño de cada una de ellas
n_samples <- 100
sample_size <- 30
reliability <- 0.95
# Obtención de muestras aleatorias y cálculo de intervalos de confianza
samples <- replicate(n_samples, {
sample_data <- sample_n(Universe, sample_size)
var_sample <- var(sample_data$median_income, na.rm=TRUE)
ci <- c((sample_size-1)*var_sample/qchisq(p=(1 + reliability)/2,df=sample_size-1),(sample_size-1)*var_sample/qchisq(p=1 - (1 + reliability)/2,df=sample_size-1))
tibble(
variance = var_sample,
lower_ci = ci[1],
upper_ci = ci[2]
)
}, simplify = FALSE) %>%
bind_rows()
# Determinación de los intervalos que contienen el parámetro poblacional
true_var <- (length(Universe$median_income)-1)/length(Universe$median_income)*var(x=Universe$median_income, na.rm=TRUE)
samples <- samples %>%
mutate(contains_true_var = lower_ci <= true_var & upper_ci >= true_var)
# Gráfico de los intervalos de confianza
ggplot(samples, aes(x = 1:n_samples, y = variance)) +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci, color = contains_true_var), width = 0.2) +
geom_point(aes(color = contains_true_var)) +
geom_hline(yintercept = true_var, linetype = "dashed", color = "red") +
labs(title = "Intervalos de Confianza para la Varianza del Ingreso Mediano por Condado",
x = "Muestra",
y = "Varianza del Ingreso Mediano") +
scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "green")) +
theme_minimal()
relibability_table <- samples %>%
count(contains_true_var)
relibability_table
## # A tibble: 2 × 2
## contains_true_var n
## <lgl> <int>
## 1 FALSE 20
## 2 TRUE 80
Se encuentra que para la simulación realizada el 80% de los intervalos contenía al verdadero parámetro poblaciónal \(\mu_x\) \((1-\alpha)\) mientras que en el 20% de los casos no lo contenía \((\alpha)\).
\[ \sigma_x^2 = \frac{1}{N} \sum\limits_{i=1}^{N}\left({x}_{i}-\mu_x\right)^2 \]
sigma2 <- (length(Universo$estimate)-1)/length(Universo$estimate)*var(Universo$estimate, na.rm = TRUE)
sigma2
## [1] 281267819