library(tidyverse)
library(infer)
creditos <- read_rds("creditos_colombia_genero.Rds")
creditos %>% head()
prop_poblacional <- prop.table(table(creditos$genero))["M"]
prop_poblacional
## M
## 0.381877
# Muestra de tamaƱo 20
set.seed(2022)
muestra_n20 <-
creditos %>%
rep_sample_n(size = 20, reps = 100) %>%
group_by(replicate) %>%
summarise(total_mujeres = sum(genero == "M")) %>%
mutate(prop_mujeres = total_mujeres / 20,
N = 20)
# Muestra de tamaƱo 50
set.seed(2022)
muestra_n50 <-
creditos %>%
rep_sample_n(size = 50, reps = 100) %>%
group_by(replicate) %>%
summarise(total_mujeres = sum(genero == "M")) %>%
mutate(prop_mujeres = total_mujeres / 50,
N = 50)
# Muestra de tamaƱo 100
set.seed(2022)
muestra_n100 <-
creditos %>%
rep_sample_n(size = 100, reps = 100) %>%
group_by(replicate) %>%
summarise(total_mujeres = sum(genero == "M")) %>%
mutate(prop_mujeres = total_mujeres / 100,
N = 100)
# Muestra de tamaƱo 1000
set.seed(2022)
muestra_n1000 <-
creditos %>%
rep_sample_n(size = 1000, reps = 100) %>%
group_by(replicate) %>%
summarise(total_mujeres = sum(genero == "M")) %>%
mutate(prop_mujeres = total_mujeres / 1000,
N = 1000)
bind_rows(muestra_n20, muestra_n50, muestra_n100, muestra_n1000) %>%
select(prop_mujeres, N) %>%
ggplot(aes(x = prop_mujeres)) +
facet_wrap(~N, ncol = 4) +
geom_histogram(color = "white") +
geom_vline(xintercept = prop_poblacional, color = "red")
desv_n20 <- sd(muestra_n20$prop_mujeres)
desv_n50 <- sd(muestra_n50$prop_mujeres)
desv_n100 <- sd(muestra_n100$prop_mujeres)
desv_n1000 <- sd(muestra_n1000$prop_mujeres)
data.frame(
Muestra = c(20, 50, 100, 1000),
DE = c(desv_n20, desv_n50, desv_n100, desv_n1000)
)
set.seed(2022)
muestra_creditos <- creditos %>%
rep_sample_n(size = 50, reps = 1)
set.seed(2022)
remuestreo_creditos <- muestra_creditos %>%
rep_sample_n(size = 50, reps = 1, replace = TRUE)
prop.table(table(muestra_creditos$genero))
##
## H M
## 0.68 0.32
prop.table(table(remuestreo_creditos$genero))
##
## H M
## 0.62 0.38
set.seed(2022)
remuestreo_creditos_1000 <- muestra_creditos %>%
rep_sample_n(size = 50, reps = 1000, replace = TRUE) %>%
group_by(replicate) %>%
summarise(prop_mujeres = sum(genero == "M") / 50)
remuestreo_creditos_1000 %>%
ggplot(aes(x = prop_mujeres)) +
geom_histogram(color = "white", bins = 15) +
labs(title = "Distribución bootstrapp con n = 1000")
promedio_remuestreo <- remuestreo_creditos_1000$prop_mujeres %>% mean()
promedio_remuestreo
## [1] 0.32002
desv_remuestreo <- remuestreo_creditos_1000$prop_mujeres %>% sd()
desv_remuestreo
## [1] 0.06585998
quantile(x = remuestreo_creditos_1000$prop_mujeres, probs = c(0.025, 0.975))
## 2.5% 97.5%
## 0.1995 0.4600
\[\bar{X} \pm z \times Error\ EstƔndar\]
# Niveles de confianza frecuentes para z
qnorm(p = 0.05) # 90 % de confianza
## [1] -1.644854
qnorm(p = 0.025) # 95 % de confianza
## [1] -1.959964
qnorm(p = 0.005) # 99 % de confianza
## [1] -2.575829
limite_inferior <- promedio_remuestreo - (1.959964 * desv_remuestreo)
limite_superior <- promedio_remuestreo + (1.959964 * desv_remuestreo)
c(limite_inferior, limite_superior)
## [1] 0.1909368 0.4491032
infer
specify()
generate()
calculate()
visualize()
get_confidence_interval()
.
Nota: para mejorar la visualización de los intervalos
de confianza, se puede utilizar la función
shade_confidence_interval()
set.seed(2022)
remuestreo_creditos_infer <- muestra_creditos %>%
specify(response = genero, success = "M") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "prop")
remuestreo_creditos_infer
remuestreo_creditos_infer %>%
visualize()
ic_percentiles <-
remuestreo_creditos_infer %>%
get_confidence_interval(level = 0.95, type = "percentile")
ic_percentiles
remuestreo_creditos_infer %>%
visualize() +
shade_confidence_interval(endpoints = ic_percentiles)
proporcion_muestral <-
prop.table(table(muestra_creditos$genero))["M"]
ic_errorestandar <- remuestreo_creditos_infer %>%
get_confidence_interval(level = 0.95,
type = "se",
point_estimate = proporcion_muestral)
ic_errorestandar
remuestreo_creditos_infer %>%
visualize() +
shade_confidence_interval(endpoints = ic_errorestandar)
remuestreo_creditos_infer %>%
visualize() +
shade_confidence_interval(endpoints = ic_percentiles,
color = "forestgreen",
fill = "white") +
shade_confidence_interval(endpoints = ic_errorestandar,
color = "dodgerblue",
fill = "white") +
geom_vline(xintercept = prop_poblacional, color = "red", lty = 2, size = 1.5) +
geom_vline(xintercept = promedio_remuestreo, color = "black", lty = 2, size = 1.5)
\[\hat{p}-Z_{\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n}} < p < \hat{p}+Z_{\alpha/2}\sqrt{\frac{\hat{p}(1-\hat{p})}{n}}\]
total_mujeres <- 16
total_muestra <- 50
proporcion_referencia <- 0.5
prop.test(x = total_mujeres, n = total_muestra, p = proporcion_referencia)
##
## 1-sample proportions test with continuity correction
##
## data: total_mujeres out of total_muestra, null probability proporcion_referencia
## X-squared = 5.78, df = 1, p-value = 0.01621
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1992781 0.4683118
## sample estimates:
## p
## 0.32
media_poblacional <- guinea$proteina %>% mean()
media_poblacional
## [1] 10.24264
set.seed(2022)
muestra_guinea <- guinea %>%
rep_sample_n(size = 50, reps = )
set.seed(2022)
remuestreo_guinea <- muestra_guinea %>%
specify(response = proteina) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
remuestreo_guinea
remuestreo_guinea %>%
visualize()
ic_guinea_percentil <-
remuestreo_guinea %>%
get_confidence_interval(level = 0.95, type = "percentile")
ic_guinea_percentil
remuestreo_guinea %>%
visualize() +
shade_confidence_interval(endpoints = ic_guinea_percentil) +
geom_vline(
xintercept = media_poblacional,
color = "red",
lty = 2,
size = 1.5
) +
geom_vline(
xintercept = mean(remuestreo_guinea$stat),
color = "black",
lty = 2,
size = 1.5
)
\[\bar{X} - t_{\alpha/2, n-1}\frac{S}{\sqrt{n}} < \mu < \bar{X} + t_{\alpha/2, n-1}\frac{S}{\sqrt{n}}\]
t.test(x = muestra_guinea$proteina, mu = 10, conf.level = 0.95)
##
## One Sample t-test
##
## data: muestra_guinea$proteina
## t = 1.3913, df = 49, p-value = 0.1704
## alternative hypothesis: true mean is not equal to 10
## 95 percent confidence interval:
## 9.803835 11.078965
## sample estimates:
## mean of x
## 10.4414
medias_poblacionales <-
guinea %>%
group_by(departamento) %>%
summarise(promedio_poblacional = mean(digestibilidad))
medias_poblacionales
media_antioquia <- medias_poblacionales$promedio_poblacional[1]
media_tolima <- medias_poblacionales$promedio_poblacional[2]
diferencia_poblacional <- media_antioquia - media_tolima
diferencia_poblacional
## [1] -1.588059
set.seed(2022)
muestra_digestibilidad <-
guinea %>%
rep_sample_n(size = 50, reps = 1)
muestra_digestibilidad
set.seed(2022)
remuestreo_digestibilidad <- muestra_digestibilidad %>%
specify(formula = digestibilidad ~ departamento) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("Antioquia", "Tolima"))
remuestreo_digestibilidad
remuestreo_digestibilidad %>%
visualize()
ic_diges_percentil <-
remuestreo_digestibilidad %>%
get_confidence_interval(level = 0.95, type = "percentile")
ic_diges_percentil
remuestreo_digestibilidad %>%
visualize() +
shade_confidence_interval(endpoints = ic_diges_percentil) +
geom_vline(
xintercept = diferencia_poblacional,
color = "red",
lty = 2,
size = 1.5
) +
geom_vline(
xintercept = mean(remuestreo_digestibilidad$stat),
color = "black",
lty = 2,
size = 1.5
)
t.test(muestra_digestibilidad$digestibilidad ~ muestra_digestibilidad$departamento,
mu = 0, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: muestra_digestibilidad$digestibilidad by muestra_digestibilidad$departamento
## t = -1.5577, df = 43.939, p-value = 0.1265
## alternative hypothesis: true difference in means between group Antioquia and group Tolima is not equal to 0
## 95 percent confidence interval:
## -2.3088622 0.2957916
## sample estimates:
## mean in group Antioquia mean in group Tolima
## 56.57966 57.58619