2023-01-31 (updated: 2023-12-14)
\[E(\widehat{SATE})= E\left(\frac{1}{n_1}\sum_{i=1}^n T_iY_i(1)-\frac{1}{n-n_1}\sum_{i=1}^n (1-T_i)Y_i(0) \right)\\=\frac{1}{n_1} \sum_{i=1}^nE(T_i)Y_i(1)-\frac{1}{n-n_1}\sum_{i=1}^n E(1-T_i)Y_i(0)\\=\frac{1}{n_1}\sum_{i=1}^n \frac{n_1}{n}Y_i(1)-\frac{1}{n-n_1}\sum_{i=1}^n \left(1-\frac{n_1}{n}\right)Y_i(0)\\=\frac{1}{n_1} \sum_{i=1}^n\frac{n_1}{n}Y_i(1)-\frac{1}{n-n_1}\sum_{i=1}^n \left(\frac{n-n_1}{n}\right)Y_i(0)\\=\frac{1}{n}\sum_{i=1}^n\{Y_i(1)-Y_i(0) \}= \mbox{SATE}\]
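The key step in the derivation is that, under complete randomization of \(n_1\) out of \(n\) units, \(E(T_i)=n_1/n\) for every unit. A minimal simulation check of that fact (base R only; the numbers 100 and 50 match the example below):

n <- 100
n1 <- 50
# each column is one random assignment of n1 treated units; rowMeans gives each
# unit's empirical probability of being treated across 5000 assignments
assign_prob <- rowMeans(replicate(5000, seq_len(n) %in% sample(n, n1)))
summary(assign_prob)   # every unit's probability is close to n1/n = 0.5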
library(tidyverse)
n <- 100
mu0 <- 0
sd0 <- 1
mu1 <- 1
sd1 <- 1
smpl <- tibble(id = seq_len(n),
Y0 = rnorm(n, mean=mu0, sd=sd0),
Y1 = rnorm(n, mean=mu1, sd=sd1),
tau = Y1 - Y0)
SATE <- smpl %>% select(tau) %>% summarize(SATE = mean(tau)) %>% pull()
SATE
## [1] 0.7475618
sim_treat <- function(smpl) {
n <- nrow(smpl)
idx <- sample(seq_len(n), floor(nrow(smpl)/2), replace = FALSE)
smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(Y_obs = mean(Y_obs)) %>%
pivot_wider(names_from = treat, values_from = Y_obs) %>%
rename(Y1_mean = `1`, Y0_mean = `0`) %>%
mutate(diff_mean = Y1_mean - Y0_mean,
est_error = diff_mean - SATE)
}
sim_treat(smpl)
## # A tibble: 1 × 4 ## Y0_mean Y1_mean diff_mean est_error ## <dbl> <dbl> <dbl> <dbl> ## 1 0.148 0.723 0.575 -0.172
n <- nrow(smpl)
idx <- sample(seq_len(n), floor(nrow(smpl)/2), replace = FALSE)
n
## [1] 100
idx
## [1] 24 70 1 13 65 17 52 42 18 84 37 38 19 9 41 33 59 74 21 ## [20] 87 79 93 90 91 28 80 67 76 71 35 45 62 69 46 95 20 50 99 ## [39] 51 66 86 88 56 58 75 26 6 100 97 23
smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
head(smpl)
## # A tibble: 6 × 5 ## id Y0 Y1 tau treat ## <int> <dbl> <dbl> <dbl> <int> ## 1 1 1.27 1.93 0.654 1 ## 2 2 -1.18 0.864 2.04 0 ## 3 3 0.386 0.612 0.226 0 ## 4 4 2.31 -0.936 -3.25 0 ## 5 5 -0.274 1.54 1.81 0 ## 6 6 -1.29 1.17 2.46 1
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0))
## # A tibble: 100 × 6 ## id Y0 Y1 tau treat Y_obs ## <int> <dbl> <dbl> <dbl> <int> <dbl> ## 1 1 1.27 1.93 0.654 1 1.93 ## 2 2 -1.18 0.864 2.04 0 -1.18 ## 3 3 0.386 0.612 0.226 0 0.386 ## 4 4 2.31 -0.936 -3.25 0 2.31 ## 5 5 -0.274 1.54 1.81 0 -0.274 ## 6 6 -1.29 1.17 2.46 1 1.17 ## 7 7 -0.463 0.454 0.917 0 -0.463 ## 8 8 1.07 1.24 0.161 0 1.07 ## 9 9 0.537 0.430 -0.107 1 0.430 ## 10 10 0.137 -1.09 -1.23 0 0.137 ## # ℹ 90 more rows
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(Y_obs = mean(Y_obs))
## # A tibble: 2 × 2 ## treat Y_obs ## <int> <dbl> ## 1 0 0.0679 ## 2 1 0.749
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(Y_obs = mean(Y_obs)) %>%
pivot_wider(names_from = treat, values_from = Y_obs)
## # A tibble: 1 × 2 ## `0` `1` ## <dbl> <dbl> ## 1 0.0679 0.749
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(Y_obs = mean(Y_obs)) %>%
pivot_wider(names_from = treat, values_from = Y_obs) %>%
rename(Y1_mean = `1`, Y0_mean = `0`) %>%
mutate(diff_mean = Y1_mean - Y0_mean,
est_error = diff_mean - SATE)
## # A tibble: 1 × 4 ## Y0_mean Y1_mean diff_mean est_error ## <dbl> <dbl> <dbl> <dbl> ## 1 0.0679 0.749 0.681 -0.0669
We use map_df() to apply the function 500 times to our sample and look at the estimation error.
sims <- 500
sate_sims <- map_df(seq_len(sims), ~ sim_treat(smpl))
summary(sate_sims$est_error)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -0.3789586 -0.0927292 0.0082436 0.0006646 0.0825342 0.4224462
To study estimation of the PATE, we define a function analogous to sim_treat(), but one that draws a new sample in every simulation.
PATE <- mu1 - mu0
sim_pate <- function(n, mu0, mu1, sd0, sd1){
PATE = mu1 - mu0
smpl <- tibble(Y0 = rnorm(n ,mean=mu0, sd=sd0),
Y1 = rnorm(n, mean=mu1, sd=sd1),
tau = Y1 - Y0)
idx <- sample(seq_len(n), floor(nrow(smpl)/2), replace = FALSE)
smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(Y_obs = mean(Y_obs)) %>%
pivot_wider(names_from = treat, values_from = Y_obs) %>%
rename(Y1_mean = `1`, Y0_mean = `0`) %>%
mutate(diff_mean = Y1_mean - Y0_mean,
est_error = diff_mean - PATE)
}
sims <- 500
pate_sims <- map_df(seq_len(sims), ~ sim_pate(n, mu0, mu1, sd0, sd1))
summary(pate_sims$est_error)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -0.549557 -0.118001 0.002255 0.003158 0.135687 0.466888
pate_sims %>%
ggplot(aes(diff_mean, after_stat(density))) +
geom_histogram(binwidth = 0.1) +
geom_vline(xintercept = PATE, color="black", linewidth=0.5) +
labs(title = "Sampling distribution",
x="Difference-in-means estimator",
y="Density") +
theme_classic()
pate_sims %>% select(diff_mean) %>% summarize(sd = sd(diff_mean))
## # A tibble: 1 × 1 ## sd ## <dbl> ## 1 0.184
RMSE <- pate_sims %>% summarize(rmse = sqrt(mean(est_error^2)))
\[\mbox{MSE}=E[(\hat{\theta}-\theta)^2]=E[(\hat{\theta}-E(\hat{\theta})+E(\hat{\theta})-\theta)^2]\\=E[((\hat{\theta}-E(\hat{\theta}))+(E(\hat{\theta})-\theta))^2]\\=E[(\hat{\theta}-E(\hat{\theta}))^2] +2E[(\hat{\theta}-E(\hat{\theta}))(E(\hat{\theta})-\theta)]+E[(E(\hat{\theta})-\theta)^2]\\=E[(\hat{\theta}-E(\hat{\theta}))^2] + (E(\hat{\theta})-\theta)^2=\mbox{variance} + \mbox{bias}^2\]
\[E[(\hat{\theta}-E(\hat{\theta}))(E(\hat{\theta})-\theta)]=E[\hat{\theta}E(\hat{\theta})-\hat{\theta}\theta-E(\hat{\theta})^2+E(\hat{\theta})\theta]\\=E(\hat{\theta})^2-E(\hat{\theta})\theta-E(\hat{\theta})^2+E(\hat{\theta})\theta=0\]
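A quick numerical check of the decomposition using the pate_sims simulations above (a minimal sketch; small discrepancies remain because var() divides by n - 1):

pate_sims %>%
  summarize(mse = mean(est_error^2),
            variance = var(diff_mean),
            bias2 = mean(est_error)^2,
            var_plus_bias2 = variance + bias2)   # var_plus_bias2 is close to mse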
\[\mbox{standard error}=\sqrt{\frac{\bar{X}_n(1-\bar{X}_n)}{n}}\]
\[\mbox{standard error of the sample mean}=\sqrt{\widehat{V(\bar{X}_n)}}=\sqrt{\frac{\widehat{V(X)}}{n}}\]
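A minimal sketch of the plug-in standard error for a sample proportion (hypothetical data, drawn only for illustration; the names x_sim and x_bar_sim are ours):

x_sim <- rbinom(1000, size = 1, prob = 0.6)         # hypothetical binary sample
x_bar_sim <- mean(x_sim)
sqrt(x_bar_sim * (1 - x_bar_sim) / length(x_sim))   # estimated standard error of the sample mean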
sim_pate_se <- function(n, mu0, mu1, sd0, sd1){
PATE = mu1 - mu0
smpl <- tibble(Y0 = rnorm(n ,mean=mu0, sd=sd0),
Y1 = rnorm(n, mean=mu1, sd=sd1),
tau = Y1 - Y0)
idx <- sample(seq_len(n), floor(nrow(smpl)/2), replace = FALSE)
smpl[["treat"]] <- as.integer(seq_len(nrow(smpl)) %in% idx)
smpl %>%
mutate(Y_obs = if_else(treat==1, Y1, Y0)) %>%
group_by(treat) %>%
summarize(mean = mean(Y_obs),
var = var(Y_obs),
nobs=n()) %>%
summarize(diff_mean = diff(mean),
se = sqrt(sum(var/nobs)),
# note: despite its name, std_error below is the estimation error (diff_mean - PATE)
std_error = diff_mean - PATE)
}
sim_pate_se(n, mu0, mu1, sd0, sd1)
## # A tibble: 1 × 3 ## diff_mean se std_error ## <dbl> <dbl> <dbl> ## 1 1.06 0.191 0.0598
sims <- 500
pate_sims_se <- map_df(seq_len(sims),
~ sim_pate_se(n, mu0, mu1, sd0, sd1))
sd_se <- pate_sims_se %>%
summarize(sd = sd(diff_mean),
mean_se = mean(se))
sd_se
## # A tibble: 1 × 2 ## sd mean_se ## <dbl> <dbl> ## 1 0.201 0.199
\[\sqrt{\frac{V(Y(1))}{n_1}+\frac{V(Y(0))}{n-n_1}}=\sqrt{\frac{1}{50}+\frac{1}{50}}\\=\sqrt{\frac{1}{25}}=\frac{1}{5}\]
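The same calculation in R, a quick check using sd0 and sd1 from the simulation above:

sqrt(sd1^2/50 + sd0^2/50)   # 0.2, close to the simulated sd and mean_se reported above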
\[\bar{X}_n- E(X) \sim N\left(0, \frac{V(X)}{n}\right)\]
\[\bar{X}_n \sim N\left(E(X), \frac{V(X)}{n}\right)\]
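A minimal sketch of this approximation with a deliberately non-normal distribution (a hypothetical example using the exponential with rate 1, so that V(X) = 1):

xbars <- map_dbl(seq_len(1000), ~ mean(rexp(100, rate = 1)))
c(sd_simulated = sd(xbars), sd_theory = sqrt(1/100))   # both close to 0.1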
qnorm() computes the critical value for each confidence level.
# alpha = 0.01 -> 99% confidence level
qnorm(0.995) # 1 - 0.01/2
## [1] 2.575829
# alpha = 0.05 -> 95% confidence level
qnorm(0.975) # 1 - 0.05/2
## [1] 1.959964
# alpha = 0.1 -> 90% confidence level
qnorm(0.95) # 1 - 0.1/2
## [1] 1.644854
n <- 1000
x_bar <- 0.6
se <- sqrt(x_bar * (1-x_bar)/n)
levels <- c(0.99, 0.95, 0.9)
tibble(level = levels) %>%
mutate(ci_lower = x_bar - qnorm(1-(1-level)/2)*se,
ci_upper = x_bar + qnorm(1-(1-level)/2)*se)
## # A tibble: 3 × 3 ## level ci_lower ci_upper ## <dbl> <dbl> <dbl> ## 1 0.99 0.560 0.640 ## 2 0.95 0.570 0.630 ## 3 0.9 0.575 0.625
level <- 0.95
pate_sims_ci <- pate_sims_se %>%
mutate(ci_lower = diff_mean - qnorm(1 - (1-level)/2)*se,
ci_upper = diff_mean + qnorm(1 - (1-level)/2)*se,
includes_pate = PATE > ci_lower & PATE < ci_upper)
glimpse(pate_sims_ci)
## Rows: 500 ## Columns: 6 ## $ diff_mean <dbl> 0.8218692, 1.0451775, 1.0508220, 1.0515216, 1.3311669, 1… ## $ se <dbl> 0.1923786, 0.2052930, 0.2013800, 0.2121763, 0.1819019, 0… ## $ std_error <dbl> -0.17813084, 0.04517751, 0.05082202, 0.05152156, 0.33116… ## $ ci_lower <dbl> 0.4448140, 0.6428107, 0.6561245, 0.6356637, 0.9746458, 0… ## $ ci_upper <dbl> 1.1989243, 1.4475443, 1.4455195, 1.4673794, 1.6876880, 1… ## $ includes_pate <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, …
pate_sims_ci %>% summarize(coverage = mean(includes_pate))
## # A tibble: 1 × 1 ## coverage ## <dbl> ## 1 0.95
pate_sim_coverage <- function(.data, level = 0.95){
mutate(.data,
ci_lower = diff_mean - qnorm(1 - (1-level)/2)*se,
ci_upper = diff_mean + qnorm(1 - (1-level)/2)*se,
includes_pate = PATE > ci_lower & PATE < ci_upper) %>%
summarize(coverage = mean(includes_pate))
}
pate_sim_coverage(pate_sims_se)
## # A tibble: 1 × 1 ## coverage ## <dbl> ## 1 0.95
pate_sim_coverage(pate_sims_se, level = 0.95)
## # A tibble: 1 × 1 ## coverage ## <dbl> ## 1 0.95
pate_sim_coverage(pate_sims_se, level = 0.99)
## # A tibble: 1 × 1 ## coverage ## <dbl> ## 1 0.99
pate_sim_coverage(pate_sims_se, level = 0.9)
## # A tibble: 1 × 1 ## coverage ## <dbl> ## 1 0.896
binom_ci_contains <- function(n, p, alpha=0.05){
x <- rbinom(n, size=1, prob = p)
x_bar <- mean(x)
se <- sqrt(x_bar * (1-x_bar)/n)
ci_lower <- x_bar - qnorm(1 - alpha/2)*se
ci_upper <- x_bar + qnorm(1 - alpha/2)*se
(ci_lower <= p) & (p <= ci_upper)
}
p <- 0.6
n <- 10
binom_ci_contains(n = n, p = p, alpha = 0.05)
## [1] TRUE
map_lgl() is similar to map_df() but returns a logical vector instead of a data.frame. We use it to apply the binom_ci_contains() function many times with samples of size \(n\).
mean(map_lgl(seq_len(sims), ~ binom_ci_contains(n, p)))
## [1] 0.894
binom_ci_coverage <- function(n, p, sims){
mean(map_lgl(seq_len(sims), ~ binom_ci_contains(n,p)))
}
tibble(n=c(10, 100, 1000)) %>% mutate(coverage = map_dbl(n, binom_ci_coverage, p=p, sims=sims))
## # A tibble: 3 × 2 ## n coverage ## <dbl> <dbl> ## 1 10 0.876 ## 2 100 0.946 ## 3 1000 0.944
moe_pop_prop <- function(MoE) {
tibble(p = seq(from = 0.01, to = 0.99, by=0.01),
n = 1.96^2 * p *(1-p)/MoE^2,
MoE = MoE)
}
glimpse(moe_pop_prop(0.01))
## Rows: 99 ## Columns: 3 ## $ p <dbl> 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, … ## $ n <dbl> 380.3184, 752.9536, 1117.9056, 1475.1744, 1824.7600, 2166.6624, 25… ## $ MoE <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, …
MoE <- c(0.01, 0.03, 0.05)
props <- map_df(MoE, moe_pop_prop)
props %>%
ggplot(aes(x=p, y=n, color = factor(MoE))) +
geom_line(aes(linetype = factor(MoE))) +
labs(color = "Margin of error",
linetype = "Margin of error",
x = "Population proportion",
y = "Sample size") +
theme_classic() +
theme(legend.position = "bottom")
props %>% filter(p == 0.5) %>% select(MoE, n)
## # A tibble: 3 × 2 ## MoE n ## <dbl> <dbl> ## 1 0.01 9604 ## 2 0.03 1067. ## 3 0.05 384.
library(lubridate)
pres08 <- read.csv("pres08.csv")
polls08 <- read.csv("polls08.csv") %>%
mutate(middate = as.Date(middate))
ELECTION_DATE <- ymd(20081104)
polls08 <- polls08 %>%
mutate(DaysToElection = as.integer(ELECTION_DATE - middate))
poll_pred <- polls08 %>%
group_by(state) %>%
filter(DaysToElection == min(DaysToElection)) %>%
summarize(Obama = mean(Obama)/100)
head(poll_pred)
## # A tibble: 6 × 2 ## state Obama ## <chr> <dbl> ## 1 AK 0.39 ## 2 AL 0.36 ## 3 AR 0.44 ## 4 AZ 0.465 ## 5 CA 0.6 ## 6 CO 0.52
sample_size <- 1000
alpha <- 0.05
poll_pred <- poll_pred %>%
mutate(se = sqrt(Obama * (1-Obama)/sample_size),
ci_lwr = Obama + qnorm(alpha/2)*se,
ci_upr = Obama + qnorm(1-alpha/2)*se)
head(poll_pred)
## # A tibble: 6 × 5 ## state Obama se ci_lwr ci_upr ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 AK 0.39 0.0154 0.360 0.420 ## 2 AL 0.36 0.0152 0.330 0.390 ## 3 AR 0.44 0.0157 0.409 0.471 ## 4 AZ 0.465 0.0158 0.434 0.496 ## 5 CA 0.6 0.0155 0.570 0.630 ## 6 CO 0.52 0.0158 0.489 0.551
poll_pred <- poll_pred %>%
left_join(select(pres08, state, actual = Obama), by="state") %>%
mutate(actual = actual/100,
covers = (ci_lwr <= actual) & (actual <= ci_upr))
poll_pred %>%
ggplot(aes(actual, Obama, color = as.factor(covers))) +
geom_abline(intercept = 0, slope = 1, color = "black", linewidth = 0.5) +
geom_pointrange(aes(ymin=ci_lwr, ymax=ci_upr), shape=1) +
scale_y_continuous(limits = c(0,1)) +
scale_x_continuous(limits = c(0,1)) +
scale_color_discrete("CI includes result?") +
labs(x = "Obama's vote share",
y= "Poll prediction") +
coord_fixed() +
theme_classic()
poll_pred %>% summarize(mean(covers))
## # A tibble: 1 × 1 ## `mean(covers)` ## <dbl> ## 1 0.588
poll_pred <- poll_pred %>%
mutate(bias = Obama - actual) %>%
mutate(Obama_bc = Obama - mean(bias),
se_bc = sqrt(Obama_bc * (1-Obama_bc)/ sample_size),
ci_lwr_bc = Obama_bc + qnorm(alpha/2)*se_bc,
ci_upr_bc = Obama_bc + qnorm(1-alpha/2)*se_bc,
covers_bc = (ci_lwr_bc <= actual) & (actual <= ci_upr_bc))
poll_pred %>% summarize(mean(bias))
## # A tibble: 1 × 1 ## `mean(bias)` ## <dbl> ## 1 -0.0268
mean(poll_pred$bias)
## [1] -0.02679739
poll_pred %>% summarize(mean(covers_bc))
## # A tibble: 1 × 1 ## `mean(covers_bc)` ## <dbl> ## 1 0.765
STAR <- read.csv("STAR.csv")
STAR <- STAR %>%
mutate(classtype = factor(classtype,
labels = c("Small class", "Regular class",
"Regular class with aide")))
glimpse(STAR)
## Rows: 6,325 ## Columns: 6 ## $ race <int> 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,… ## $ classtype <fct> Regular class with aide, Regular class with aide, Regular c… ## $ yearssmall <int> 0, 0, 0, 4, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0,… ## $ hsgrad <int> NA, NA, 1, NA, NA, NA, NA, NA, 1, 1, NA, NA, 1, NA, 1, NA, … ## $ g4math <int> NA, 706, 711, 672, NA, NA, 668, NA, 709, 698, NA, NA, NA, N… ## $ g4reading <int> NA, 661, 750, 659, NA, NA, 657, NA, 725, 692, NA, NA, NA, N…
classtype_means <- STAR %>%
group_by(classtype) %>%
summarize(g4reading = mean(g4reading, na.rm = TRUE))
classtype_means
## # A tibble: 3 × 2 ## classtype g4reading ## <fct> <dbl> ## 1 Small class 723. ## 2 Regular class 720. ## 3 Regular class with aide 721.
classtypes_used <- c("Small class", "Regular class")
STAR %>%
filter(classtype %in% classtypes_used, !is.na(g4reading)) %>%
ggplot(aes(g4reading, after_stat(density))) +
geom_histogram(binwidth = 20) +
geom_vline(data = filter(classtype_means, classtype %in% classtypes_used),
mapping = aes(xintercept=g4reading),
color = "blue", linewidth = 0.5) +
facet_grid(classtype ~.) +
labs(x="Fourth-grade reading test scores",
y="Density") +
theme_classic()
alpha <- 0.05
star_estimates <- STAR %>%
filter(!is.na(g4reading), classtype %in% classtypes_used) %>%
group_by(classtype) %>%
summarize(n = n(),
est = mean(g4reading),
se = sd(g4reading)/sqrt(n)) %>%
mutate(lwr = est + qnorm(alpha/2)*se,
upr = est + qnorm(1-alpha/2)*se)
star_estimates
## # A tibble: 2 × 6 ## classtype n est se lwr upr ## <fct> <int> <dbl> <dbl> <dbl> <dbl> ## 1 Small class 726 723. 1.91 720. 727. ## 2 Regular class 836 720. 1.84 716. 723.
star_ate <- star_estimates %>%
arrange(desc(classtype)) %>%
summarize(se = sqrt(sum(se^2)),
est = diff(est)) %>%
mutate(ci_lwr = est + qnorm(alpha/2)*se,
ci_up = est + qnorm(1-alpha/2)*se)
star_ate
## # A tibble: 1 × 4 ## se est ci_lwr ci_up ## <dbl> <dbl> <dbl> <dbl> ## 1 2.65 3.50 -1.70 8.70
alfa <- c(0.01, 0.05, 0.1)
data.frame(alfa, normal = qnorm(1-alfa/2), t = qt(1-alfa/2, df=5))
## alfa normal t ## 1 0.01 2.575829 4.032143 ## 2 0.05 1.959964 2.570582 ## 3 0.10 1.644854 2.015048
data.frame(alfa, normal = qnorm(1-alfa/2), t=qt(1-alfa/2, df=50))
## alfa normal t ## 1 0.01 2.575829 2.677793 ## 2 0.05 1.959964 2.008559 ## 3 0.10 1.644854 1.675905
alpha <- 0.05
star_estimates_t <- STAR %>%
filter(!is.na(g4reading), classtype %in% classtypes_used) %>%
group_by(classtype) %>%
summarize(n=n(),
est = mean(g4reading),
se = sd(g4reading)/sqrt(n)) %>%
mutate(lwr = est + qt(alpha/2, df=n-1)*se,
upr = est + qt(1-alpha/2, df=n-1)*se)
star_estimates_t
## # A tibble: 2 × 6 ## classtype n est se lwr upr ## <fct> <int> <dbl> <dbl> <dbl> <dbl> ## 1 Small class 726 723. 1.91 720. 727. ## 2 Regular class 836 720. 1.84 716. 723.
reading_small <- filter(STAR, classtype == "Small class")$g4reading
reading_reg <- filter(STAR, classtype == "Regular class")$g4reading
t.ci <- t.test(reading_small, reading_reg)
t.ci
## ## Welch Two Sample t-test ## ## data: reading_small and reading_reg ## t = 1.3195, df = 1541.2, p-value = 0.1872 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -1.703591 8.706055 ## sample estimates: ## mean of x mean of y ## 723.3912 719.8900
| Cup | Lady | Correct | C1 | C2 | C3 |
|---|---|---|---|---|---|
| 1 | M | M | T | T | T |
| 2 | T | T | T | T | M |
| 3 | T | T | T | T | M |
| 4 | M | M | T | M | M |
| 5 | M | M | M | M | T |
| 6 | T | T | M | M | T |
| 7 | T | T | M | T | M |
| 8 | M | M | M | M | T |
…
cups <- 4
k <- c(0, seq_len(cups))
true <- tibble(correct = k*2,
n = choose(cups, k)*choose(cups, cups-k)) %>%
mutate(prob = n/sum(n))
true
## # A tibble: 5 × 3 ## correct n prob ## <dbl> <dbl> <dbl> ## 1 0 1 0.0143 ## 2 2 16 0.229 ## 3 4 36 0.514 ## 4 6 16 0.229 ## 5 8 1 0.0143
sims <- 500
guess <- tibble(guess = c("M", "T", "T", "M", "M", "T", "T", "M"))
randomize_tea <- function(df) {
assignment <- slice_sample(df, n = 8) %>%
rename(actual = guess)
bind_cols(df, assignment) %>%
summarize(correct = sum(guess == actual))
}
slice_sample() randomly draws \(n\) rows from the data.frame; the default is sampling without replacement, and to sample with replacement you set the replace argument to TRUE.
map_df(seq_len(10), ~ randomize_tea(guess))
## # A tibble: 10 × 1 ## correct ## <int> ## 1 4 ## 2 4 ## 3 4 ## 4 6 ## 5 4 ## 6 0 ## 7 4 ## 8 4 ## 9 6 ## 10 2
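As a quick illustration of the replace argument mentioned above, a minimal sketch on the guess data (not part of the original analysis):

slice_sample(guess, n = 3)                  # default: without replacement
slice_sample(guess, n = 3, replace = TRUE)  # with replacement, rows can repeat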
approx <- map_df(seq_len(sims), ~ randomize_tea(guess)) %>% count(correct) %>% mutate(prob = n/sum(n))
approx
## # A tibble: 5 × 3 ## correct n prob ## <int> <int> <dbl> ## 1 0 11 0.022 ## 2 2 107 0.214 ## 3 4 254 0.508 ## 4 6 122 0.244 ## 5 8 6 0.012
results <- approx %>% select(correct, prob_sim = prob) %>% left_join(select(true, correct, prob_exact = prob), by="correct") %>% mutate(diff = prob_sim - prob_exact)
results
## # A tibble: 5 × 4 ## correct prob_sim prob_exact diff ## <dbl> <dbl> <dbl> <dbl> ## 1 0 0.022 0.0143 0.00771 ## 2 2 0.214 0.229 -0.0146 ## 3 4 0.508 0.514 -0.00629 ## 4 6 0.244 0.229 0.0154 ## 5 8 0.012 0.0143 -0.00229
| | Reject H0 | Do not reject H0 |
|---|---|---|
| H0 true | Type I error | Correct |
| H0 false | Correct | Type II error |
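A minimal sketch (hypothetical numbers; the names sims_t1 and reject are ours) estimating the type I error rate by simulating data under H0 and counting rejections at the 5% level:

sims_t1 <- 1000
reject <- map_lgl(seq_len(sims_t1), function(i) {
  x <- rbinom(100, size = 1, prob = 0.5)              # data generated under H0: p = 0.5
  prop.test(sum(x), n = 100, p = 0.5)$p.value < 0.05  # reject at the 5% level?
})
mean(reject)   # close to (in fact slightly below) 0.05 because of the continuity correction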
fisher.test() performs Fisher's exact test. Its argument is a \(2\times 2\) contingency matrix whose rows and columns give the distribution of the treatment and the outcome variable.
x <- matrix(c(4,0,0,4), byrow = TRUE, ncol=2, nrow=2)
y <- matrix(c(3,1,1,3), byrow = TRUE, ncol=2, nrow=2)
rownames(x) <- colnames(x) <- rownames(y) <-
colnames(y) <- c("M", "T")
x
## M T ## M 4 0 ## T 0 4
y
## M T ## M 3 1 ## T 1 3
fisher.test(x, alternative = "greater")
## ## Fisher's Exact Test for Count Data ## ## data: x ## p-value = 0.01429 ## alternative hypothesis: true odds ratio is greater than 1 ## 95 percent confidence interval: ## 2.003768 Inf ## sample estimates: ## odds ratio ## Inf
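Note that this one-sided p-value is exactly the probability of getting all eight cups right under random guessing, which we computed in the true table above:

true %>% filter(correct == 8) %>% pull(prob)   # 1/70, approximately 0.0143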
fisher.test(x)
## ## Fisher's Exact Test for Count Data ## ## data: x ## p-value = 0.02857 ## alternative hypothesis: true odds ratio is not equal to 1 ## 95 percent confidence interval: ## 1.339059 Inf ## sample estimates: ## odds ratio ## Inf
fisher.test(y)
## ## Fisher's Exact Test for Count Data ## ## data: y ## p-value = 0.4857 ## alternative hypothesis: true odds ratio is not equal to 1 ## 95 percent confidence interval: ## 0.2117329 621.9337505 ## sample estimates: ## odds ratio ## 6.408309
pnorm() computes the probability we need.
n <- 1018
x.bar <- 550/n
se <- sqrt(0.5 * 0.5/n)
upper <- pnorm(x.bar, mean = 0.5, sd = se, lower.tail = FALSE)
lower <- pnorm(0.5 - (x.bar - 0.5), mean = 0.5, sd = se)
upper+lower
## [1] 0.01016866
2*upper
## [1] 0.01016866
upper
## [1] 0.005084332
z.score <- (x.bar - 0.5)/se
z.score
## [1] 2.57004
pnorm(z.score, lower.tail = FALSE)
## [1] 0.005084332
2*pnorm(z.score, lower.tail = FALSE)
## [1] 0.01016866
c(x.bar - qnorm(0.995)*se, x.bar + qnorm(0.995)*se)
## [1] 0.4999093 0.5806408
c(x.bar - qnorm(0.975)*se, x.bar + qnorm(0.975)*se)
## [1] 0.5095605 0.5709896
prop.test() implements the hypothesis test and computes the confidence interval.
prop.test(550, n = n, p = 0.5, correct = FALSE)
## ## 1-sample proportions test without continuity correction ## ## data: 550 out of n, null probability 0.5 ## X-squared = 6.6051, df = 1, p-value = 0.01017 ## alternative hypothesis: true p is not equal to 0.5 ## 95 percent confidence interval: ## 0.5095661 0.5706812 ## sample estimates: ## p ## 0.540275
prop.test(550, n=n, p=0.5)
## ## 1-sample proportions test with continuity correction ## ## data: 550 out of n, null probability 0.5 ## X-squared = 6.445, df = 1, p-value = 0.01113 ## alternative hypothesis: true p is not equal to 0.5 ## 95 percent confidence interval: ## 0.5090744 0.5711680 ## sample estimates: ## p ## 0.540275
prop.test(550, n=n, p=0.5, conf.level = 0.99)
## ## 1-sample proportions test with continuity correction ## ## data: 550 out of n, null probability 0.5 ## X-squared = 6.445, df = 1, p-value = 0.01113 ## alternative hypothesis: true p is not equal to 0.5 ## 99 percent confidence interval: ## 0.4994182 0.5806040 ## sample estimates: ## p ## 0.540275
The one-sample t-test can be carried out with t.test().
t.test(STAR$g4reading, mu = 710)
## ## One Sample t-test ## ## data: STAR$g4reading ## t = 10.407, df = 2352, p-value < 2.2e-16 ## alternative hypothesis: true mean is not equal to 710 ## 95 percent confidence interval: ## 719.1284 723.3671 ## sample estimates: ## mean of x ## 721.2478
head(star_ate)
## # A tibble: 1 × 4 ## se est ci_lwr ci_up ## <dbl> <dbl> <dbl> <dbl> ## 1 2.65 3.50 -1.70 8.70
star_ate %>%
mutate(p_value_1sided = pnorm(-abs(est), mean=0, sd=se),
p_value_2sided = 2*pnorm(-abs(est), mean=0, sd=se))
## # A tibble: 1 × 6 ## se est ci_lwr ci_up p_value_1sided p_value_2sided ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 2.65 3.50 -1.70 8.70 0.0935 0.187
t.test() can also be used to perform the two-sample t-test.
reading_small <- filter(STAR, classtype == "Small class")$g4reading
reading_reg <- filter(STAR, classtype == "Regular class")$g4reading
t.test(reading_small, reading_reg)
## ## Welch Two Sample t-test ## ## data: reading_small and reading_reg ## t = 1.3195, df = 1541.2, p-value = 0.1872 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -1.703591 8.706055 ## sample estimates: ## mean of x mean of y ## 723.3912 719.8900
resume <- read.csv("resume.csv")
head(resume)
## firstname sex race call ## 1 Allison female white 0 ## 2 Kristen female white 0 ## 3 Lakisha female black 0 ## 4 Latonya female black 0 ## 5 Carrie female white 0 ## 6 Jay male white 0
x <- resume %>% count(race, call) %>% pivot_wider(names_from = call, values_from = n) %>% ungroup()
x
## # A tibble: 2 × 3 ## race `0` `1` ## <chr> <int> <int> ## 1 black 2278 157 ## 2 white 2200 235
prop.test(as.matrix(select(x,-race)), alternative = "greater")
## ## 2-sample test for equality of proportions with continuity correction ## ## data: as.matrix(select(x, -race)) ## X-squared = 16.449, df = 1, p-value = 2.499e-05 ## alternative hypothesis: greater ## 95 percent confidence interval: ## 0.01881967 1.00000000 ## sample estimates: ## prop 1 prop 2 ## 0.9355236 0.9034908
n0 <- sum(resume$race == "black")
n1 <- sum(resume$race == "white")
p <- mean(resume$call)
p0 <- mean(filter(resume, race == "black")$call)
p1 <- mean(filter(resume, race == "white")$call)
est <- p1 - p0
est
## [1] 0.03203285
se <- sqrt(p*(1-p)*(1/n0 + 1/n1))
se
## [1] 0.007796894
zstat <- est/se
zstat
## [1] 4.108412
pnorm(-abs(zstat))
## [1] 1.991943e-05
Turning off the continuity correction in prop.test() reproduces this result.
prop.test(as.matrix(select(x,-race)), alternative = "greater", correct = FALSE)
## ## 2-sample test for equality of proportions without continuity correction ## ## data: as.matrix(select(x, -race)) ## X-squared = 16.879, df = 1, p-value = 1.992e-05 ## alternative hypothesis: greater ## 95 percent confidence interval: ## 0.01923035 1.00000000 ## sample estimates: ## prop 1 prop 2 ## 0.9355236 0.9034908
…
dbinom(8, size=8, prob = 0.5)
## [1] 0.00390625
1/(2^8)
## [1] 0.00390625
dbinom(3, size=3, prob = 1/3) * dbinom(5, size=5, prob=0.5)
## [1] 0.001157407
1/(3^3) * 1/(2^5)
## [1] 0.001157407
n <- 250
p.star <- 0.48
p <- 0.5
alpha <- 0.05
cr.value <- qnorm(1-alpha/2)
se.star <- sqrt(p.star * (1-p.star)/n)
se <- sqrt(p*(1-p)/n)
# power of the test
pnorm(p - cr.value * se, mean = p.star, sd = se.star) +
pnorm(p + cr.value * se, mean = p.star, sd = se.star, lower.tail = FALSE)
## [1] 0.09673114
n <- 5000
p.star <- 0.48
p <- 0.5
alpha <- 0.05
cr.value <- qnorm(1-alpha/2)
se.star <- sqrt(p.star * (1-p.star)/n)
se <- sqrt(p*(1-p)/n)
# power of the test
pnorm(p - cr.value * se, mean = p.star, sd = se.star) +
pnorm(p + cr.value * se, mean = p.star, sd = se.star, lower.tail = FALSE)
## [1] 0.8076207
n <- 250
p.star <- 0.48
p <- 0.3
alpha <- 0.05
cr.value <- qnorm(1-alpha/2)
se.star <- sqrt(p.star * (1-p.star)/n)
se <- sqrt(p*(1-p)/n)
# power of the test
pnorm(p - cr.value * se, mean = p.star, sd = se.star) +
pnorm(p + cr.value * se, mean = p.star, sd = se.star, lower.tail = FALSE)
## [1] 0.9999517
n1 <- 500
n0 <- 500
p1.star <- 0.05
p0.star <- 0.1
p <- (n1*p1.star + n0*p0.star)/(n1+n0)
se <- sqrt(p*(1-p)*(1/n1 + 1/n0))
se.star <- sqrt(p1.star*(1-p1.star)/n1 + p0.star*(1-p0.star)/n0)
pnorm(-cr.value*se, mean = p1.star - p0.star, sd=se.star) + pnorm(cr.value*se, mean=p1.star-p0.star, sd=se.star, lower.tail = FALSE)
## [1] 0.85228
power.prop.test(n=500, p1=0.05, p2=0.1, sig.level = 0.05)
## ## Two-sample comparison of proportions power calculation ## ## n = 500 ## p1 = 0.05 ## p2 = 0.1 ## sig.level = 0.05 ## power = 0.8522797 ## alternative = two.sided ## ## NOTE: n is number in *each* group
power.prop.test(p1=0.05, p2=0.1, sig.level = 0.05, power=0.9)
## ## Two-sample comparison of proportions power calculation ## ## n = 581.0821 ## p1 = 0.05 ## p2 = 0.1 ## sig.level = 0.05 ## power = 0.9 ## alternative = two.sided ## ## NOTE: n is number in *each* group
power.t.test(n=100, delta = 0.25, sd=1, type="one.sample")
## ## One-sample t test power calculation ## ## n = 100 ## delta = 0.25 ## sd = 1 ## sig.level = 0.05 ## power = 0.6969757 ## alternative = two.sided
power.t.test(power=0.9, delta = 0.25, sd=1, type="one.sample")
## ## One-sample t test power calculation ## ## n = 170.0511 ## delta = 0.25 ## sd = 1 ## sig.level = 0.05 ## power = 0.9 ## alternative = two.sided
power.t.test(n=100, delta = 0.25, sd=1, type="two.sample")
## ## Two-sample t test power calculation ## ## n = 100 ## delta = 0.25 ## sd = 1 ## sig.level = 0.05 ## power = 0.4204383 ## alternative = two.sided ## ## NOTE: n is number in *each* group
power.t.test(power=0.9, delta = 0.25, sd=1, type="two.sample")
## ## Two-sample t test power calculation ## ## n = 337.2008 ## delta = 0.25 ## sd = 1 ## sig.level = 0.05 ## power = 0.9 ## alternative = two.sided ## ## NOTE: n is number in *each* group
minwage <- read.csv("minwage.csv")
minwage <- minwage %>%
mutate(fullPropBefore = fullBefore/(fullBefore + partBefore),
fullPropAfter = fullAfter/(fullAfter + partAfter),
NJ = if_else(location == "PA", 0, 1))
fit_minwage <- lm(fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore + chain,
data = minwage)
fit_minwage
## ## Call: ## lm(formula = fullPropAfter ~ -1 + NJ + fullPropBefore + wageBefore + ## chain, data = minwage) ## ## Coefficients: ## NJ fullPropBefore wageBefore chainburgerking ## 0.05422 0.16879 0.08133 -0.11563 ## chainkfc chainroys chainwendys ## -0.15080 -0.20639 -0.22013
fit_minwage1 <- lm(fullPropAfter ~ NJ + fullPropBefore + wageBefore + chain,
data = minwage)
fit_minwage1
## ## Call: ## lm(formula = fullPropAfter ~ NJ + fullPropBefore + wageBefore + ## chain, data = minwage) ## ## Coefficients: ## (Intercept) NJ fullPropBefore wageBefore chainkfc ## -0.11563 0.05422 0.16879 0.08133 -0.03517 ## chainroys chainwendys ## -0.09076 -0.10451
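The two parameterizations describe the same fitted model: without the intercept, the burgerking coefficient plays the role of the intercept, and each remaining chain coefficient in fit_minwage1 equals the corresponding difference of coefficients in fit_minwage. A quick check:

coef(fit_minwage)["chainkfc"] - coef(fit_minwage)["chainburgerking"]   # same value as below
coef(fit_minwage1)["chainkfc"]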
library(modelr)
pred_1 <- minwage %>%
slice(1) %>%
add_predictions(fit_minwage) %>%
select(pred) %>%
mutate(model = "fit_minwage")
pred_compare <- minwage %>%
slice(1) %>%
add_predictions(fit_minwage1) %>%
select(pred) %>%
mutate(model = "fit_minwage1") %>%
bind_rows(pred_1)
pred_compare
## pred model ## 1 0.2709367 fit_minwage1 ## 2 0.2709367 fit_minwage
\[\begin{align*}\hat{\beta}&=\frac{\sum_{i=1}^n (\beta X_i+\varepsilon_i-\beta\bar{X}-\bar{\varepsilon})(X_i-\bar{X})}{\sum_{i=1}^n(X_i - \bar{X})^2}\Rightarrow\\ \hat{\beta}&=\frac{\sum_{i=1}^n[\beta(X_i-\bar{X})+(\varepsilon_i-\bar{\varepsilon})](X_i-\bar{X})}{\sum_{i=1}^n(X_i - \bar{X})^2}\Rightarrow\\ \hat{\beta}&=\frac{\sum_{i=1}^n \beta (X_i-\bar{X})^2}{\sum_{i=1}^n(X_i - \bar{X})^2}+\frac{\sum_{i=1}^n(\varepsilon_i-\bar{\varepsilon})(X_i-\bar{X})}{\sum_{i=1}^n(X_i - \bar{X})^2}\Rightarrow \\ \hat{\beta} &= \beta + \underbrace{\frac{\sum_{i=1}^n(\varepsilon_i-\bar{\varepsilon})(X_i-\bar{X})}{\sum_{i=1}^n(X_i - \bar{X})^2}}_{\mbox{estimation error}}\end{align*}\]
\[\begin{align*}\sum_{i=1}^n(\varepsilon_i-\bar{\varepsilon})(X_i-\bar{X}) &= \sum_{i=1}^n\varepsilon_i(X_i - \bar{X})-\sum_{i=1}^n\bar{\varepsilon} (X_i - \bar{X})\\ &= \sum_{i=1}^n\varepsilon_i(X_i - \bar{X}) - \bar{\varepsilon}\left(\sum_{i=1}^nX_i - n\bar{X} \right) \\ &= \sum_{i=1}^n\varepsilon_i(X_i - \bar{X})\end{align*}\]
\[\begin{align*} E(\hat{\alpha})&=E(E(\hat{\alpha}|X))=E(\alpha)=\alpha\\E(\hat{\beta})&=E(E(\hat{\beta}|X))=E(\beta)=\beta\end{align*}\]
\[V(\hat{\beta}|X)=V(\hat{\beta}-\beta|X)=V\left(\frac{\sum_{i=1}^n\varepsilon_i(X_i - \bar{X})}{\sum_{i=1}^n(X_i-\bar{X})^2}\bigg|X\right)\\=\frac{1}{\left[\sum_{i=1}^n(X_i - \bar{X})^2\right]^2}V\left(\sum_{i=1}^n\varepsilon_i(X_i - \bar{X})\bigg|X\right)\]
\[V\left(\sum_{i=1}^n\varepsilon_i(X_i - \bar{X})\bigg|X\right)=\sum_{i=1}^n V(\varepsilon_i|X)(X_i-\bar{X})^2\\=V(\varepsilon_i)\sum_{i=1}^n(X_i-\bar{X})^2\] - Therefore: \[V(\hat{\beta}|X)=\frac{V(\varepsilon_i)}{\sum_{i=1}^n(X_i - \bar{X})^2}\]
\[\mbox{CI}(\alpha)=\left[\hat{\beta}-z_{\frac{\alpha}{2}}\times\mbox{standard error of }\hat{\beta},\ \hat{\beta}+z_{\frac{\alpha}{2}}\times\mbox{standard error of }\hat{\beta}\right]\]
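A minimal sketch with simulated data (an assumed data-generating process, for illustration only; the names x_sim, y_sim, and fit_sim are ours) showing that the analytic variance formula reproduces the standard error reported by lm():

set.seed(1234)
x_sim <- rnorm(100)
y_sim <- 1 + 2 * x_sim + rnorm(100, sd = 1.5)
fit_sim <- lm(y_sim ~ x_sim)
sigma2_hat <- sum(residuals(fit_sim)^2) / (100 - 2)     # estimate of V(epsilon)
sqrt(sigma2_hat / sum((x_sim - mean(x_sim))^2))         # analytic standard error of beta-hat
summary(fit_sim)$coefficients["x_sim", "Std. Error"]    # the same value reported by lm()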
women <- read.csv("women.csv")
fit_women <- lm(water ~ reserved, data= women)
summary() returns the estimated values and the relevant statistics; tidy() from the broom package returns the same information as a tibble.
summary(fit_women)
## ## Call: ## lm(formula = water ~ reserved, data = women) ## ## Residuals: ## Min 1Q Median 3Q Max ## -23.991 -14.738 -7.865 2.262 316.009 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 14.738 2.286 6.446 4.22e-10 *** ## reserved 9.252 3.948 2.344 0.0197 * ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 33.45 on 320 degrees of freedom ## Multiple R-squared: 0.01688, Adjusted R-squared: 0.0138 ## F-statistic: 5.493 on 1 and 320 DF, p-value: 0.0197
library(broom)
tidy(fit_women)
## # A tibble: 2 × 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 14.7 2.29 6.45 4.22e-10 ## 2 reserved 9.25 3.95 2.34 1.97e- 2
tidy() can also compute the confidence intervals.
tidy(fit_women, conf.int=TRUE)
## # A tibble: 2 × 7 ## term estimate std.error statistic p.value conf.low conf.high ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 14.7 2.29 6.45 4.22e-10 10.2 19.2 ## 2 reserved 9.25 3.95 2.34 1.97e- 2 1.49 17.0
tidy(fit_women, conf.int=TRUE, conf.level=0.99)
## # A tibble: 2 × 7 ## term estimate std.error statistic p.value conf.low conf.high ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) 14.7 2.29 6.45 4.22e-10 8.81 20.7 ## 2 reserved 9.25 3.95 2.34 1.97e- 2 -0.977 19.5
tidy(fit_minwage, conf.int=TRUE)
## # A tibble: 7 × 7 ## term estimate std.error statistic p.value conf.low conf.high ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 NJ 0.0542 0.0332 1.63 0.103 -0.0111 0.120 ## 2 fullPropBefore 0.169 0.0566 2.98 0.00307 0.0574 0.280 ## 3 wageBefore 0.0813 0.0389 2.09 0.0374 0.00478 0.158 ## 4 chainburgerking -0.116 0.179 -0.646 0.518 -0.467 0.236 ## 5 chainkfc -0.151 0.183 -0.824 0.411 -0.511 0.209 ## 6 chainroys -0.206 0.187 -1.11 0.270 -0.574 0.161 ## 7 chainwendys -0.220 0.188 -1.17 0.243 -0.591 0.150
MPs <- read.csv("MPs.csv")
MPs_labour <- filter(MPs, party == "labour")
MPs_tory <- filter(MPs, party == "tory")
labour_fit1 <- lm(ln.net ~ margin,
data = filter(MPs_labour, margin < 0))
labour_fit2 <- lm(ln.net ~ margin,
data = filter(MPs_labour, margin > 0))
tory_fit1 <- lm(ln.net ~ margin,
data = filter(MPs_tory, margin < 0))
tory_fit2 <- lm(ln.net ~ margin,
data = filter(MPs_tory, margin > 0))
tory_y0 <- augment(tory_fit1, newdata=tibble(margin=0),
interval="confidence", conf.level=0.95)
tory_y1 <- augment(tory_fit2, newdata=tibble(margin=0),
interval="confidence", conf.level=0.95)
tory_y0
## # A tibble: 1 × 4 ## margin .fitted .lower .upper ## <dbl> <dbl> <dbl> <dbl> ## 1 0 12.5 12.1 13.0
tory_y1
## # A tibble: 1 × 4 ## margin .fitted .lower .upper ## <dbl> <dbl> <dbl> <dbl> ## 1 0 13.2 12.8 13.6
labour_y0 <- augment(labour_fit1, newdata=tibble(margin=0),
interval="confidence", conf.level=0.95)
labour_y1 <- augment(labour_fit2, newdata=tibble(margin=0),
interval="confidence", conf.level=0.95)
labour_y0
## # A tibble: 1 × 4 ## margin .fitted .lower .upper ## <dbl> <dbl> <dbl> <dbl> ## 1 0 12.3 12.0 12.6
labour_y1
## # A tibble: 1 × 4 ## margin .fitted .lower .upper ## <dbl> <dbl> <dbl> <dbl> ## 1 0 11.9 11.4 12.5
We use data_grid() from the modelr package to create two data.frames, one with negative and one with positive values of the margin.
y1_range <- data_grid(filter(MPs_tory, margin <= 0), margin)
tory_y0 <- augment(tory_fit1, newdata = y1_range, interval = "confidence")
y2_range <- data_grid(filter(MPs_tory, margin >= 0), margin)
tory_y1 <- augment(tory_fit2, newdata = y2_range, interval = "confidence")
The fitted lines and their confidence bands can be plotted with geom_ribbon().
ggplot() +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_ribbon(aes(x=margin, ymin=.lower, ymax=.upper), data = tory_y0, alpha=0.3) +
geom_line(aes(x=margin, y=.fitted), data = tory_y0) +
geom_ribbon(aes(x=margin, ymin=.lower, ymax=.upper), data = tory_y1, alpha=0.3) +
geom_line(aes(x=margin, y=.fitted), data = tory_y1) +
xlim(-.5, .25) +
labs(x="margin of victory", y="log net wealth") +
theme_classic()
augment() also returns the standard error of the prediction when se_fit is set to TRUE.
tory_y0 <- augment(tory_fit1, newdata=tibble(margin=0),
interval = "confidence", se_fit = TRUE)
tory_y0
## # A tibble: 1 × 5 ## margin .fitted .lower .upper .se.fit ## <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0 12.5 12.1 13.0 0.214
tory_y1 <- augment(tory_fit2, newdata=tibble(margin=0),
interval = "confidence", se_fit = TRUE)
tory_y1
## # A tibble: 1 × 5 ## margin .fitted .lower .upper .se.fit ## <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0 13.2 12.8 13.6 0.192
diff_est <- tory_y1$.fitted - tory_y0$.fitted
diff_est
## [1] 0.6496861
se_diff <- sqrt(tory_y0$.se.fit^2 + tory_y1$.se.fit^2)
se_diff
## [1] 0.2876281
CI <- c(diff_est - se_diff * qnorm(0.975), diff_est + se_diff * qnorm(0.975))
CI
## [1] 0.0859455 1.2134268
z_score <- diff_est/se_diff
p.value <- 2*pnorm(abs(z_score), lower.tail = FALSE)
p.value
## [1] 0.02389759