2023/01/31 (updated: 2023-12-11)
…
…
factorial() para calcular o fatorial de lfactorial() para calcular o log do fatorial.factorial(5)
## [1] 120
factorial(10)
## [1] 3628800
factorial(30)
## [1] 2.652529e+32
lfactorial(5)
## [1] 4.787492
log(120)
## [1] 4.787492
exp(lfactorial(5))
## [1] 120
factorial(365)
## [1] Inf
lfactorial(365)
## [1] 1792.332
library(tidyverse)
birthday <- function(k) {
logdenom <- k * log(365) + lfactorial(365-k)
lognumer <- lfactorial(365)
pr <- 1 - exp(lognumer - logdenom)
pr
}
bday <- tibble(k=1:50, pr=birthday(k)) head(bday)
## # A tibble: 6 × 2 ## k pr ## <int> <dbl> ## 1 1 2.27e-13 ## 2 2 2.74e- 3 ## 3 3 8.20e- 3 ## 4 4 1.64e- 2 ## 5 5 2.71e- 2 ## 6 6 4.05e- 2
ggplot(bday, aes(k, pr)) +
geom_line() +
geom_point() +
scale_y_continuous(limits=c(0,1), breaks=seq(0,1, by=0.1)) +
labs(y="Probability that at least two\n people have the same birthday",
x="Number of people") +
theme_classic()
birthday(23)
## [1] 0.5072972
birthday(50)
## [1] 0.9703736
sample() com o argumento replace definido como TRUE.sample() tem os argumentos x, vetor do qual a amostra será retirada, size número de elementos retirados de \(x\), e prob um vetor com pesos para as probabilidades de escolher cada elemento de x.sample(1:10, 3, replace = TRUE)
## [1] 5 1 6
sample(1:10, 10, replace = TRUE)
## [1] 3 3 4 3 2 5 2 10 9 4
sample(1:10, 10, replace = FALSE)
## [1] 4 6 1 5 7 9 10 2 3 8
unique() para checar se tem números repetidos.set.seed(4444)
k <- 23
sims <- 1000
event <- 0
for (i in 1:sims){
days <- sample(1:365, k, replace = TRUE)
days.unique <- unique(days)
if (length(days.unique) < k) {
event <- event + 1
}
}
event/sims
## [1] 0.511
set.seed(4444)
k <- 23
sims <- 100000
event <- 0
for (i in 1:sims){
days <- sample(1:365, k, replace = TRUE)
days.unique <- unique(days)
if (length(days.unique) < k) {
event <- event + 1
}
}
event/sims
## [1] 0.50781
grupo <- rep(c("H", "M"), 10)
duas.mulheres <- rep(NA, 10000)
for (i in seq_along(duas.mulheres)) {
comite <- sample(grupo, 5, replace = FALSE)
total.mulheres = sum(comite == "M")
duas.mulheres[i] = total.mulheres > 1
}
sum(duas.mulheres)/length(duas.mulheres)
## [1] 0.8483
…
choose() do R, a função lchoose() faz o cálculo com logaritmos.choose(84,6)
## [1] 406481544
FLVoters <- read.csv("FLVoters.csv")
glimpse(FLVoters)
## Rows: 10,000 ## Columns: 6 ## $ surname <chr> "PIEDRA", "LYNCH", "CHESTER", "LATHROP", "HUMMEL", "CHRISTISON… ## $ county <int> 115, 115, 115, 115, 115, 115, 115, 115, 1, 1, 115, 115, 115, 1… ## $ VTD <int> 66, 13, 103, 80, 8, 55, 84, 48, 41, 39, 26, 45, 11, 48, 22, 88… ## $ age <int> 58, 51, 63, 54, 77, 49, 77, 34, 56, 60, 44, 45, 80, 83, 88, 55… ## $ gender <chr> "f", "m", "m", "m", "f", "m", "f", "f", "f", "m", "m", "f", "m… ## $ race <chr> "white", "white", NA, "white", "white", "white", "white", "whi…
FLVoters <- FLVoters %>% na.omit() dim(FLVoters)
## [1] 9113 6
margin_race <- FLVoters %>% count(race) %>% mutate(prop = n/sum(n))
margin_race
## race n prop ## 1 asian 175 0.019203336 ## 2 black 1194 0.131021617 ## 3 hispanic 1192 0.130802151 ## 4 native 29 0.003182267 ## 5 other 310 0.034017338 ## 6 white 6213 0.681773291
margin_gender <- FLVoters %>% count(gender) %>% mutate(prop = n/sum(n)) margin_gender
## gender n prop ## 1 f 4883 0.5358279 ## 2 m 4230 0.4641721
margin_race_f <- FLVoters %>% filter(gender == "f") %>% count(race) %>% mutate(prop = n/sum(n)) margin_race_f
## race n prop ## 1 asian 83 0.016997747 ## 2 black 678 0.138849068 ## 3 hispanic 666 0.136391563 ## 4 native 17 0.003481466 ## 5 other 158 0.032357157 ## 6 white 3281 0.671922998
margin_race_m <- FLVoters %>% filter(gender == "m") %>% count(race) %>% mutate(prop = n/sum(n)) margin_race_m
## race n prop ## 1 asian 92 0.021749409 ## 2 black 516 0.121985816 ## 3 hispanic 526 0.124349882 ## 4 native 12 0.002836879 ## 5 other 152 0.035933806 ## 6 white 2932 0.693144208
joint_p <- FLVoters %>% count(gender, race) %>% mutate(prop = n/sum(n))
## gender race n prop ## 1 f asian 83 0.009107868 ## 2 f black 678 0.074399210 ## 3 f hispanic 666 0.073082410 ## 4 f native 17 0.001865467 ## 5 f other 158 0.017337869 ## 6 f white 3281 0.360035115 ## 7 m asian 92 0.010095468 ## 8 m black 516 0.056622408 ## 9 m hispanic 526 0.057719741 ## 10 m native 12 0.001316800 ## 11 m other 152 0.016679469 ## 12 m white 2932 0.321738176
pivot_wider() para criar uma coluna para cada distribuição conjunta e depois somamos as duas.joint_p %>% select(-n) %>% pivot_wider(names_from = gender, values_from = prop)
## # A tibble: 6 × 3 ## race f m ## <chr> <dbl> <dbl> ## 1 asian 0.00911 0.0101 ## 2 black 0.0744 0.0566 ## 3 hispanic 0.0731 0.0577 ## 4 native 0.00187 0.00132 ## 5 other 0.0173 0.0167 ## 6 white 0.360 0.322
joint_p %>% select(-n) %>% pivot_wider(names_from = gender, values_from = prop) %>% mutate(total_prop = f + m)
## # A tibble: 6 × 4 ## race f m total_prop ## <chr> <dbl> <dbl> <dbl> ## 1 asian 0.00911 0.0101 0.0192 ## 2 black 0.0744 0.0566 0.131 ## 3 hispanic 0.0731 0.0577 0.131 ## 4 native 0.00187 0.00132 0.00318 ## 5 other 0.0173 0.0167 0.0340 ## 6 white 0.360 0.322 0.682
rowSums(), que soma as linhas de um data.frame, across(), que permite selecionar colunas com a mesma semântica da função select(), e where(), que seleciona as variáveis para as quais a condição estabelecida é verdadeira nos ajudarão a fazer a soma.joint_p %>% select(-n) %>% pivot_wider(names_from = race, values_from = prop) %>% mutate(total_prop = rowSums(across(where(is.numeric))))
## # A tibble: 2 × 8 ## gender asian black hispanic native other white total_prop ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 f 0.00911 0.0744 0.0731 0.00187 0.0173 0.360 0.536 ## 2 m 0.0101 0.0566 0.0577 0.00132 0.0167 0.322 0.464
case_when(), como fizemos em outros exemplos, ou com a função cut().cut() diz para o R que o valor da fronteira superior deve ficar no grupo inferior, ou seja, o intervalo é fechado a direita.FLVoters <- FLVoters %>%
mutate(age_group = cut(age, breaks = c(0,20,40,60,Inf),
right = TRUE,
labels = c("<= 20", "20-40", "40-60", "> 60")))
cut() cria a variável como fator.glimpse(FLVoters)
## Rows: 9,113 ## Columns: 7 ## $ surname <chr> "PIEDRA", "LYNCH", "LATHROP", "HUMMEL", "CHRISTISON", "HOMAN… ## $ county <int> 115, 115, 115, 115, 115, 115, 115, 1, 1, 115, 115, 115, 115,… ## $ VTD <int> 66, 13, 80, 8, 55, 84, 48, 41, 39, 26, 45, 11, 48, 88, 25, 8… ## $ age <int> 58, 51, 54, 77, 49, 77, 34, 56, 60, 44, 45, 80, 83, 55, 33, … ## $ gender <chr> "f", "m", "m", "f", "m", "f", "f", "f", "m", "m", "f", "m", … ## $ race <chr> "white", "white", "white", "white", "white", "white", "white… ## $ age_group <fct> 40-60, 40-60, 40-60, > 60, 40-60, > 60, 20-40, 40-60, 40-60,…
joint3 <- FLVoters %>% count(race, age_group, gender) %>% mutate(prop = n/sum(n))
-Qual a probabilidade de ser mulher e afro-americana dado que tem mais de 60 anos? - Para responder essa pergunta precisamos dividir a probabilidade conjunta, que está em joint3, pela probabilidade marginal de ter mais de 60.
margin_age <- FLVoters %>% count(age_group) %>% mutate(margin_age = n/sum(n)) %>% select(-n) margin_age
## age_group margin_age ## 1 <= 20 0.01766707 ## 2 20-40 0.27093164 ## 3 40-60 0.36047405 ## 4 > 60 0.35092725
joint3 <- left_join(joint3, margin_age, by="age_group") %>% mutate(con_prob_age = prop/margin_age) filter(joint3, race == "black", gender == "f", age_group == "> 60") %>% select(con_prob_age)
## con_prob_age ## 1 0.05378361
joint2 <- FLVoters %>% count(age_group, gender) %>% mutate(prob_age_gender = n/sum(n)) %>% select(-n) joint2
## age_group gender prob_age_gender ## 1 <= 20 f 0.009656535 ## 2 <= 20 m 0.008010534 ## 3 20-40 f 0.143092286 ## 4 20-40 m 0.127839350 ## 5 40-60 f 0.189838692 ## 6 40-60 m 0.170635356 ## 7 > 60 f 0.193240426 ## 8 > 60 m 0.157686821
joint3 <- left_join(joint3, joint2, by=c("age_group", "gender")) %>%
mutate(con_prob_race = prop/prob_age_gender)
filter(joint3, gender == "f", age_group == "> 60", race == "black") %>%
select(con_prob_race)
## con_prob_race ## 1 0.09767178
margin_race <- margin_race %>%
select(race, prob_race = prop)
margin_gender <- margin_gender %>%
select(gender, prob_gender = prop)
race_gender_indep <- tidyr::crossing(margin_race, margin_gender) %>%
left_join(select(joint_p, gender, race, prob_joint=prop),
by = c("gender", "race")) %>%
mutate(prob_indp = prob_race * prob_gender)
race_gender_indep %>%
filter(gender == "f") %>%
ggplot(aes(prob_indp, prob_joint)) +
geom_abline(intercept=0, slope=1, color = "black", linewidth = 0.5) +
geom_point(shape=1, size=2) +
coord_fixed() +
labs(x= "P(race)*P(gender)",
y="P(race e gender)") +
theme_classic()
\[P(\mbox{A e B e C})=P(A)\times P(B)\times P(C)\] - A independência pode ser definida em termos condicionais. \[P(\mbox{A e B|C})=P(\mbox{A|C})\times P(\mbox{B|C})\]
margin_age <- margin_age %>%
select(age_group, prob_age=margin_age)
joint_indep <- tidyr::crossing(margin_age, margin_gender, margin_race) %>%
mutate(indep_prob = prob_race * prob_age * prob_gender) %>%
left_join(select(joint3, race, age_group, gender, prob_joint=prop),
by=c("gender", "age_group", "race"))
joint_indep %>%
filter(age_group == "> 60", gender == "f") %>%
ggplot(aes(prob_joint, indep_prob)) +
geom_abline(intercept=0, slope = 1, color="black", linewidth=0.5) +
geom_point(shape=1, size=2) +
coord_fixed() +
labs(title = "Joint independence",
x="P(race e 60+ e female)",
y="P(race)*P(60+)*P(female)") +
theme_classic()
#raça e idade dado gênero
cond_prob_gender <- left_join(select(joint3, race, age_group, gender, joint_prob = prop),
margin_gender, by=c("gender")) %>%
mutate(cond_prob_gender = joint_prob/prob_gender)
#raça dado gênero
cond_prob_race_gender <- left_join(select(joint_p, race, gender, prob_race_gender=prop),
margin_gender, by=c("gender")) %>%
mutate(cond_prob_race_gender = prob_race_gender/prob_gender) %>%
select(race, gender, cond_prob_race_gender)
#idade dado gênero
cond_prob_age_gender <- left_join(select(joint2, age_group, gender, prob_age_gender),
margin_gender, by=c("gender")) %>%
mutate(cond_prob_age_gender = prob_age_gender/prob_gender) %>%
select(age_group, gender, cond_prob_age_gender)
#probabilidade conjunta de raça e idade
indep_cond_gender <- full_join(cond_prob_race_gender, cond_prob_age_gender,
by = c("gender")) %>%
mutate(indep_prob = cond_prob_race_gender * cond_prob_age_gender)
#juntar probabilidades conjuntas com as condicionais
master <- left_join(cond_prob_gender, indep_cond_gender,
by = c("gender", "age_group", "race"))
master %>%
filter(age_group == "> 60", gender == "f") %>%
ggplot(aes(cond_prob_gender, indep_prob)) +
geom_abline(intercept = 0, slope = 1, color="black", linewidth=0.5) +
geom_point(shape=1, size=2) +
coord_fixed() +
labs(title = "Marginal independence",
x = "P(race and 60+ | female)",
y="P(race | female) * P(60+ | female)") +
theme_classic()
\[P(\mbox{prêmio na primeira}) = \frac{1}{3}\] \[P(\mbox{pegadinha na primeira}) = \frac{2}{3}\]
Se a estratégia for não trocar, temos: \[P(\mbox{prêmio}|\mbox{prêmio na primeira})=1\] \[P(\mbox{prêmio}|\mbox{pegadinha na primeira})=0\]
Logo, com a estratégia de não trocar: \[P(\mbox{prêmio}) = 1\times \frac{1}{3} + 0\times \frac{2}{3} = \frac{1}{3}\]
Se a estratégia for trocar, temos: \[P(\mbox{prêmio}|\mbox{prêmio na primeira})=0\] \[P(\mbox{prêmio}|\mbox{pegadinha na primeira})=1\]
Logo, com a estratégia de não trocar: \[P(\mbox{prêmio}) = 0\times \frac{1}{3} + 1\times \frac{2}{3} = \frac{2}{3}\]
sims <- 1000
portas <- c("pegadinha", "pegadinha", "premio")
result.trocar <- result.naotrocar <- rep(NA, sims)
for (i in 1:sims){
#escolha inicial
primeira.escolha <- sample(1:3, size=1)
result.naotrocar[i] <- portas[primeira.escolha]
restante <- portas[-primeira.escolha]
#Mallandro abre uma porta com pegadinha
if(portas[primeira.escolha]=="premio") #escolhe uma das duas com pegadinha
mallandro <- sample(1:2, size=1)
else #só sobrou uma com pegadinha
mallandro <- (1:2)[restante == "pegadinha"]
result.trocar[i] <- restante[-mallandro]
}
mean(result.naotrocar == "premio")
## [1] 0.308
mean(result.trocar == "premio")
## [1] 0.692
cnames <- read.csv("cnames.csv")
glimpse(cnames)
## Rows: 151,671 ## Columns: 7 ## $ surname <chr> "SMITH", "JOHNSON", "WILLIAMS", "BROWN", "JONES", "MILLER"… ## $ count <int> 2376206, 1857160, 1534042, 1380145, 1362755, 1127803, 1072… ## $ pctwhite <dbl> 73.342666, 61.550000, 48.520000, 60.716072, 57.690000, 85.… ## $ pctblack <dbl> 22.217778, 33.800000, 46.720000, 34.543454, 37.730000, 10.… ## $ pctapi <dbl> 0.399960, 0.420000, 0.370000, 0.410041, 0.350000, 0.419958… ## $ pcthispanic <dbl> 1.559844, 1.500000, 1.600000, 1.640164, 1.440000, 1.429857… ## $ pctothers <dbl> 2.479752, 2.730000, 2.790000, 2.690269, 2.790000, 1.939806…
most_likely_race <- cnames %>%
select(-count) %>%
pivot_longer(names_to = "race_pred", values_to = "highest_pct", -surname) %>%
mutate(race_pred = str_replace(race_pred, "pct", "")) %>%
group_by(surname) %>%
filter(row_number(desc(highest_pct))==1) %>%
ungroup()
cnames <- full_join(cnames, most_likely_race, by=c("surname"))
FLVotersJoin <- FLVoters %>% inner_join(cnames, by="surname")
unique(FLVotersJoin$race)
## [1] "white" "other" "black" "asian" "hispanic" "native"
unique(FLVotersJoin$race_pred)
## [1] "hispanic" "white" "black" "others" "api"
FLVotersJoin <- FLVotersJoin %>%
mutate(race = recode(race, "native"="other"),
race_pred = recode(race_pred, "api"="asian", "others"="other"))
unique(FLVotersJoin$race)
## [1] "white" "other" "black" "asian" "hispanic"
unique(FLVotersJoin$race_pred)
## [1] "hispanic" "white" "black" "other" "asian"
race_tp <- FLVotersJoin %>% group_by(race) %>% summarize(tp = mean(race == race_pred)) %>% arrange(desc(tp))
race_tp
## # A tibble: 5 × 2 ## race tp ## <chr> <dbl> ## 1 white 0.950 ## 2 hispanic 0.847 ## 3 asian 0.564 ## 4 black 0.160 ## 5 other 0.00361
race_fp <- FLVotersJoin %>% group_by(race) %>% summarize(fp = mean(race != race_pred)) %>% arrange(desc(fp))
race_fp
## # A tibble: 5 × 2 ## race fp ## <chr> <dbl> ## 1 other 0.996 ## 2 black 0.840 ## 3 asian 0.436 ## 4 hispanic 0.153 ## 5 white 0.0498
FLCensus <- read.csv("FLCensus.csv")
FLCensus <- FLCensus %>%
rename("asian"="api", "other" = "others")
weighted.mean() que calcula médias ponderadas.race_prop <- FLCensus %>% select(total.pop, white, black, asian, hispanic, other) %>% pivot_longer(names_to = "race", values_to = "pct", -total.pop) %>% group_by(race) %>% summarize(race_prop = weighted.mean(pct, w=total.pop)) %>% arrange(desc(race_prop))
race_prop
## # A tibble: 5 × 2 ## race race_prop ## <chr> <dbl> ## 1 white 0.579 ## 2 hispanic 0.225 ## 3 black 0.152 ## 4 asian 0.0242 ## 5 other 0.0206
FLVotersJoin <- left_join(FLVotersJoin, race_prop, by="race")
total.count <- sum(cnames$count)
cnames_reshape <- cnames %>%
select(-race_pred, -highest_pct) %>%
pivot_longer(names_to = "race", values_to = "pct", cols = starts_with("pct")) %>%
mutate(race = str_replace(race, "pct", ""),
race = recode(race, "api"="asian", "others"="other"),
race_surname = pct/100) %>%
select(-pct) %>%
left_join(race_prop, by="race") %>%
mutate(prob_surname_race = race_surname * (count/total.count)/race_prop) %>%
rename("posib_race" = race)
merge_temp <- FLCensus %>%
pivot_longer(names_to = "pop_race",
values_to = "pop_pct",
cols = c(white, black, hispanic, asian, other)) %>%
inner_join(FLVoters, by=c("county", "VTD")) %>%
inner_join(cnames_reshape, by=c("surname", "pop_race" = "posib_race")) %>%
mutate(race_residence = prob_surname_race * pop_pct)
name_residence <- merge_temp %>% group_by(surname, county, VTD) %>% summarize(name_residence = sum(race_residence))
## `summarise()` has grouped output by 'surname', 'county'. You can override using ## the `.groups` argument.
FLVoters_full <- merge_temp %>%
inner_join(name_residence, by=c("surname", "county", "VTD")) %>%
mutate(pred_race = prob_surname_race * pop_pct/name_residence) %>%
select(surname, pop_race, race, pred_race, county, VTD)
FL_updated <- FLVoters_full %>% ungroup() %>% group_by(surname, county, VTD) %>% filter(row_number(desc(pred_race))==1) %>% ungroup() race_tp_new <- FL_updated %>% group_by(race) %>% summarize(tp = mean(race == pop_race)) %>% arrange(desc(tp))
## # A tibble: 6 × 2 ## race tp ## <chr> <dbl> ## 1 white 0.942 ## 2 hispanic 0.857 ## 3 black 0.628 ## 4 asian 0.604 ## 5 other 0.00797 ## 6 native 0
cnames %>% filter(surname == "WHITE") %>% pull()
## [1] 67.91
FLVoters_full %>% filter(surname == "WHITE", pop_race == "black") %>% select(pred_race) %>% summary()
## pred_race ## Min. :0.004588 ## 1st Qu.:0.072232 ## Median :0.159496 ## Mean :0.250711 ## 3rd Qu.:0.293640 ## Max. :0.981864
race_fp_new <- FL_updated %>% group_by(pop_race) %>% summarize(fp = mean(race != pop_race)) %>% arrange(desc(fp))
race_fp_new
## # A tibble: 5 × 2 ## pop_race fp ## <chr> <dbl> ## 1 other 0.778 ## 2 asian 0.333 ## 3 black 0.220 ## 4 hispanic 0.212 ## 5 white 0.122
dunif() calcula a PDF de uma distribuição uniforme (não é bem isso…).dunif(0.5, min=0, max=1)
## [1] 1
dunif(1, min=-2, max=2)
## [1] 0.25
punif() calcula a CDF de uma distribuição uniforme.punif(1, min=-2, max=2)
## [1] 0.75
punif(0, min=-1, max=1)
## [1] 0.5
punif(0.6, min=0, max=1)
## [1] 0.6
punif(0.3, min=0, max=1)
## [1] 0.3
punif(0.8, min=0, max=1)
## [1] 0.8
runif() retorna realizações de uma variável aleatória uniforme.sims <- 1000 x <- runif(sims, min=0, max=1) head(x, n=10)
## [1] 0.01920359 0.40652059 0.56208492 0.95918199 0.77801306 0.07608611 ## [7] 0.33794735 0.74663095 0.53016555 0.78382456
runif()e o resultado que em uma distribuição uniforme entre zero e um a probabilidade de obter um valor menor do que \(x\) é igual a \(x\) para \(0 \leq x \leq 1\) permitem simular uma variável aleatória Bernoulli.p <- 0.5 y <- as.integer(x <= p) head(y, n=10)
## [1] 1 1 0 0 0 1 1 0 0 0
mean(y)
## [1] 0.507
sims <- 1000 x <- runif(sims, min=0, max=1) head(x, n=10)
## [1] 0.2610260 0.6118536 0.9440752 0.9351696 0.4937785 0.3668355 0.8740632 ## [8] 0.9958626 0.5702721 0.5810730
p <- 0.3 y <- as.integer(x <= p) head(y, n=10)
## [1] 1 0 0 0 0 0 0 0 0 0
mean(y)
## [1] 0.31
dbinom() retorna a PMF (probabilidade) de uma binomialdbinom(2, size=3, prob=0.5)
## [1] 0.375
dbinom(0:3, size=3, prob=0.5)
## [1] 0.125 0.375 0.375 0.125
pbinom() retorna a CDF de um binomialpbinom(2, size=3, prob=0.5)
## [1] 0.875
pbinom(0:3, size=3, prob=0.5)
## [1] 0.125 0.500 0.875 1.000
\[P(-k \leq Z \leq k)=P(Z \leq k) - P(Z \leq -k)= F(k)-F(-k)\]
pnorm(1)-pnorm(-1)
## [1] 0.6826895
pnorm(2)-pnorm(-2)
## [1] 0.9544997
mu <- 5 sigma <- 2 pnorm(mu+sigma, mean=mu, sd = sigma) - pnorm(mu-sigma, mean=mu, sd=sigma)
## [1] 0.6826895
pnorm(mu+2*sigma, mean=mu, sd = sigma) - pnorm(mu-2*sigma, mean=mu, sd=sigma)
## [1] 0.9544997
pres08 <- read.csv("pres08.csv")
pres12 <- read.csv("pres12.csv")
pres <- full_join(pres08, pres12, by="state")
pres <- pres %>%
mutate(Obama2008.z = as.numeric(scale(Obama.x)),
Obama2012.z = as.numeric(scale(Obama.y)))
fit1 <- lm(Obama2012.z ~ -1+Obama2008.z, data=pres) fit1
## ## Call: ## lm(formula = Obama2012.z ~ -1 + Obama2008.z, data = pres) ## ## Coefficients: ## Obama2008.z ## 0.9834
e <- resid(fit1)
e.zscore <- scale(e)
hist(z.score, freq=FALSE, ylim=c(0,4),
xlab = "Standardized residuals",
main = "Distribution of standardized residuals")
x <- seq(from=-3, to=3, by=0.01)
lines(x, dnorm(x))
qqnorm(e.zscore, xlim=c(-3,3), ylim=c(-3,3)) abline(0,1)
mean(e)
## [1] -1.093832e-16
e.sd <- sd(e) e.sd
## [1] 0.1812239
(61-mean(pres$Obama.x))/sd(pres$Obama.x)
## [1] 0.8720631
dnorm() responde nossa pergunta. Como a função retorna a probabilidade da variável aleatória ser menor do que \(x\) e queremos a probabilidade de ser maior vamos calcular 1 menos a probabilidade de ser menor que \(x\), ou, mais fácil, usar o argumento lower.tail como falso na função.CA.2008 <- filter(pres, state == "CA") %>% select(Obama2008.z) %>% pull() CA.mean2012 <- coef(fit1) * CA.2008 CA.mean2012
## Obama2008.z ## 0.8576233
1 - pnorm(CA.2008, mean=CA.mean2012, sd=e.sd)
## [1] 0.4682463
pnorm(CA.2008, mean=CA.mean2012, sd=e.sd, lower.tail = FALSE)
## [1] 0.4682463
TX.2008 <- filter(pres, state == "TX") %>% select(Obama2008.z) %>% pull() TX.mean2012 <- coef(fit1) * TX.2008 TX.mean2012
## Obama2008.z ## -0.6567543
pnorm(TX.2008, mean=TX.mean2012, sd=e.sd, lower.tail = FALSE)
## [1] 0.5243271
\[V(X)=E(X^2)-[E(X)]^2=\int_a^b \frac{x^2}{b-a} - \left(\frac{a+b}{2} \right)^2\\=\frac{x^3}{3(b-a)}\Bigg|_a^b-\left(\frac{a+b}{2} \right)^2=\frac{1}{12}(b-a)^2\]
p <- 0.5 p * (1-p)
## [1] 0.25
y <- runif(500, 0,1) >= p var(y)
## [1] 0.2502445
a <- 1 b <- 5 (1/12)*(b-a)^2
## [1] 1.333333
var(runif(500, min=1, max=5))
## [1] 1.321897
pres08 <- read.csv("pres08.csv")
pres08 <- pres08 %>%
mutate(p = Obama/(Obama + McCain))
rbinom() com \(n=1000\) e \(p_j\) dado por um vetor com as probabilidades de cada estado.rbinom(2, 1000, prob = c(0.1, 0.9))
## [1] 80 893
sim_election <- function(.id, df, n_draws=1000){
mutate(df, draws = rbinom(n(), n_draws, p)) %>%
filter(draws > n_draws/2) %>%
summarize(EV=sum(EV), .id=.id)
}
sim_election(1, pres08, n_draws = 1000)
## EV .id ## 1 364 1
sim_election(2, pres08, n_draws = 1000)
## EV .id ## 1 353 2
map_df() do pacote purr que é parte do tidyverse.sims <- 10000 sim_results <- map_df(seq_len(sims), ~ sim_election(.x, pres08, n_draws = 1000))
head(sim_results)
## EV .id ## 1 326 1 ## 2 375 2 ## 3 367 3 ## 4 364 4 ## 5 364 5 ## 6 338 6
sim_results %>% ggplot(aes(EV, y= after_stat(density))) + geom_histogram(binwidth = 12) + geom_vline(xintercept = 364, color = "blue", linewidth=0.5) + labs(x="Obama's Electoral College votes", y="Density") + theme_classic()
sim_results %>%
select(EV) %>%
summarise_all(list(mean = mean,
median = median,
var = var,
sd = sd,
max = max,
min=min))
## mean median var sd max min ## 1 352.3853 353 270.0883 16.43436 393 285
pbinom(), como queremos maior ou igual uaremos o argumento lower.tail como falso.n_draws <- 1000 pres08 %>% mutate(pb = pbinom(n_draws/2, size=n_draws, prob=p, lower.tail=FALSE)) %>% summarise(mean = sum(pb*EV))
## mean ## 1 352.1388
pres08 %>%
mutate(pb = pbinom(n_draws/2, size=n_draws, prob=p, lower.tail=FALSE)) %>%
summarize(V = sum(pb*(1-pb)*EV^2),
sd = sqrt(V))
## V sd ## 1 268.8008 16.39515
cumsum() que calcula a soma cumulativa de um vetor.cumsum(c(1,2,3,4))
## [1] 1 3 6 10
cumsum(1:10)
## [1] 1 3 6 10 15 21 28 36 45 55
sims <- 5000
p <- 0.2
size <- 10
lln_bin <- tibble(n = seq_len(sims),
x = rbinom(sims, prob=p, size=size),
mean = cumsum(x)/n,
distrib = str_c("Binomial (", size, ", ", p, ")"))
head(lln_bin, n=10)
## # A tibble: 10 × 4 ## n x mean distrib ## <int> <int> <dbl> <chr> ## 1 1 1 1 Binomial (10, 0.2) ## 2 2 3 2 Binomial (10, 0.2) ## 3 3 2 2 Binomial (10, 0.2) ## 4 4 4 2.5 Binomial (10, 0.2) ## 5 5 4 2.8 Binomial (10, 0.2) ## 6 6 3 2.83 Binomial (10, 0.2) ## 7 7 1 2.57 Binomial (10, 0.2) ## 8 8 1 2.38 Binomial (10, 0.2) ## 9 9 2 2.33 Binomial (10, 0.2) ## 10 10 2 2.3 Binomial (10, 0.2)
tail(lln_bin, n=10)
## # A tibble: 10 × 4 ## n x mean distrib ## <int> <int> <dbl> <chr> ## 1 4991 2 1.99 Binomial (10, 0.2) ## 2 4992 4 1.99 Binomial (10, 0.2) ## 3 4993 2 1.99 Binomial (10, 0.2) ## 4 4994 4 1.99 Binomial (10, 0.2) ## 5 4995 2 1.99 Binomial (10, 0.2) ## 6 4996 3 1.99 Binomial (10, 0.2) ## 7 4997 2 1.99 Binomial (10, 0.2) ## 8 4998 3 1.99 Binomial (10, 0.2) ## 9 4999 3 1.99 Binomial (10, 0.2) ## 10 5000 1 1.99 Binomial (10, 0.2)
lln_bin %>% ggplot() + geom_line(aes(x=n, y=mean)) + geom_hline(yintercept = 2, lty= "dashed") + labs(title = "Binomial (10, 0,2)", x="Sample size", y="Sample mean") + theme_classic()
lln_unif <- tibble(n = seq_len(sims),
x = runif(sims),
mean = cumsum(x)/n,
distrib = str_c("Uniforme (0, 1)"))
head(lln_unif, n=10)
## # A tibble: 10 × 4 ## n x mean distrib ## <int> <dbl> <dbl> <chr> ## 1 1 0.679 0.679 Uniforme (0, 1) ## 2 2 0.143 0.411 Uniforme (0, 1) ## 3 3 0.166 0.329 Uniforme (0, 1) ## 4 4 0.884 0.468 Uniforme (0, 1) ## 5 5 0.557 0.486 Uniforme (0, 1) ## 6 6 0.220 0.441 Uniforme (0, 1) ## 7 7 0.358 0.430 Uniforme (0, 1) ## 8 8 0.230 0.405 Uniforme (0, 1) ## 9 9 0.838 0.453 Uniforme (0, 1) ## 10 10 0.538 0.461 Uniforme (0, 1)
tail(lln_unif, n=10)
## # A tibble: 10 × 4 ## n x mean distrib ## <int> <dbl> <dbl> <chr> ## 1 4991 0.544 0.499 Uniforme (0, 1) ## 2 4992 0.847 0.499 Uniforme (0, 1) ## 3 4993 0.338 0.499 Uniforme (0, 1) ## 4 4994 0.272 0.499 Uniforme (0, 1) ## 5 4995 0.0438 0.499 Uniforme (0, 1) ## 6 4996 0.853 0.499 Uniforme (0, 1) ## 7 4997 0.890 0.499 Uniforme (0, 1) ## 8 4998 0.335 0.499 Uniforme (0, 1) ## 9 4999 0.0394 0.499 Uniforme (0, 1) ## 10 5000 0.268 0.499 Uniforme (0, 1)
lln_unif %>% ggplot() + geom_line(aes(x=n, y=mean)) + geom_hline(yintercept = 0.5, lty= "dashed") + labs(title = "Uniforme (0, 1)", x="Sample size", y="Sample mean") + theme_classic()
sims <- 1000
n.samp <- 1000
z.binom <- z.unif <- rep(NA, sims)
for (i in 1:sims){
x <- rbinom(n.samp, p=0.2, size=10)
z.binom[i] <- (mean(x)-2)/sqrt(1.6/n.samp)
x <- runif(n.samp, min=0, max=1)
z.unif[i] <- (mean(x)-0.5)/sqrt(1/(12*n.samp))
}
results <- tibble(z.binom = z.binom,
z.unif = z.unif,
n.samp = seq(1:n.samp))
results %>%
ggplot() +
geom_histogram(aes(x=z.binom, y=after_stat(density)), bins=20) +
stat_function(fun = dnorm, color = "blue") +
labs(title = "Binomial (0,2, 10)",
x="z-score", y="Density") +
theme_classic()
results %>%
ggplot() +
geom_histogram(aes(x=z.unif, y=after_stat(density)), bins=20) +
stat_function(fun = dnorm, color = "blue") +
labs(title = "Uniforme (0, 1)",
x="z-score", y="Density") +
theme_classic()