2023/01/31 (updated: 2023-12-08)
…
for (i in X) {
  expression 1
  expression 2
  ...
  expression N
}
rep() is used to create a vector of NAs that will hold the results.
values <- c(2,4,6)
values
## [1] 2 4 6
n <- length(values)
n
## [1] 3
results <- rep(NA,n)
results
## [1] NA NA NA
str_c() from the stringr package combines multiple objects into a single character vector. str_c() is similar to base R's paste0(), but fits better with the tidyverse framework. We will also use seq_along() to enumerate the iterations.
library(stringr)
x1 <- "Roberto"
x2 <- "Ellery"
str_c(x1,x2, sep = " ")
## [1] "Roberto Ellery"
x1 <- "ECO" x2 <- "UnB" str_c(x1,x2, sep = "/")
## [1] "ECO/UnB"
seq_along(values)
## [1] 1 2 3
seq_along(c("a", "b", "c", "d"))
## [1] 1 2 3 4
for (i in seq_along(values)) {
results[i] = values[i] * 2
print(str_c(values[i], " vezes 2 é igual a ", results[i]))
}
## [1] "2 vezes 2 é igual a 4" ## [1] "4 vezes 2 é igual a 8" ## [1] "6 vezes 2 é igual a 12"
results
## [1] 4 8 12
values * 2
## [1] 4 8 12
print() can be used to report what is happening at each iteration, which helps locate where an error occurs. To illustrate, let us first create a data.frame with fictitious data.
data <- data.frame("a" = 1:2, "b" = c("hi", "hey"), "c"=3:4)
data
##   a   b c
## 1 1  hi 3
## 2 2 hey 4
results <- rep(NA, 3)
for (i in seq_along(data)) {
print(str_c("iteration ", i))
results[i] <- median(data[,i])
}
results
results <- rep(NA, 3)
for (i in seq_along(data)) {
print(str_c("iteration ", i))
results[i] <- median(data[,i])
}
## [1] "iteration 1" ## [1] "iteration 2"
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
## argumento não é numérico nem lógico: retornando NA
## [1] "iteration 3"
results
## [1] 1.5 NA 3.5
if_else() and case_when() both perform a task when a condition is true. In this unit we cover the if () {} and if () {} else {} statements, which make a block of code conditional on an expression being true (a brief reminder of the vectorized alternatives follows the template below).
if (X) {
  expression 1
  expression 2
  ...
  expression N
}
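As a quick reminder of the vectorized alternatives mentioned above, if_else() and case_when() from dplyr apply a condition to an entire vector at once; a minimal sketch (the vector x is invented for illustration):
library(dplyr)
x <- c(1, 2, 3, 4)
if_else(x %% 2 == 0, "even", "odd")   # one result per element of x
case_when(
  x < 2 ~ "small",                     # first matching condition wins
  x < 4 ~ "medium",
  TRUE  ~ "large"                      # catch-all
)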
operation <- "add"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
}
if (operation == "multiply") {
print("fazer multiplicação 4 * 4")
4 * 4
}
operation <- "add"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
}
## [1] "fazer adição 4 + 4"
## [1] 8
if (operation == "multiply") {
print("fazer multiplicação 4 * 4")
4 * 4
}
if (X) {
  expression 1a
  expression 2a
  ...
  expression Na
} else {
  expression 1b
  expression 2b
  ...
  expression Nb
}
operation <- "multiply"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
} else {
print("fazer multiplicação 4 * 4")
4 * 4
}
operation <- "multiply"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
} else {
print("fazer multiplicação 4 * 4")
4 * 4
}
## [1] "fazer multiplicação 4 * 4"
## [1] 16
if (X) {
  expression 1a
  ...
  expression Na
} else if (Y) {
  expression 1b
  ...
  expression Nb
} else {
  expression 1c
  ...
  expression Nc
}
operation <- "subtract"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
} else if (operation == "multiply") {
print("fazer multiplicação 4 * 4")
4 * 4
} else {
print(str_c("`", operation, "´ é inválido. Use `add´ ou `multiply´."))
}
operation <- "subtract"
if (operation == "add") {
print("fazer adição 4 + 4")
4 + 4
} else if (operation == "multiply") {
print("fazer multiplicação 4 * 4")
4 * 4
} else {
print(str_c("`", operation, "´ é inválido. Use `add´ ou `multiply´."))
}
## [1] "`subtract´ é inválido. Use `add´ ou `multiply´."
values <- 1:5
n <- length(values)
results <- rep(NA, n)
for (i in seq_along(values)) {
x <- values[i]
r <- x %% 2
if (r==0) {
print(str_c(x, " é par, operação é adição: ", x, "+", x))
results[i] <- x + x
} else {
print(str_c(x, " é ímpar, operação é multiplicação: ", x, "*", x))
results[i] <- x * x
}
}
results
## [1] "1 é ímpar, operação é multiplicação: 1*1" ## [1] "2 é par, operação é adição: 2+2" ## [1] "3 é ímpar, operação é multiplicação: 3*3" ## [1] "4 é par, operação é adição: 4+4" ## [1] "5 é ímpar, operação é multiplicação: 5*5"
## [1] 1 4 9 8 25
library(tidyverse)
pres08 <- read.csv("pres08.csv")
polls08 <- read.csv("polls08.csv")
pres08 <- pres08 %>%
mutate(margin = Obama - McCain)
polls08 <- polls08 %>%
mutate(margin = Obama - McCain)
head(pres08)
##   state.name state Obama McCain EV margin
## 1    Alabama    AL    39     60  9    -21
## 2     Alaska    AK    38     59  3    -21
## 3    Arizona    AZ    45     54 10     -9
## 4   Arkansas    AR    39     59  6    -20
## 5 California    CA    61     37 55     24
## 6   Colorado    CO    54     45  9      9
head(polls08)
##   state         Pollster Obama McCain    middate margin
## 1    AL      SurveyUSA-2    36     61 2008-10-27    -25
## 2    AL Capital Survey-2    34     54 2008-10-15    -20
## 3    AL      SurveyUSA-2    35     62 2008-10-08    -27
## 4    AL Capital Survey-2    35     55 2008-10-06    -20
## 5    AL      Rasmussen-1    39     60 2008-09-22    -21
## 6    AL      SurveyUSA-2    34     64 2008-09-16    -30
library(lubridate)
x <- ymd("2008-11-04")
y <- ymd("2008/9/1")
class(x)
## [1] "Date"
class(y)
## [1] "Date"
x
## [1] "2008-11-04"
y
## [1] "2008-09-01"
subtraction <- x - y
subtraction
## Time difference of 64 days
class(subtraction)
## [1] "difftime"
as.numeric(subtraction)
## [1] 64
unique() will be used to get the unique abbreviation of each state.
str(polls08)
## 'data.frame': 1332 obs. of 6 variables:
##  $ state   : chr "AL" "AL" "AL" "AL" ...
##  $ Pollster: chr "SurveyUSA-2" "Capital Survey-2" "SurveyUSA-2" "Capital Survey-2" ...
##  $ Obama   : int 36 34 35 35 39 34 36 25 35 34 ...
##  $ McCain  : int 61 54 62 55 60 64 58 52 55 47 ...
##  $ middate : chr "2008-10-27" "2008-10-15" "2008-10-08" "2008-10-06" ...
##  $ margin  : int -25 -20 -27 -20 -21 -30 -22 -27 -20 -13 ...
ymd() solves this problem, converting middate from character to Date.
polls08 <- polls08 %>% mutate(middate = ymd(middate))
str(polls08)
## 'data.frame': 1332 obs. of 6 variables:
##  $ state   : chr "AL" "AL" "AL" "AL" ...
##  $ Pollster: chr "SurveyUSA-2" "Capital Survey-2" "SurveyUSA-2" "Capital Survey-2" ...
##  $ Obama   : int 36 34 35 35 39 34 36 25 35 34 ...
##  $ McCain  : int 61 54 62 55 60 64 58 52 55 47 ...
##  $ middate : Date, format: "2008-10-27" "2008-10-15" ...
##  $ margin  : int -25 -20 -27 -20 -21 -30 -22 -27 -20 -13 ...
election_date <- ymd("2008-11-04")
polls08 <- polls08 %>%
mutate(DaysToElection = as.numeric(election_date - middate))
poll.pred <- rep(NA,51)
st.names <- unique(polls08$state)
names(poll.pred) <- as.character(st.names)
for(i in seq_along(st.names)) {
state.data <- polls08 %>%
filter(state == st.names[i])
min_days <- min(state.data$DaysToElection)
state.data <- state.data %>%
filter(DaysToElection == min_days)
poll.pred[i] <- mean(state.data$margin)
}
head(poll.pred, n=10)
##    AL    AK    AZ    AR    CA    CO    CT    DC    DE    FL
## -25.0 -19.0  -2.5  -7.0  24.0   7.0  25.0  69.0  30.0   2.0
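The same state-level predictions can also be computed without an explicit loop, using grouped dplyr verbs; this is an alternative sketch, not the code used in these notes:
polls08 %>%
  group_by(state) %>%
  filter(DaysToElection == min(DaysToElection)) %>%   # keep each state's latest polls
  summarize(poll.pred = mean(margin))                  # average margin of those polls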
errors <- pres08$margin - poll.pred
head(errors, n=10)
##    AL    AK    AZ    AR    CA    CO    CT    DC    DE    FL
##   4.0  -2.0  -6.5 -13.0   0.0   2.0  -2.0  16.0  -5.0   1.0
mean(errors)
## [1] 1.062092
sqrt(mean(errors^2))
## [1] 5.90894
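The two quantities just computed are the average prediction error (bias) and the root-mean-squared error of the poll predictions:
\[ \text{bias} = \frac{1}{n}\sum_{i=1}^{n} e_i, \qquad \text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n} e_i^2}, \]
where \(e_i\) is the prediction error for state \(i\).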
errors_tib <- as_tibble(errors)
errors_tib %>%
ggplot(aes(errors, after_stat(density))) +
geom_histogram(binwidth = 5, boundary = 0) +
geom_vline(xintercept = mean(errors_tib$value)) +
annotate("text", x=-.2, y=0.07, hjust=1, label = "average error", size=14/.pt) +
labs(title = "Poll prediction error", y = "Density",
x= "Error in predicit margin for Obama (percentage points") +
theme_classic(base_size = 13)
cbind() can be used here; to use cbind(), the observations must be in the same order in the columns being combined.
pres08 <- pres08 %>% cbind(poll.pred = poll.pred)
ggplot(data=pres08, aes(x=poll.pred, y=margin)) +
  geom_text(aes(label = state)) +
  geom_abline(linetype="dashed") +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  ylim(-40, 100) +
  xlim(-40, 100) +
  labs(x = "Poll results", y="Actual election results")
sign() returns the sign of a number: -1 if it is negative and 1 if it is positive. When the margin has the same sign in the poll and in the observed result, the poll called the winner correctly (in the plot this corresponds to the lower-left and upper-right quadrants); when the signs differ, the poll got the winner wrong. We use sign() to find the states where the polls missed.
pres08 <- pres08 %>%
  mutate(correct = if_else(sign(poll.pred) == sign(margin), 1, 0))
pres08 %>%
  filter(correct == 0) %>%
  select(state.name, Obama, McCain, margin, poll.pred, correct)
##        state.name Obama McCain margin poll.pred correct
## IN        Indiana    50     49      1        -5       0
## MO       Missouri    48     49     -1         1       0
## NC North Carolina    50     49      1        -1       0
…
# Electoral votes actually won
pres08 %>% filter(margin > 0) %>% summarize(total_EV = sum(EV))
##   total_EV
## 1      364
# Electoral votes predicted by the polls
pres08 %>% filter(poll.pred > 0) %>% summarize(pred_EV = sum(EV))
##   pred_EV
## 1     349
pollsUS08 <- read.csv("pollsUS08.csv")
str(pollsUS08)
## 'data.frame': 526 obs. of 4 variables:
##  $ Pollster: chr "IBD/TIPP " "GWU (Lake/Tarrance) " "Diageo/Hotline " "Newsweek " ...
##  $ McCain  : int 48 51 43 44 44 45 49 42 44 48 ...
##  $ Obama   : int 36 39 38 46 47 47 42 48 44 48 ...
##  $ middate : chr "2007-07-04" "2007-07-11" "2007-07-14" "2007-07-19" ...
head(pollsUS08, n=15)
##               Pollster McCain Obama    middate
## 1             IBD/TIPP     48    36 2007-07-04
## 2  GWU (Lake/Tarrance)     51    39 2007-07-11
## 3       Diageo/Hotline     43    38 2007-07-14
## 4             Newsweek     44    46 2007-07-19
## 5            Rasmussen     44    47 2007-07-19
## 6             ABC/Post     45    47 2007-07-19
## 7                 Time     49    42 2007-07-24
## 8             Newsweek     42    48 2007-07-26
## 9            Rasmussen     44    44 2007-08-09
## 10    USA Today/Gallup     48    48 2007-08-11
## 11         WNBC/Marist     44    41 2007-08-15
## 12          Quinnipiac     43    43 2007-08-17
## 13               Zogby     40    44 2007-08-24
## 14                Time     42    46 2007-08-26
## 15            Newsweek     43    45 2007-08-30
pollsUS08 <- pollsUS08 %>% mutate(middate = ymd(middate))
str(pollsUS08)
## 'data.frame': 526 obs. of 4 variables:
##  $ Pollster: chr "IBD/TIPP " "GWU (Lake/Tarrance) " "Diageo/Hotline " "Newsweek " ...
##  $ McCain  : int 48 51 43 44 44 45 49 42 44 48 ...
##  $ Obama   : int 36 39 38 46 47 47 42 48 44 48 ...
##  $ middate : Date, format: "2007-07-04" "2007-07-11" ...
all_dates <- seq(min(pollsUS08$middate), election_date, by="days")
head(all_dates)
## [1] "2007-07-04" "2007-07-05" "2007-07-06" "2007-07-07" "2007-07-08" ## [6] "2007-07-09"
tail(all_dates)
## [1] "2008-10-30" "2008-10-31" "2008-11-01" "2008-11-02" "2008-11-03" ## [6] "2008-11-04"
prior_days <- 7
vote_avg <- vector(length(all_dates), mode = "list")
for (i in seq_along(all_dates)) {
date <- all_dates[i]
week_data <-
filter(pollsUS08, as.numeric(date - middate) >= 0,
as.numeric(date - middate) < prior_days) %>%
summarize(Obama = mean(Obama, na.rm = TRUE),
McCain = mean(McCain, na.rm = TRUE))
week_data$date <- date
vote_avg[[i]] <- week_data
}
head(vote_avg, n=4)
## [[1]]
##   Obama McCain       date
## 1    36     48 2007-07-04
##
## [[2]]
##   Obama McCain       date
## 1    36     48 2007-07-05
##
## [[3]]
##   Obama McCain       date
## 1    36     48 2007-07-06
##
## [[4]]
##   Obama McCain       date
## 1    36     48 2007-07-07
vote_avg_df <- bind_rows(vote_avg)
head(vote_avg_df, n=4)
##   Obama McCain       date
## 1    36     48 2007-07-04
## 2    36     48 2007-07-05
## 3    36     48 2007-07-06
## 4    36     48 2007-07-07
vote_avg_df %>%
filter(election_date - date <= 90) %>%
ggplot() +
geom_point(aes(x=date, y=Obama), color = "blue", shape = 1) +
geom_point(aes(x=date, y=McCain), color = "red", shape=1) +
ylim(40,55) +
labs(y="Support for candidate (percentage points)", x="Date") +
annotate("text", x=ymd("2008-08-15"), y=47, label="Obama", color="blue") +
annotate("text", x=ymd("2008-08-15"), y=41, label="McCain", color="red") +
geom_vline(xintercept = election_date) +
geom_point(aes(x=election_date, y=52.93), color="blue") +
geom_point(aes(x=election_date, y=45.65), color="red") +
theme_classic()
…
face <- read.csv("face.csv")
face <- mutate(face,
d.share = d.votes/(d.votes + r.votes),
r.share = r.votes/(d.votes + r.votes),
diff.share = d.share - r.share)
face %>%
ggplot(aes(d.comp, diff.share, color=w.party)) +
geom_point() +
scale_colour_manual(values = c(D = "blue", R = "red")) +
labs(x="Competence score for Democrats",
y="Democrats margin in vote share",
title = "Facial competence and vote share") +
ylim(-1,1) +
xlim(0,1) +
theme(legend.position = "none")
cor(face$d.comp, face$diff.share)
## [1] 0.4327743
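For reference, cor() computes the Pearson correlation coefficient,
\[ r = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2}\,\sqrt{\sum_i (y_i - \bar{y})^2}}, \]
a number between -1 and 1 that measures the strength of the linear association; as the next example shows, very different datasets can share nearly the same correlation.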
library(datasauRus)
data("datasaurus_dozen")
datasaurus_dozen %>%
filter(dataset %in% c("bullseye", "dino", "circle", "x_shape")) %>%
group_by(dataset) %>%
summarise(media_x = mean(x),
media_y = mean(y),
sd_x = sd(x),
sd_y = sd(y),
cor_xy = cor(x,y))
## # A tibble: 4 × 6
##   dataset  media_x media_y  sd_x  sd_y  cor_xy
##   <chr>      <dbl>   <dbl> <dbl> <dbl>   <dbl>
## 1 bullseye    54.3    47.8  16.8  26.9 -0.0686
## 2 circle      54.3    47.8  16.8  26.9 -0.0683
## 3 dino        54.3    47.8  16.8  26.9 -0.0645
## 4 x_shape     54.3    47.8  16.8  26.9 -0.0656
R's lm() function runs linear regressions. Its main argument is a formula of the form Y ~ X; it is also common to use the data argument to say where the variables live. In R the ~ symbol denotes a formula.
fit <- lm(diff.share ~ d.comp, data = face)
fit
##
## Call:
## lm(formula = diff.share ~ d.comp, data = face)
##
## Coefficients:
## (Intercept)       d.comp
##     -0.3122       0.6604
lm(face$diff.share ~ face$d.comp)
##
## Call:
## lm(formula = face$diff.share ~ face$d.comp)
##
## Coefficients:
## (Intercept)  face$d.comp
##     -0.3122       0.6604
class(fit)
## [1] "lm"
names(fit)
## [1] "coefficients" "residuals" "effects" "rank" ## [5] "fitted.values" "assign" "qr" "df.residual" ## [9] "xlevels" "call" "terms" "model"
coef(fit)
## (Intercept)      d.comp
##  -0.3122259   0.6603815
fit$coef
## (Intercept)      d.comp
##  -0.3122259   0.6603815
head(fitted(fit))
##           1           2           3           4           5           6
##  0.06060411 -0.08643340  0.09217061  0.04539236  0.13698690 -0.10057206
library(tidymodels)
glance(fit)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic     p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>       <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.187         0.180 0.266      27.0 0.000000885     1  -10.5  27.0  35.3
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tidy() returns a data.frame in which each row holds the estimate and statistics for one coefficient.
tidy(fit)
## # A tibble: 2 × 5
##   term        estimate std.error statistic     p.value
##   <chr>          <dbl>     <dbl>     <dbl>       <dbl>
## 1 (Intercept)   -0.312    0.0660     -4.73 0.00000624
## 2 d.comp         0.660    0.127       5.19 0.000000885
augment() returns the original data, the predicted (fitted) values, the residuals, and other observation-level statistics.
augment(fit)
## # A tibble: 119 × 8
##    diff.share d.comp .fitted  .resid    .hat .sigma  .cooksd .std.resid
##         <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>    <dbl>      <dbl>
##  1     0.210   0.565  0.0606  0.150  0.00996  0.267 0.00160       0.564
##  2     0.119   0.342 -0.0864  0.206  0.0129   0.267 0.00394       0.778
##  3     0.0499  0.612  0.0922 -0.0423 0.0123   0.268 0.000158     -0.160
##  4     0.197   0.542  0.0454  0.151  0.00922  0.267 0.00151       0.570
##  5     0.496   0.680  0.137   0.359  0.0174   0.266 0.0163        1.36
##  6    -0.350   0.321 -0.101  -0.249  0.0143   0.267 0.00644      -0.941
##  7     0.697   0.404 -0.0456  0.743  0.00979  0.258 0.0388        2.80
##  8     0.266   0.603  0.0860  0.180  0.0118   0.267 0.00276       0.681
##  9    -0.372   0.539  0.0434 -0.415  0.00914  0.265 0.0113       -1.57
## 10     0.0106  0.869  0.262  -0.251  0.0426   0.267 0.0206       -0.963
## # ℹ 109 more rows
ggplot() +
geom_point(data=face, aes(d.comp, diff.share), shape=1) +
geom_abline(slope = coef(fit)["d.comp"],
intercept = coef(fit)["(Intercept)"]) +
scale_x_continuous("Competence score for Democrats",
breaks = seq(0,1,by=0.2), limits = c(0,1)) +
scale_y_continuous("Democratic margin in vote shares",
breaks = seq(-1,1,by=0.5), limits = c(-1,1)) +
geom_vline(xintercept=mean(face$d.comp), linetype="dashed") +
geom_hline(yintercept=mean(face$diff.share), linetype="dashed") +
ggtitle("Facial competence and vote share") +
theme_classic()
ggplot(data=face, aes(d.comp, diff.share)) +
geom_point(shape=1) +
geom_smooth(method="lm", se=FALSE, color="black") +
scale_x_continuous("Competence score for Democrats",
breaks = seq(0,1,by=0.2), limits = c(0,1)) +
scale_y_continuous("Democratic margin in vote shares",
breaks = seq(-1,1,by=0.5), limits = c(-1,1)) +
geom_vline(xintercept=mean(face$d.comp), linetype="dashed") +
geom_hline(yintercept=mean(face$diff.share), linetype="dashed") +
ggtitle("Facial competence and vote share") +
theme_classic()
epsilon.hat <- resid(fit)
sqrt(mean(epsilon.hat^2))
## [1] 0.2642361
mean(resid(fit))
## [1] -2.04732e-17
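The value above is numerically zero: whenever the model includes an intercept, the least-squares normal equations force the residuals to sum to zero, \( \sum_i \hat{\epsilon}_i = 0 \), so their mean is zero up to floating-point error.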
cbind() could be used, but the observations would need to be in the same order, which is not always easy to guarantee. The tidyverse has a family of functions for joining two data.frames, \(df_1\) and \(df_2\):
left_join(): keeps all rows of \(df_1\)
right_join(): keeps all rows of \(df_2\)
inner_join(): keeps only the rows common to \(df_1\) and \(df_2\)
full_join(): keeps all rows present in either \(df_1\) or \(df_2\)
head(df1)
##               time pontos
## 1 Atlético Mineiro     84
## 2         Flamengo     71
## 3        Palmeiras     66
## 4        Fortaleza     58
## 5      Corinthians     57
head(df2)
##            time pontos
## 1     Palmeiras     81
## 2 Internacional     73
## 3    Fluminense     70
## 4   Corinthians     65
## 5      Flamengo     62
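The construction of df1 and df2 is not shown; for reference they can be recreated from the printed output above (point totals of five clubs in two seasons). This is a reconstruction for illustration only, not the original source data:
df1 <- data.frame(time   = c("Atlético Mineiro", "Flamengo", "Palmeiras", "Fortaleza", "Corinthians"),
                  pontos = c(84, 71, 66, 58, 57))
df2 <- data.frame(time   = c("Palmeiras", "Internacional", "Fluminense", "Corinthians", "Flamengo"),
                  pontos = c(81, 73, 70, 65, 62))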
left_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
##               time pontos.21 pontos.22
## 1 Atlético Mineiro        84        NA
## 2         Flamengo        71        62
## 3        Palmeiras        66        81
## 4        Fortaleza        58        NA
## 5      Corinthians        57        65
right_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
##            time pontos.21 pontos.22
## 1      Flamengo        71        62
## 2     Palmeiras        66        81
## 3   Corinthians        57        65
## 4 Internacional        NA        73
## 5    Fluminense        NA        70
inner_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
##          time pontos.21 pontos.22
## 1    Flamengo        71        62
## 2   Palmeiras        66        81
## 3 Corinthians        57        65
full_join(df1,df2, by=c("time"="time"), suffix=c(".21",".22"))
##               time pontos.21 pontos.22
## 1 Atlético Mineiro        84        NA
## 2         Flamengo        71        62
## 3        Palmeiras        66        81
## 4        Fortaleza        58        NA
## 5      Corinthians        57        65
## 6    Internacional        NA        73
## 7       Fluminense        NA        70
Here we will use full_join().
pres12 <- read.csv("pres12.csv")
head(pres12)
##   state Obama Romney EV
## 1    AL    38     61  9
## 2    AK    41     55  3
## 3    AZ    45     54 11
## 4    AR    37     61  6
## 5    CA    60     37 55
## 6    CO    51     46  9
pres <- full_join(x=pres08, y=pres12, by="state")
str(pres)
## 'data.frame': 51 obs. of 11 variables:
##  $ state.name: chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ state     : chr "AL" "AK" "AZ" "AR" ...
##  $ Obama.x   : int 39 38 45 39 61 54 61 92 62 51 ...
##  $ McCain    : int 60 59 54 59 37 45 38 7 37 48 ...
##  $ EV.x      : int 9 3 10 6 55 9 7 3 3 27 ...
##  $ margin    : int -21 -21 -9 -20 24 9 23 85 25 3 ...
##  $ poll.pred : num -25 -19 -2.5 -7 24 7 25 69 30 2 ...
##  $ correct   : num 1 1 1 1 1 1 1 1 1 1 ...
##  $ Obama.y   : int 38 41 45 37 60 51 58 91 59 50 ...
##  $ Romney    : int 61 55 54 61 37 46 41 7 40 49 ...
##  $ EV.y      : int 9 3 11 6 55 9 7 3 3 29 ...
pres12 <- rename(pres12, state.abbrev=state)
head(pres12)
##   state.abbrev Obama Romney EV
## 1           AL    38     61  9
## 2           AK    41     55  3
## 3           AZ    45     54 11
## 4           AR    37     61  6
## 5           CA    60     37 55
## 6           CO    51     46  9
pres <- full_join(pres08,pres12, by = c("state" = "state.abbrev"),
suffix = c("_08", "_12"))
scale() does this job: it standardizes a variable by subtracting its mean and dividing by its standard deviation.
pres <- pres %>%
mutate(Obama2008.z = as.numeric(scale(Obama_08)),
Obama2012.z = as.numeric(scale(Obama_12)))
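With its default arguments, scale() returns (x - mean(x)) / sd(x); a quick check, not part of the original code:
all.equal(as.numeric(scale(pres$Obama_08)),
          (pres$Obama_08 - mean(pres$Obama_08)) / sd(pres$Obama_08))   # should be TRUE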
fit1 <- lm(Obama2012.z ~ Obama2008.z, data = pres)
fit1
##
## Call:
## lm(formula = Obama2012.z ~ Obama2008.z, data = pres)
##
## Coefficients:
## (Intercept)  Obama2008.z
##  -5.076e-17    9.834e-01
fit1 <- lm(Obama2012.z ~ -1 + Obama2008.z, data = pres)
fit1
##
## Call:
## lm(formula = Obama2012.z ~ -1 + Obama2008.z, data = pres)
##
## Coefficients:
## Obama2008.z
##      0.9834
pres %>%
ggplot(aes(Obama2008.z,Obama2012.z)) +
geom_smooth(method="lm", se=FALSE, linewidth=0.5) +
geom_point(shape=1) +
coord_fixed() +
scale_x_continuous(limits = c(-4,4)) +
scale_y_continuous(limits = c(-4,4)) +
labs(x = "Obama's standardized votes share in 2008",
y = "Obama's standardized votes share in 2012") +
theme_classic()
pres %>% filter(Obama2008.z < quantile(Obama2008.z, 0.25)) %>% summarize(improve = mean(Obama2012.z > Obama2008.z))
##     improve
## 1 0.5833333
pres %>% filter(Obama2008.z > quantile(Obama2008.z, 0.75)) %>% summarize(improve = mean(Obama2012.z > Obama2008.z))
##     improve
## 1 0.4615385
florida <- read.csv("florida.csv")
fit2 <- lm(Buchanan00 ~ Perot96, data=florida)
fit2
##
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida)
##
## Coefficients:
## (Intercept)      Perot96
##     1.34575      0.03592
TSS2 <- florida %>%
  mutate(diff_sq = (Buchanan00 - mean(florida$Buchanan00))^2) %>%
  summarize(TSS=sum(diff_sq))
SSR2 <- sum(resid(fit2)^2)
1 - SSR2/TSS2
##         TSS
## 1 0.5130333
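The quantity just computed is the coefficient of determination,
\[ R^2 = 1 - \frac{SSR}{TSS} = 1 - \frac{\sum_i \hat{\epsilon}_i^2}{\sum_i (Y_i - \bar{Y})^2}, \]
which the function below wraps up for reuse.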
R2 <- function(fit){
  resid <- resid(fit)            # residuals of the fitted model
  y <- fitted(fit) + resid       # recover the observed outcome
  TSS <- sum((y-mean(y))^2)      # total sum of squares
  SSR <- sum(resid^2)            # sum of squared residuals
  R2 <- 1 - SSR/TSS
  return(R2)
}
R2(fit2)
## [1] 0.5130333
summary(), when applied to an lm object, returns information about the regression, including the \(R^2\).
summary(fit2)
##
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida)
##
## Residuals:
##     Min      1Q  Median      3Q     Max
## -612.74  -65.96    1.94   32.88 2301.66
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  1.34575   49.75931   0.027    0.979
## Perot96      0.03592    0.00434   8.275 9.47e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 316.4 on 65 degrees of freedom
## Multiple R-squared: 0.513, Adjusted R-squared: 0.5055
## F-statistic: 68.48 on 1 and 65 DF, p-value: 9.474e-12
fit2summary <- summary(fit2)
fit2summary$r.squared
## [1] 0.5130333
Another option is glance() from the broom package.
glance(fit2)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.513         0.506  316.      68.5 9.47e-12     1  -480.  966.  972.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
R2(fit1)
## [1] 0.9671579
augment_fit2 <- augment(fit2)
augment_fit2 %>%
  ggplot(aes(.fitted, .resid)) +
  geom_point(shape=1) +
  geom_hline(yintercept = 0) +
  labs(x="Fitted values", y="Residuals") +
  theme_classic()
add_predictions() and add_residuals() from the modelr package help identify the county.
library(modelr)
florida_fit2 <- florida %>%
  add_predictions(fit2) %>%
  add_residuals(fit2)
florida_fit2 %>%
  filter(resid == max(resid)) %>%
  select(county) %>%
  pull()
## [1] "PalmBeach"
…
florida.pb <- filter(florida, county != "PalmBeach")
fit3 <- lm(Buchanan00 ~ Perot96, data = florida.pb)
fit3
##
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida.pb)
##
## Coefficients:
## (Intercept)      Perot96
##    45.84193      0.02435
R2(fit3)
## [1] 0.8511675
florida.pb %>%
add_residuals(fit3) %>%
add_predictions(fit3) %>%
ggplot(aes(pred, resid)) +
geom_hline(yintercept = 0) +
geom_point(shape=1) +
ylim(-750, 2500) +
xlim(0,1500) +
labs(title = "Residual plot without Plam Beach",
x= "Fitted values",
y="Residuals") +
theme_classic()
ggplot() +
geom_point(data = florida, aes(Perot96, Buchanan00), shape=1) +
geom_smooth(data = florida, aes(Perot96, Buchanan00), method="lm", se=FALSE) +
geom_smooth(data = florida.pb, aes(Perot96, Buchanan00), method="lm",
se=FALSE, linetype="dashed") +
annotate("text", x=30000, y=3200, label = "Palm Beach") +
annotate("text", x=30000, y=300, label = "Regression line without\n Palm Beach") +
annotate("text", x=25000, y=1400, label = "Regression line with\n Palm Beach") +
theme_classic()
women <- read.csv("women.csv")
head(women)
##   GP village reserved female irrigation water
## 1  1       2        1      1          0    10
## 2  1       1        1      1          5     0
## 3  2       2        1      1          2     2
## 4  2       1        1      1          4    31
## 5  3       2        0      0          0     0
## 6  3       1        0      0          0     0
women %>% group_by(reserved) %>% summarize(prop_female = mean(female))
## # A tibble: 2 × 2
##   reserved prop_female
##      <int>       <dbl>
## 1        0      0.0748
## 2        1      1
women %>%
group_by(reserved) %>%
summarize(irrigation = mean(irrigation),
water = mean(water)) %>%
pivot_longer(names_to = "variables", -reserved) %>%
pivot_wider(names_from = reserved) %>%
rename("not_reserved" = `0`, "reserved" = `1`) %>%
mutate(diff = reserved - not_reserved)
## # A tibble: 2 × 4
##   variables  not_reserved reserved   diff
##   <chr>             <dbl>    <dbl>  <dbl>
## 1 irrigation         3.39     3.02 -0.369
## 2 water              14.7     24.0  9.25
lm(water ~ reserved, data = women)
##
## Call:
## lm(formula = water ~ reserved, data = women)
##
## Coefficients:
## (Intercept)     reserved
##      14.738        9.252
lm(irrigation ~ reserved, data = women)
##
## Call:
## lm(formula = irrigation ~ reserved, data = women)
##
## Coefficients:
## (Intercept)     reserved
##      3.3879      -0.3693
With lm(), the formula in the function call must include all explanatory variables, for example: lm(y ~ x1 + x2 + x3). For a categorical explanatory variable, lm() automatically creates a set of binary variables equal to one for the observations in each group and zero otherwise.
social <- read.csv("social.csv")
class(social$messages)
## [1] "character"
head(social$messages)
## [1] "Civic Duty" "Civic Duty" "Hawthorne" "Hawthorne" "Hawthorne" ## [6] "Control"
unique(social$messages)
## [1] "Civic Duty" "Hawthorne" "Control" "Neighbors"
fit <- lm(primary2006 ~ messages, data=social)
fit
##
## Call:
## lm(formula = primary2006 ~ messages, data = social)
##
## Coefficients:
##       (Intercept)    messagesControl  messagesHawthorne  messagesNeighbors
##          0.314538          -0.017899           0.007837           0.063411
social <- social %>%
mutate(Control = if_else(messages == "Control", 1, 0),
Hawthorne = if_else(messages == "Hawthorne", 1, 0),
Neighbors = if_else(messages == "Neighbors", 1, 0))
social %>% select(Control, Hawthorne, Neighbors) %>% head()
##   Control Hawthorne Neighbors
## 1       0         0         0
## 2       0         0         0
## 3       0         1         0
## 4       0         1         0
## 5       0         1         0
## 6       1         0         0
lm(primary2006 ~ Control + Hawthorne + Neighbors, data = social)
##
## Call:
## lm(formula = primary2006 ~ Control + Hawthorne + Neighbors, data = social)
##
## Coefficients:
## (Intercept)      Control    Hawthorne    Neighbors
##    0.314538    -0.017899     0.007837     0.063411
Predictions can be obtained with add_predictions() from the modelr package. For this we will use data_grid(), also from modelr, to build a data.frame with each unique value of the messages variable. data_grid() creates a data.frame with the unique combinations of the specified columns.
data_grid(social, messages)
## # A tibble: 4 × 1
##   messages
##   <chr>
## 1 Civic Duty
## 2 Control
## 3 Hawthorne
## 4 Neighbors
unique_messages <- data_grid(social, messages) %>%
  add_predictions(fit)
unique_messages
## # A tibble: 4 × 2
##   messages    pred
##   <chr>      <dbl>
## 1 Civic Duty 0.315
## 2 Control    0.297
## 3 Hawthorne  0.322
## 4 Neighbors  0.378
social %>% group_by(messages) %>% summarize(mean(primary2006))
## # A tibble: 4 × 2
##   messages   `mean(primary2006)`
##   <chr>                    <dbl>
## 1 Civic Duty               0.315
## 2 Control                  0.297
## 3 Hawthorne                0.322
## 4 Neighbors                0.378
fit.noint <- lm(primary2006 ~ -1 + messages, data=social)
fit.noint
##
## Call:
## lm(formula = primary2006 ~ -1 + messages, data = social)
##
## Coefficients:
## messagesCivic Duty    messagesControl  messagesHawthorne  messagesNeighbors
##             0.3145             0.2966             0.3224             0.3779
relevel() changes the reference level of the factor.
social.f <- social %>%
mutate(messages.f = as.factor(messages),
mes.C = relevel(messages.f, ref = "Control"),
mes.H = relevel(messages.f, ref = "Hawthorne"),
mes.N = relevel(messages.f, ref = "Neighbors"))
lm(primary2006 ~ mes.C, data = social.f)
##
## Call:
## lm(formula = primary2006 ~ mes.C, data = social.f)
##
## Coefficients:
##     (Intercept)  mes.CCivic Duty   mes.CHawthorne   mes.CNeighbors
##         0.29664          0.01790          0.02574          0.08131
lm(primary2006 ~ mes.H, data = social.f)
##
## Call:
## lm(formula = primary2006 ~ mes.H, data = social.f)
##
## Coefficients:
##     (Intercept)  mes.HCivic Duty     mes.HControl   mes.HNeighbors
##        0.322375        -0.007837        -0.025736         0.055574
lm(primary2006 ~ mes.N, data = social.f)
##
## Call:
## lm(formula = primary2006 ~ mes.N, data = social.f)
##
## Coefficients:
##     (Intercept)  mes.NCivic Duty     mes.NControl   mes.NHawthorne
##         0.37795         -0.06341         -0.08131         -0.05557
The same comparisons can be made with group_by(), computing the difference relative to the group of interest.
social %>%
group_by(messages) %>%
summarize(primary2006 = mean(primary2006)) %>%
mutate(Control = primary2006[messages == "Control"],
diff = primary2006 - Control)
## # A tibble: 4 × 4
##   messages   primary2006 Control   diff
##   <chr>            <dbl>   <dbl>  <dbl>
## 1 Civic Duty       0.315   0.297 0.0179
## 2 Control          0.297   0.297 0
## 3 Hawthorne        0.322   0.297 0.0257
## 4 Neighbors        0.378   0.297 0.0813
adjR2 <- function(fit) {
resid <- resid(fit)
y <- fitted(fit) + resid
n <- length(y)
TSS.adj <- sum((y - mean(y))^2)/(n - 1)
SSR.adj <- sum(resid^2)/(n - length(coef(fit)))
R2.adj <- 1 - SSR.adj/TSS.adj
return(R2.adj)
}
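The adjustment penalizes the number of estimated coefficients \(k\): with \(n\) observations,
\[ R^2_{adj} = 1 - \frac{SSR/(n-k)}{TSS/(n-1)}, \]
which is exactly what adjR2() above computes.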
adjR2(fit)
## [1] 0.003272788
R2(fit)
## [1] 0.003282564
summary(fit)$adj.r.squared
## [1] 0.003272788
ate <- social %>%
  group_by(primary2004, messages) %>%
  summarize(primary2006 = mean(primary2006)) %>%
  pivot_wider(names_from = messages, values_from = primary2006) %>%
  mutate(ate_Neighbors = Neighbors - Control) %>%
  select(primary2004, Neighbors, Control, ate_Neighbors) %>%
  ungroup()
ate
## # A tibble: 2 × 4
##   primary2004 Neighbors Control ate_Neighbors
##         <int>     <dbl>   <dbl>         <dbl>
## 1           0     0.306   0.237        0.0693
## 2           1     0.482   0.386        0.0965
ate.voter <- filter(ate, primary2004 == 1) %>% select(ate_Neighbors) %>% pull()
ate.nonvoter <- filter(ate, primary2004 == 0) %>% select(ate_Neighbors) %>% pull()
ate.voter - ate.nonvoter
## [1] 0.02722908
social.neighbor <- social %>%
filter(messages == "Control" | messages == "Neighbors")
fit.int <- lm(primary2006 ~ primary2004 + messages + primary2004:messages,
data = social.neighbor)
fit.int
##
## Call:
## lm(formula = primary2006 ~ primary2004 + messages + primary2004:messages,
##     data = social.neighbor)
##
## Coefficients:
##                   (Intercept)                    primary2004
##                       0.23711                        0.14870
##             messagesNeighbors  primary2004:messagesNeighbors
##                       0.06930                        0.02723
lm(primary2006 ~ primary2004*messages, data = social.neighbor)
##
## Call:
## lm(formula = primary2006 ~ primary2004 * messages, data = social.neighbor)
##
## Coefficients:
##                   (Intercept)                    primary2004
##                       0.23711                        0.14870
##             messagesNeighbors  primary2004:messagesNeighbors
##                       0.06930                        0.02723
social.neighbor <- social.neighbor %>% mutate(age = 2008 - yearofbirth)
summary(social.neighbor$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   22.00   43.00   52.00   51.82   61.00  108.00
fit.age <- lm(primary2006 ~ age*messages, data = social.neighbor)
fit.age
##
## Call:
## lm(formula = primary2006 ~ age * messages, data = social.neighbor)
##
## Coefficients:
##           (Intercept)                    age      messagesNeighbors
##             0.0894768              0.0039982              0.0485728
## age:messagesNeighbors
##             0.0006283
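In this specification the fitted turnout is \( \hat{Y} = \hat{\beta}_0 + \hat{\beta}_1\,\text{age} + \hat{\beta}_2\,\text{Neighbors} + \hat{\beta}_3\,(\text{age} \times \text{Neighbors}) \), so the estimated treatment effect at a given age is \( \hat{\beta}_2 + \hat{\beta}_3 \times \text{age} \); the predictions below trace this out at selected ages.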
crossing() from the tidyr package will be used to create the combinations of age and treatment received.
crossing(age = seq(from=20, to=80, by=20), messages = c("Neighbors", "Control"))
## # A tibble: 8 × 2
##     age messages
##   <dbl> <chr>
## 1    20 Control
## 2    20 Neighbors
## 3    40 Control
## 4    40 Neighbors
## 5    60 Control
## 6    60 Neighbors
## 7    80 Control
## 8    80 Neighbors
ate.age <- tidyr::crossing(age = seq(from=20, to=80, by=20),
messages = c("Neighbors", "Control")) %>%
add_predictions(fit.age) %>%
pivot_wider(names_from = messages, values_from = pred) %>%
mutate(diff = Neighbors - Control)
ate.age
## # A tibble: 4 × 4
##     age Control Neighbors   diff
##   <dbl>   <dbl>     <dbl>  <dbl>
## 1    20   0.169     0.231 0.0611
## 2    40   0.249     0.323 0.0737
## 3    60   0.329     0.416 0.0863
## 4    80   0.409     0.508 0.0988
I() allows transformed terms, such as age^2, to be used inside a model formula.
fit.age2 <- lm(primary2006 ~ age + I(age^2) + messages + age:messages + I(age^2):messages,
data = social.neighbor)
fit.age2
##
## Call:
## lm(formula = primary2006 ~ age + I(age^2) + messages + age:messages +
##     I(age^2):messages, data = social.neighbor)
##
## Coefficients:
##                (Intercept)                         age
##                 -9.700e-02                   1.172e-02
##                   I(age^2)           messagesNeighbors
##                 -7.389e-05                  -5.275e-02
##      age:messagesNeighbors  I(age^2):messagesNeighbors
##                  4.804e-03                  -3.961e-05
data_grid() from the modelr package creates all possible combinations of age and treatment; these combinations are the scenarios for which we will make predictions.
social.neighbor %>% data_grid(age, messages)
## # A tibble: 172 × 2
##      age messages
##    <dbl> <chr>
##  1    22 Control
##  2    22 Neighbors
##  3    23 Control
##  4    23 Neighbors
##  5    24 Control
##  6    24 Neighbors
##  7    25 Control
##  8    25 Neighbors
##  9    26 Control
## 10    26 Neighbors
## # ℹ 162 more rows
y.hat <- social.neighbor %>%
  data_grid(age, messages) %>%
  add_predictions(fit.age2)
head(y.hat)
## # A tibble: 6 × 3
##     age messages   pred
##   <dbl> <chr>     <dbl>
## 1    22 Control   0.125
## 2    22 Neighbors 0.159
## 3    23 Control   0.134
## 4    23 Neighbors 0.170
## 5    24 Control   0.142
## 6    24 Neighbors 0.182
ggplot(y.hat, aes(age, pred)) +
geom_line(aes(linetype = messages, color = messages)) +
labs(color = "", linetype="",
y = "Predicited turnout rate", x="Age") +
xlim(20,90) +
theme_classic()
social.neighbor.ate <- y.hat %>%
  pivot_wider(names_from = messages, values_from = pred) %>%
  mutate(ate = Neighbors - Control)
head(social.neighbor.ate)
## # A tibble: 6 × 4
##     age Control Neighbors    ate
##   <dbl>   <dbl>     <dbl>  <dbl>
## 1    22   0.125     0.159 0.0338
## 2    23   0.134     0.170 0.0368
## 3    24   0.142     0.182 0.0397
## 4    25   0.150     0.192 0.0426
## 5    26   0.158     0.203 0.0454
## 6    27   0.166     0.214 0.0481
head(social.neighbor.ate)
## # A tibble: 6 × 4
##     age Control Neighbors    ate
##   <dbl>   <dbl>     <dbl>  <dbl>
## 1    22   0.125     0.159 0.0338
## 2    23   0.134     0.170 0.0368
## 3    24   0.142     0.182 0.0397
## 4    25   0.150     0.192 0.0426
## 5    26   0.158     0.203 0.0454
## 6    27   0.166     0.214 0.0481
social.neighbor.ate %>%
ggplot(aes(age, ate)) +
geom_line() +
labs(y="Estimated average treatment effect",
x = "Age") +
xlim(20,90) +
ylim(0,0.1) +
theme_classic()
MPs <- read.csv("MPs.csv")
labour_winners <- filter(MPs, party=="labour", margin > 0)
labour_losers <- filter(MPs, party=="labour", margin < 0)
tory_winners <- filter(MPs, party=="tory", margin > 0)
tory_losers <- filter(MPs, party=="tory", margin < 0)
labour_fit_win <- lm(ln.net ~ margin, data = labour_winners)
labour_fit_lose <- lm(ln.net ~ margin, data = labour_losers)
tory_fit_win <- lm(ln.net ~ margin, data = tory_winners)
tory_fit_lose <- lm(ln.net ~ margin, data = tory_losers)
y1_labour_win <- labour_winners %>% data_grid(margin) %>% add_predictions(labour_fit_win)
y2_labour_lose <- labour_losers %>% data_grid(margin) %>% add_predictions(labour_fit_lose)
y1_tory_win <- tory_winners %>% data_grid(margin) %>% add_predictions(tory_fit_win)
y2_tory_lose <- tory_losers %>% data_grid(margin) %>% add_predictions(tory_fit_lose)
head(y1_labour_win)
## # A tibble: 6 × 2
##    margin  pred
##     <dbl> <dbl>
## 1 0.00243  11.9
## 2 0.00372  11.9
## 3 0.00647  11.9
## 4 0.00938  12.0
## 5 0.0150   12.0
## 6 0.0161   12.0
ggplot() +
geom_point(data = labour_winners, aes(margin, ln.net), shape = 1) +
geom_point(data = labour_losers, aes(margin, ln.net), shape = 1) +
geom_line(data = y1_labour_win, aes(margin, pred), color="blue", linewidth=1) +
geom_line(data = y2_labour_lose, aes(margin, pred), color="blue", linewidth=1) +
geom_vline(xintercept=0, linetype="dashed") +
labs(title = "Labour",
x = "Margin of victory",
y = "log net wealth at death") +
xlim(-0.5, 0.5) +
ylim(6, 18) +
theme_classic()
ggplot() +
geom_point(data = tory_winners, aes(margin, ln.net), shape = 1) +
geom_point(data = tory_losers, aes(margin, ln.net), shape = 1) +
geom_line(data = y1_tory_win, aes(margin, pred), color="blue", linewidth=1) +
geom_line(data = y2_tory_lose, aes(margin, pred), color="blue", linewidth=1) +
geom_vline(xintercept=0, linetype="dashed") +
labs(title = "Tory",
x = "Margin of victory",
y = "log net wealth at death") +
xlim(-0.5, 0.5) +
ylim(6, 18) +
theme_classic()
exp() reverses the logarithm, and finally we compute the difference. The predictions at margin = 0 can be obtained with spread_predictions() from the modelr package.
spread_predictions(tibble(margin=0),
tory_fit_win, tory_fit_lose)
## # A tibble: 1 × 3
##   margin tory_fit_win tory_fit_lose
##    <dbl>        <dbl>         <dbl>
## 1      0         13.2          12.5
spread_predictions(tibble(margin=0),
tory_fit_win, tory_fit_lose) %>%
mutate(rd_est = exp(tory_fit_win) - exp(tory_fit_lose)) %>%
select(rd_est) %>%
pull()
##        1
## 255050.9
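In other words, the estimate computed above is \( \widehat{RD} = e^{\hat{\alpha}_{\text{win}}} - e^{\hat{\alpha}_{\text{lose}}} \), the difference between the two regressions' predictions of net wealth (exponentiated back from the log scale) evaluated at a margin of exactly zero.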
tory_fit_win_placebo <- lm(margin.pre ~ margin, data = tory_winners)
tory_fit_lose_placebo <- lm(margin.pre ~ margin, data = tory_losers)
The intercepts can be extracted with tidy() from the broom package.
win_intercept <- tidy(tory_fit_win_placebo) %>%
  filter(term == "(Intercept)") %>%
  select(estimate) %>%
  pull()
lose_intercept <- tidy(tory_fit_lose_placebo) %>%
  filter(term == "(Intercept)") %>%
  select(estimate) %>%
  pull()
win_intercept - lose_intercept
## [1] -0.01725578