2023/01/31 (updated: 2024-08-19)
afghan <- read.csv("afghan.csv")
str(afghan)
## 'data.frame':    2754 obs. of  11 variables:
##  $ province           : chr  "Logar" "Logar" "Logar" "Logar" ...
##  $ district           : chr  "Baraki Barak" "Baraki Barak" "Baraki Barak" "Baraki Barak" ...
##  $ village.id         : int  80 80 80 80 80 80 80 80 80 35 ...
##  $ age                : int  26 49 60 34 21 18 42 39 20 18 ...
##  $ educ.years         : int  10 3 0 14 12 10 6 12 5 10 ...
##  $ employed           : int  0 1 1 1 1 1 1 1 1 0 ...
##  $ income             : chr  "2,001-10,000" "2,001-10,000" "2,001-10,000" "2,001-10,000" ...
##  $ violent.exp.ISAF   : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ violent.exp.taliban: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ list.group         : chr  "control" "control" "control" "ISAF" ...
##  $ list.response      : int  0 1 1 3 3 2 1 3 1 2 ...
head(afghan)
##   province     district village.id age educ.years employed       income
## 1    Logar Baraki Barak         80  26         10        0 2,001-10,000
## 2    Logar Baraki Barak         80  49          3        1 2,001-10,000
## 3    Logar Baraki Barak         80  60          0        1 2,001-10,000
## 4    Logar Baraki Barak         80  34         14        1 2,001-10,000
## 5    Logar Baraki Barak         80  21         12        1 2,001-10,000
## 6    Logar Baraki Barak         80  18         10        1         <NA>
##   violent.exp.ISAF violent.exp.taliban list.group list.response
## 1                0                   0    control             0
## 2                0                   0    control             1
## 3                1                   0    control             1
## 4                0                   0       ISAF             3
## 5                0                   0       ISAF             3
## 6                0                   0       ISAF             2
library(tidyverse)
afghan %>%
  select(age, educ.years, employed, income) %>%
  summary()
##       age          educ.years        employed         income         
##  Min.   :15.00   Min.   : 0.000   Min.   :0.0000   Length:2754       
##  1st Qu.:22.00   1st Qu.: 0.000   1st Qu.:0.0000   Class :character  
##  Median :30.00   Median : 1.000   Median :1.0000   Mode  :character  
##  Mean   :32.39   Mean   : 4.002   Mean   :0.5828                     
##  3rd Qu.:40.00   3rd Qu.: 8.000   3rd Qu.:1.0000                     
##  Max.   :80.00   Max.   :18.000   Max.   :1.0000                     
afghan %>% select(age, educ.years, employed, income) %>% mutate(income = as.factor(income)) %>% summary()
##       age          educ.years        employed                 income    
##  Min.   :15.00   Min.   : 0.000   Min.   :0.0000   10,001-20,000  : 616  
##  1st Qu.:22.00   1st Qu.: 0.000   1st Qu.:0.0000   2,001-10,000   :1420  
##  Median :30.00   Median : 1.000   Median :1.0000   20,001-30,000  :  93  
##  Mean   :32.39   Mean   : 4.002   Mean   :0.5828   less than 2,000: 457  
##  3rd Qu.:40.00   3rd Qu.: 8.000   3rd Qu.:1.0000   over 30,000    :  14  
##  Max.   :80.00   Max.   :18.000   Max.   :1.0000   NA's           : 154  
count(afghan, income)
##            income    n
## 1   10,001-20,000  616
## 2    2,001-10,000 1420
## 3   20,001-30,000   93
## 4 less than 2,000  457
## 5     over 30,000   14
## 6            <NA>  154
unique(afghan$income)
## [1] "2,001-10,000" NA "10,001-20,000" "less than 2,000" ## [5] "20,001-30,000" "over 30,000"
harm_props <- afghan %>% group_by(violent.exp.ISAF, violent.exp.taliban) %>% count() %>% ungroup() %>% mutate(prop = n/sum(n))
## # A tibble: 9 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 0.483 ## 2 0 1 354 0.129 ## 3 0 NA 22 0.00799 ## 4 1 0 475 0.172 ## 5 1 1 526 0.191 ## 6 1 NA 22 0.00799 ## 7 NA 0 7 0.00254 ## 8 NA 1 8 0.00290 ## 9 NA NA 10 0.00363
afghan %>%
  group_by(violent.exp.ISAF, violent.exp.taliban) %>%
  count() %>%
  # ungroup() %>%
  mutate(prop = n/sum(n))
## # A tibble: 9 × 4 ## # Groups: violent.exp.ISAF, violent.exp.taliban [9] ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 1 ## 2 0 1 354 1 ## 3 0 NA 22 1 ## 4 1 0 475 1 ## 5 1 1 526 1 ## 6 1 NA 22 1 ## 7 NA 0 7 1 ## 8 NA 1 8 1 ## 9 NA NA 10 1
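As the output above shows, without ungroup() the proportions are computed within each group and all equal 1. A minimal sketch of an alternative that avoids the grouping issue altogether, passing both variables directly to count() (same afghan data as above):
afghan %>%
  count(violent.exp.ISAF, violent.exp.taliban) %>% # returns an ungrouped data frame
  mutate(prop = n/sum(n))                          # proportions over the whole sample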
ISAF_harm_prop <- harm_props %>%
  filter(violent.exp.ISAF == 1) %>%
  summarize(harm_prop = sum(prop)) %>%
  pull()
ISAF_harm_prop
## [1] 0.3714597
talib_harm_prop <- harm_props %>%
  filter(violent.exp.taliban == 1) %>%
  summarize(harm_prop = sum(prop)) %>%
  pull()
talib_harm_prop
## [1] 0.3224401
both_harm_prop <- harm_props %>%
  filter(violent.exp.taliban == 1 & violent.exp.ISAF == 1) %>%
  summarize(harm_prop = sum(prop)) %>%
  pull()
both_harm_prop
## [1] 0.1909949
afghan %>% select(income) %>% slice(1:10)
## income ## 1 2,001-10,000 ## 2 2,001-10,000 ## 3 2,001-10,000 ## 4 2,001-10,000 ## 5 2,001-10,000 ## 6 <NA> ## 7 10,001-20,000 ## 8 2,001-10,000 ## 9 2,001-10,000 ## 10 <NA>
is.na() identifies the NAs present in the data.
afghan %>%
  select(income) %>%
  slice(1:10) %>%
  is.na()
## income ## [1,] FALSE ## [2,] FALSE ## [3,] FALSE ## [4,] FALSE ## [5,] FALSE ## [6,] TRUE ## [7,] FALSE ## [8,] FALSE ## [9,] FALSE ## [10,] TRUE
The sum of is.na() gives the number of NAs and its mean gives the proportion of NAs.
afghan %>%
summarize(n_missing = sum(is.na(income)),
p_missing = mean(is.na(income)))
## n_missing p_missing ## 1 154 0.05591866
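The same idea extends to every column at once; a short sketch using across() over whatever columns afghan contains:
afghan %>%
  summarize(across(everything(), ~ mean(is.na(.x)))) # proportion of NAs in each column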
5 + NA
## [1] NA
5*NA
## [1] NA
x <- c(1,2,3,4,NA)
sum(x)
## [1] NA
sum(x, na.rm = TRUE)
## [1] 10
x <- c(1,2,3,4,NA)
mean(x)
## [1] NA
mean(x, na.rm = TRUE)
## [1] 2.5
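Note that NA also propagates through comparisons, which is why is.na() is needed in the first place; a small illustration:
x == NA   # comparisons with NA return NA, never TRUE or FALSE
is.na(x)  # the correct way to test for missing values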
filter() combined with is.na() offers a quick way to remove NAs from an analysis.
afghan %>%
  filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>%
  group_by(violent.exp.ISAF, violent.exp.taliban) %>%
  count() %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  arrange(prop)
## # A tibble: 4 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 1 354 0.132 ## 2 1 0 475 0.177 ## 3 1 1 526 0.196 ## 4 0 0 1330 0.495
arrange() sorted the data from the smallest to the largest proportion; we can sort in descending order using desc().
afghan %>%
  filter(!is.na(violent.exp.ISAF), !is.na(violent.exp.taliban)) %>%
  group_by(violent.exp.ISAF, violent.exp.taliban) %>%
  count() %>%
  ungroup() %>%
  mutate(prop = n/sum(n)) %>%
  arrange(desc(prop))
## # A tibble: 4 × 4 ## violent.exp.ISAF violent.exp.taliban n prop ## <int> <int> <int> <dbl> ## 1 0 0 1330 0.495 ## 2 1 1 526 0.196 ## 3 1 0 475 0.177 ## 4 0 1 354 0.132
filter() and is.na() can also be used to identify NAs in more than one variable; for example, we can find the share of missing answers to the ISAF or Taliban violence questions.
missing_prop <- harm_props %>%
  filter(is.na(violent.exp.ISAF) | is.na(violent.exp.taliban)) %>%
  summarize(missing_prop = sum(prop)) %>%
  pull()
missing_prop
## [1] 0.02505447
na.omit(), from base R, and drop_na(), from the tidyverse, exclude the observations with an NA in at least one variable. Note that these functions drop the whole row even if data are missing for only one variable.
nrow(afghan)
## [1] 2754
afghan.sub <- na.omit(afghan)
nrow(afghan.sub)
## [1] 2554
nrow(afghan)
## [1] 2754
afghan.sub2 <- drop_na(afghan)
nrow(afghan.sub2)
## [1] 2554
afghan %>% drop_na(income) %>% nrow()
## [1] 2600
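drop_na() also accepts more than one column, dropping a row only if it is missing on one of the listed variables; a quick sketch:
afghan %>%
  drop_na(violent.exp.ISAF, violent.exp.taliban) %>% # complete cases on these two variables only
  nrow()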
We could also use geom_col() to make bar charts; here we will use geom_bar() instead (geom_col() is a simplified version of geom_bar()).
ggplot(data = afghan, aes(x=as.factor(violent.exp.ISAF))) +
  geom_bar(aes(y= after_stat(prop), group = 1)) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
In the book, aes() inside geom_bar() is called as geom_bar(aes(y = ..prop.., group = 1)). The ..prop.. argument has been replaced by after_stat(prop) to stay compatible with more recent versions of ggplot2. The group = 1 argument is a "trick" so that the proportions are computed across the levels of violent.exp.ISAF. Finally, note that the authors converted violent.exp.ISAF to a factor. The prop inside after_stat() says to use proportions. To illustrate after_stat(), let's redo the plot using count instead.
ggplot(data = afghan, aes(x=as.factor(violent.exp.ISAF))) +
geom_bar(aes(y= after_stat(count))) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
ggplot(data = afghan, aes(x=as.factor(violent.exp.taliban))) +
geom_bar(aes(y= after_stat(prop), group = 1)) +
scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
ylab("Proportion of respondents") +
xlab("Response category") +
ggtitle("Civilian Victimization by the ISAF")
afghan %>%
pivot_longer(violent.exp.ISAF:violent.exp.taliban,
names_to ="harming group", values_to="harm") %>%
ggplot(aes(x=as.factor(harm))) +
geom_bar(aes(y=after_stat(prop), fill=harming_group, group=harming_group),
position = "dodge") +
scale_x_discrete(labels=c("No harm", "Harm", "Nonresponse")) +
scale_fill_discrete(name="Harming group", labels=c("ISAF", "Taliban")) +
labs(title = "Civilian victimization",
x = "Response category",
y = "Proportion of respondents")
To see what the data look like after pivot_longer(), before they are passed to ggplot():
afghan %>%
  pivot_longer(violent.exp.ISAF:violent.exp.taliban,
               names_to ="harming_group", values_to="harm")
afghan %>%
pivot_longer(violent.exp.ISAF:violent.exp.taliban,
names_to ="harming_group", values_to="harm") %>%
ggplot(aes(x=as.factor(harm))) +
geom_bar(aes(y=after_stat(prop), fill=harming_group, group=harming_group),
position = "dodge") +
scale_x_discrete(labels=c("No harm", "Harm", "Nonresponse")) +
scale_fill_brewer(name="Harming group", labels=c("ISAF", "Taliban"),
palette = "Blues") +
labs(title = "Civilian victimization",
x = "Response category",
y = "Proportion of respondents") +
theme_classic()
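Since geom_col() was mentioned earlier, here is a sketch of the same dodged chart built by computing the proportions first and mapping them directly with geom_col() (same variable names as above):
afghan %>%
  pivot_longer(violent.exp.ISAF:violent.exp.taliban,
               names_to = "harming_group", values_to = "harm") %>%
  count(harming_group, harm) %>%
  group_by(harming_group) %>%
  mutate(prop = n/sum(n)) %>%    # proportions within each harming group
  ggplot(aes(x = as.factor(harm), y = prop, fill = harming_group)) +
  geom_col(position = "dodge") + # bar heights taken as-is, no counting by ggplot2
  scale_x_discrete(labels = c("No harm", "Harm", "Nonresponse")) +
  labs(x = "Response category", y = "Proportion of respondents", fill = "Harming group")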
geom_histogram() from ggplot2 makes the histogram.
afghan %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y= after_stat(density)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribution of respondent's age",
y = "Density",
x = "Age") +
theme_classic()
afghan %>%
ggplot(aes(x = age)) +
geom_histogram(aes(y= after_stat(count)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribuição das idades",
y = "Número de observações",
x = "Idade") +
theme_classic()
geom_vline() adds vertical lines to a plot (for horizontal lines use geom_hline()) and annotate() adds annotations to a plot.
afghan %>%
ggplot(aes(x = educ.years, y= after_stat(density))) +
geom_histogram(binwidth=1, center=0) +
geom_vline(xintercept = median(afghan$educ.years)) +
annotate(geom="text", x = median(afghan$educ.years),
y=0.4, label="Median", hjust=-0.1)
labs(title = "Distribution of respondent's education",
y = "denssity",
x = "Years of education") +
theme_classic()
afghan %>% ggplot(aes(y=age)) + geom_boxplot() + labs(y="Age", x=NULL, title = "Distribution of age")
afghan %>% ggplot(aes(y=educ.years, x=province)) + geom_boxplot() + labs(y="Years of education", x = "Province", title = "Education by province")
afghan %>%
group_by(province) %>%
summarize(violence.exp.taliban = mean(violent.exp.taliban, na.rm=TRUE),
violence.exp.ISAF = mean(violent.exp.ISAF, na.rm=TRUE))
## # A tibble: 5 × 3 ## province violence.exp.taliban violence.exp.ISAF ## <chr> <dbl> <dbl> ## 1 Helmand 0.504 0.541 ## 2 Khost 0.233 0.242 ## 3 Kunar 0.303 0.399 ## 4 Logar 0.0802 0.144 ## 5 Uruzgan 0.455 0.496
To save a plot, use ggsave(); I leave it to you to look at this function's help page. To arrange plots side by side we can use grid.arrange() from the gridExtra package.
library(gridExtra)
age_hist <- afghan %>%
ggplot(aes(x=age)) +
geom_histogram(aes(y= after_stat(density)), binwidth=5, boundary=0) +
scale_x_continuous(breaks = seq(20,80,by=10)) +
labs(title = "Distribution of respondent's age",
y = "Density",
x = "Age") +
theme_classic()
educ_hist <- afghan %>%
ggplot(aes(x = educ.years, y= after_stat(density))) +
geom_histogram(binwidth=1, center=0) +
geom_vline(xintercept = median(afghan$educ.years)) +
annotate(geom="text", x = median(afghan$educ.years),
y=0.4, label="Median", hjust=-0.1) +
labs(title = "Distribution of respondent's education",
y = "denssity",
x = "Years of education") +
theme_classic()
grid.arrange(age_hist, educ_hist, ncol=2)
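To save any of these plots to disk, ggsave() (mentioned above) can be used; a minimal sketch with an illustrative file name:
ggsave("age_hist.png", plot = age_hist, width = 6, height = 4) # "age_hist.png" is just an example name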
afghan.village <- read.csv("afghan-village.csv")
head(afghan.village)
## altitude population village.surveyed ## 1 1959.08 197 1 ## 2 2425.88 744 0 ## 3 2236.60 179 1 ## 4 1691.76 225 0 ## 5 1928.04 379 0 ## 6 1194.56 617 0
afghan.village %>%
ggplot(aes(x=altitude)) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "Altitude (metros)",
y= "densidade")
afghan.village %>%
ggplot(aes(x=population)) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "População",
y= "densidade")
afghan.village %>%
ggplot(aes(x=log(population))) +
geom_histogram(aes(y=after_stat(density))) +
labs(x = "População (log)",
y= "densidade")
afghan.village %>%
ggplot(aes(x = as.factor(x=village.surveyed),
y = altitude)) +
geom_boxplot() +
scale_x_discrete(labels = c("Fora da amostra", "Na amostra")) +
labs(y = "Altitude (metros)", x=NULL) +
theme_classic()
afghan.village %>%
ggplot(aes(x = as.factor(x=village.surveyed),
y = log(population))) +
geom_boxplot() +
scale_x_discrete(labels = c("Fora da amostra", "Na amostra")) +
labs(y = "População (log)", x=NULL) +
theme_classic()
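The visual comparison can be complemented with numbers; a sketch of the mean altitude and mean log population for sampled and non-sampled villages:
afghan.village %>%
  group_by(village.surveyed) %>%
  summarize(mean_altitude = mean(altitude),
            mean_log_population = mean(log(population)))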
…
congress <- read.csv("congress.csv")
head(congress)
## congress district state party name dwnom1 dwnom2 ## 1 80 0 USA Democrat TRUMAN -0.276 0.016 ## 2 80 1 ALABAMA Democrat BOYKIN F. -0.026 0.796 ## 3 80 2 ALABAMA Democrat GRANT G. -0.042 0.999 ## 4 80 3 ALABAMA Democrat ANDREWS G. -0.008 1.005 ## 5 80 4 ALABAMA Democrat HOBBS S. -0.082 1.066 ## 6 80 5 ALABAMA Democrat RAINS A. -0.170 0.870
plot_80 <- congress %>%
filter(congress == 80) %>%
ggplot(aes(x=dwnom1, y=dwnom2)) +
geom_point(aes(shape=party, color = party), show.legend = FALSE) +
scale_color_manual(values = c(Democrat = "blue",
Republican = "red",
Other = "green")) +
scale_shape_manual(values = c(Democrat = "square",
Republican = "triangle",
Other = "circle")) +
annotate("text",x=-1, y=1, label="Democrats", color = "blue") +
annotate("text",x=0.5, y=-1, label="Republicans", color = "red") +
scale_y_continuous("Racial liberalism/conservatism", limits = c(-1.5, 1.5)) +
scale_x_continuous("Economic liberalism/conservatism", limits = c(-1.5, 1.5)) +
ggtitle("80th Congress") +
coord_fixed() +
theme_classic()
plot_112 <- congress %>%
filter(congress == 112) %>%
ggplot(aes(x=dwnom1, y=dwnom2)) +
geom_point(aes(shape=party, color = party), show.legend = FALSE) +
scale_color_manual(values = c(Democrat = "blue",
Republican = "red",
Other = "green")) +
scale_shape_manual(values = c(Democrat = "square",
Republican = "triangle",
Other = "circle")) +
annotate("text",x=-1, y=1, label="Democrats", color = "blue") +
annotate("text",x=0.5, y=-1, label="Republicans", color = "red") +
scale_y_continuous("Racial liberalism/conservatism", limits = c(-1.5, 1.5)) +
scale_x_continuous("Economic liberalism/conservatism", limits = c(-1.5, 1.5)) +
ggtitle("80th Congress") +
coord_fixed() +
theme_classic()
grid.arrange(plot_80, plot_112, ncol=2)
median_dw1 <- congress %>%
filter(party %in% c("Republican", "Democrat")) %>%
group_by(party, congress) %>%
summarize(median_dw1 = median(dwnom1)) %>%
ungroup()
## `summarise()` has grouped output by 'party'. You can override using the ## `.groups` argument.
median_dw1
## # A tibble: 66 × 3 ## party congress median_dw1 ## <chr> <int> <dbl> ## 1 Democrat 80 -0.126 ## 2 Democrat 81 -0.207 ## 3 Democrat 82 -0.179 ## 4 Democrat 83 -0.174 ## 5 Democrat 84 -0.223 ## 6 Democrat 85 -0.231 ## 7 Democrat 86 -0.259 ## 8 Democrat 87 -0.258 ## 9 Democrat 88 -0.285 ## 10 Democrat 89 -0.292 ## # ℹ 56 more rows
ggplot(data=median_dw1, aes(x=congress, y=median_dw1, color=party)) +
  geom_line() +
  labs(title = "DW-NOMINATE score (1st dimension)",
       x = "Congress",
       y = "Party median",
       color = "Party") +
  theme_classic()
…
\[\mbox{Gini coefficient}=\frac{\mbox{area A}}{\mbox{area A + area B}}\]
polarization <- median_dw1 %>%
pivot_wider(names_from = party,
values_from = median_dw1) %>%
mutate(polarization = Republican - Democrat)
polarization %>%
ggplot(aes(congress, polarization)) +
geom_point() +
labs(title = "Politican polarization",
x="Congress",
y="Pepublican median -\n Democratic median") +
theme_classic()
USGini <- read.csv("USGini.csv")
head(USGini)
## X year gini ## 1 1 1947 0.376 ## 2 2 1948 0.371 ## 3 3 1949 0.378 ## 4 4 1950 0.379 ## 5 5 1951 0.363 ## 6 6 1952 0.368
USGini %>%
ggplot(aes(x=year, y=gini)) +
geom_point() +
labs(title = "Gini coefficent",
x= "Year",
y = "Income inequality") +
theme_classic()
gini_2yr <- USGini %>%
  filter(row_number() %% 2 == 0) %>%
  select(gini) %>%
  pull()
pol_annual <- polarization %>%
  select(polarization) %>%
  pull()
cor(gini_2yr, pol_annual)
## [1] 0.9418128
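The filter(row_number() %% 2 == 0) trick works because the Gini series is annual while there is one polarization value per two-year Congress. One way to make the pairing explicit is to join on the Congress number, assuming each Congress n is matched to the even year 1788 + 2n (e.g., the 80th Congress to 1948):
USGini %>%
  mutate(congress = (year - 1788)/2) %>%        # assumed year-to-Congress mapping
  inner_join(polarization, by = "congress") %>% # keeps only the even years with a matching Congress
  summarize(correlation = cor(gini, polarization))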
facet_grid() from ggplot2 lets us build this kind of faceted plot. In the example we compare the distributions of the second dimension of the DW-NOMINATE score between Republicans and Democrats in the 112th Congress.
congress %>%
filter(congress == 112, party %in% c("Republican", "Democrat")) %>%
ggplot(aes(x=dwnom2, y=after_stat(density))) +
geom_histogram(binwidth = .2) +
facet_grid(party ~ .) +
labs(x="Racial liberalism/conservatism dimension",
y="Density") +
theme_classic()
…
face <- read.csv("face.csv")
face <- mutate(face,
d.share = d.votes/(d.votes + r.votes),
r.share = r.votes/(d.votes + r.votes),
diff.share = d.share - r.share)
face %>%
ggplot(aes(d.comp, diff.share, color=w.party)) +
geom_point() +
scale_colour_manual(values = c(D = "blue", R = "red")) +
labs(x="Competence score for Democrats",
y="Democrats margin in vote share",
title = "Facial competence and vote share") +
ylim(-1,1) +
xlim(0,1) +
theme(legend.position = "none")
cor(face$d.comp, face$diff.share)
## [1] 0.4327743
library(datasauRus)
data("datasaurus_dozen")
datasaurus_dozen %>%
filter(dataset %in% c("bullseye", "dino", "circle", "x_shape")) %>%
group_by(dataset) %>%
summarise(media_x = mean(x),
media_y = mean(y),
sd_x = sd(x),
sd_y = sd(y),
cor_xy = cor(x,y))
## # A tibble: 4 × 6 ## dataset media_x media_y sd_x sd_y cor_xy ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 bullseye 54.3 47.8 16.8 26.9 -0.0686 ## 2 circle 54.3 47.8 16.8 26.9 -0.0683 ## 3 dino 54.3 47.8 16.8 26.9 -0.0645 ## 4 x_shape 54.3 47.8 16.8 26.9 -0.0656
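The summary statistics are nearly identical across the four datasets, which is exactly the point: only a plot reveals how different they are. A quick sketch:
datasaurus_dozen %>%
  filter(dataset %in% c("bullseye", "dino", "circle", "x_shape")) %>%
  ggplot(aes(x, y)) +
  geom_point() +
  facet_wrap(~ dataset) + # one panel per dataset
  theme_classic()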
lm() in R fits a linear regression. Its main argument is a formula of the form Y ~ X; it is also common to use the data argument to indicate where the variables are. In R the ~ symbol denotes a formula.
fit <- lm(diff.share ~ d.comp, data = face)
fit
## 
## Call:
## lm(formula = diff.share ~ d.comp, data = face)
## 
## Coefficients:
## (Intercept)       d.comp  
##     -0.3122       0.6604  
lm(face$diff.share ~ face$d.comp)
## 
## Call:
## lm(formula = face$diff.share ~ face$d.comp)
## 
## Coefficients:
## (Intercept)  face$d.comp  
##     -0.3122       0.6604  
class(fit)
## [1] "lm"
names(fit)
## [1] "coefficients" "residuals" "effects" "rank" ## [5] "fitted.values" "assign" "qr" "df.residual" ## [9] "xlevels" "call" "terms" "model"
coef(fit)
## (Intercept) d.comp ## -0.3122259 0.6603815
fit$coef
## (Intercept) d.comp ## -0.3122259 0.6603815
head(fitted(fit))
## 1 2 3 4 5 6 ## 0.06060411 -0.08643340 0.09217061 0.04539236 0.13698690 -0.10057206
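Predictions for values of d.comp that are not in the data can be obtained with predict(); a small sketch for a hypothetical competence score of 0.5:
predict(fit, newdata = data.frame(d.comp = 0.5)) # predicted Democratic margin at d.comp = 0.5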
library(tidymodels)
glance(fit)
## # A tibble: 1 × 12 ## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0.187 0.180 0.266 27.0 0.000000885 1 -10.5 27.0 35.3 ## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
tidy() returns a data.frame in which each row contains the estimate and statistics for one coefficient.
tidy(fit)
## # A tibble: 2 × 5 ## term estimate std.error statistic p.value ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 (Intercept) -0.312 0.0660 -4.73 0.00000624 ## 2 d.comp 0.660 0.127 5.19 0.000000885
augment() returns the original data, the predicted (fitted) values, the residuals, and other statistics related to the observations.
augment(fit)
## # A tibble: 119 × 8 ## diff.share d.comp .fitted .resid .hat .sigma .cooksd .std.resid ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0.210 0.565 0.0606 0.150 0.00996 0.267 0.00160 0.564 ## 2 0.119 0.342 -0.0864 0.206 0.0129 0.267 0.00394 0.778 ## 3 0.0499 0.612 0.0922 -0.0423 0.0123 0.268 0.000158 -0.160 ## 4 0.197 0.542 0.0454 0.151 0.00922 0.267 0.00151 0.570 ## 5 0.496 0.680 0.137 0.359 0.0174 0.266 0.0163 1.36 ## 6 -0.350 0.321 -0.101 -0.249 0.0143 0.267 0.00644 -0.941 ## 7 0.697 0.404 -0.0456 0.743 0.00979 0.258 0.0388 2.80 ## 8 0.266 0.603 0.0860 0.180 0.0118 0.267 0.00276 0.681 ## 9 -0.372 0.539 0.0434 -0.415 0.00914 0.265 0.0113 -1.57 ## 10 0.0106 0.869 0.262 -0.251 0.0426 0.267 0.0206 -0.963 ## # ℹ 109 more rows
ggplot() +
geom_point(data=face, aes(d.comp, diff.share), shape=1) +
geom_abline(slope = coef(fit)["d.comp"],
intercept = coef(fit)["(Intercept)"]) +
scale_x_continuous("Competence score for Democrats",
breaks = seq(0,1,by=0.2), limits = c(0,1)) +
scale_y_continuous("Democratic margin in vote shares",
breaks = seq(-1,1,by=0.5), limits = c(-1,1)) +
geom_vline(xintercept=mean(face$d.comp), linetype="dashed") +
geom_hline(yintercept=mean(face$diff.share), linetype="dashed") +
ggtitle("Facial competence and vote share") +
theme_classic()
ggplot(data=face, aes(d.comp, diff.share)) +
geom_point(shape=1) +
geom_smooth(method="lm", se=FALSE, color="black") +
scale_x_continuous("Competence score for Democrats",
breaks = seq(0,1,by=0.2), limits = c(0,1)) +
scale_y_continuous("Democratic margin in vote shares",
breaks = seq(-1,1,by=0.5), limits = c(-1,1)) +
geom_vline(xintercept=mean(face$d.comp), linetype="dashed") +
geom_hline(yintercept=mean(face$diff.share), linetype="dashed") +
ggtitle("Facial competence and vote share") +
theme_classic()
epsilon.hat <- resid(fit)
sqrt(mean(epsilon.hat^2))
## [1] 0.2642361
mean(resid(fit))
## [1] -2.04732e-17
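The root mean squared residual computed above is close to, but not the same as, the residual standard error reported by summary(), which divides by the residual degrees of freedom instead of n; a quick comparison:
sqrt(mean(resid(fit)^2))                 # divides by n
sqrt(sum(resid(fit)^2)/df.residual(fit)) # divides by n - 2; this is summary(fit)$sigma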
florida <- read.csv("florida.csv")
fit2 <- lm(Buchanan00 ~ Perot96, data=florida)
fit2
## ## Call: ## lm(formula = Buchanan00 ~ Perot96, data = florida) ## ## Coefficients: ## (Intercept) Perot96 ## 1.34575 0.03592
TSS2 <- florida %>%
  mutate(diff_sq = (Buchanan00 - mean(florida$Buchanan00))^2) %>%
  summarize(TSS=sum(diff_sq))
SSR2 <- sum(resid(fit2)^2)
1 - SSR2/TSS2
## TSS ## 1 0.5130333
R2 <- function(fit){
resid <- resid(fit)
y <- fitted(fit) + resid
TSS <- sum((y-mean(y))^2)
SSR <- sum(resid^2)
R2 <- 1 - SSR/TSS
return(R2)
}
R2(fit2)
## [1] 0.5130333
summary(), when applied to an lm object, returns information about the regression; among that information is the \(R^2\).
summary(fit2)
## 
## Call:
## lm(formula = Buchanan00 ~ Perot96, data = florida)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -612.74  -65.96    1.94   32.88 2301.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.34575   49.75931   0.027    0.979    
## Perot96      0.03592    0.00434   8.275 9.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 316.4 on 65 degrees of freedom
## Multiple R-squared:  0.513,  Adjusted R-squared:  0.5055 
## F-statistic: 68.48 on 1 and 65 DF,  p-value: 9.474e-12
fit2summary <- summary(fit2)
fit2summary$r.squared
## [1] 0.5130333
The \(R^2\) can also be obtained with glance() from the broom package.
glance(fit2)
## # A tibble: 1 × 12 ## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 0.513 0.506 316. 68.5 9.47e-12 1 -480. 966. 972. ## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
augment_fit2 <- augment(fit2)
augment_fit2 %>%
  ggplot(aes(.fitted, .resid)) +
  geom_point(shape=1) +
  geom_hline(yintercept = 0) +
  labs(x="Fitted values", y="Residuals") +
  theme_classic()
add_predictions() and add_residuals() from the modelr package help identify the county.
library(modelr)
florida_fit2 <- florida %>%
  add_predictions(fit2) %>%
  add_residuals(fit2)
florida_fit2 %>%
  filter(resid == max(resid)) %>%
  select(county) %>%
  pull()
## [1] "PalmBeach"
…
florida.pb <- filter(florida, county != "PalmBeach")
fit3 <- lm(Buchanan00 ~ Perot96, data = florida.pb)
fit3
## ## Call: ## lm(formula = Buchanan00 ~ Perot96, data = florida.pb) ## ## Coefficients: ## (Intercept) Perot96 ## 45.84193 0.02435
R2(fit3)
## [1] 0.8511675
florida.pb %>%
add_residuals(fit3) %>%
add_predictions(fit3) %>%
ggplot(aes(pred, resid)) +
geom_hline(yintercept = 0) +
geom_point(shape=1) +
ylim(-750, 2500) +
xlim(0,1500) +
labs(title = "Residual plot without Plam Beach",
x= "Fitted values",
y="Residuals") +
theme_classic()
ggplot() +
geom_point(data = florida, aes(Perot96, Buchanan00), shape=1) +
geom_smooth(data = florida, aes(Perot96, Buchanan00), method="lm", se=FALSE) +
geom_smooth(data = florida.pb, aes(Perot96, Buchanan00), method="lm",
se=FALSE, linetype="dashed") +
annotate("text", x=30000, y=3200, label = "Palm Beach") +
annotate("text", x=30000, y=300, label = "Regression line without\n Palm Beach") +
annotate("text", x=25000, y=1400, label = "Regression line with\n Palm Beach") +
theme_classic()
women <- read.csv("women.csv")
head(women)
## GP village reserved female irrigation water ## 1 1 2 1 1 0 10 ## 2 1 1 1 1 5 0 ## 3 2 2 1 1 2 2 ## 4 2 1 1 1 4 31 ## 5 3 2 0 0 0 0 ## 6 3 1 0 0 0 0
women %>% group_by(reserved) %>% summarize(prop_female = mean(female))
## # A tibble: 2 × 2 ## reserved prop_female ## <int> <dbl> ## 1 0 0.0748 ## 2 1 1
women %>%
group_by(reserved) %>%
summarize(irrigation = mean(irrigation),
water = mean(water)) %>%
pivot_longer(names_to = "variables", -reserved) %>%
pivot_wider(names_from = reserved) %>%
rename("not_reserved" = `0`, "reserved" = `1`) %>%
mutate(diff = reserved - not_reserved)
## # A tibble: 2 × 4 ## variables not_reserved reserved diff ## <chr> <dbl> <dbl> <dbl> ## 1 irrigation 3.39 3.02 -0.369 ## 2 water 14.7 24.0 9.25
lm(water ~ reserved, data = women)
## ## Call: ## lm(formula = water ~ reserved, data = women) ## ## Coefficients: ## (Intercept) reserved ## 14.738 9.252
lm(irrigation ~ reserved, data = women)
## ## Call: ## lm(formula = irrigation ~ reserved, data = women) ## ## Coefficients: ## (Intercept) reserved ## 3.3879 -0.3693
To include more than one explanatory variable in lm(), the formula in the function call must list all the explanatory variables, for example: lm(y ~ x1 + x2 + x3). When an explanatory variable is categorical, lm() automatically creates a set of binary variables equal to one for the observations in the corresponding group and zero for the other observations.
social <- read.csv("social.csv")
class(social$messages)
## [1] "character"
head(social$messages)
## [1] "Civic Duty" "Civic Duty" "Hawthorne" "Hawthorne" "Hawthorne" ## [6] "Control"
unique(social$messages)
## [1] "Civic Duty" "Hawthorne" "Control" "Neighbors"
fit <- lm(primary2006 ~ messages, data=social)
fit
## ## Call: ## lm(formula = primary2006 ~ messages, data = social) ## ## Coefficients: ## (Intercept) messagesControl messagesHawthorne messagesNeighbors ## 0.314538 -0.017899 0.007837 0.063411
social <- social %>%
mutate(Control = if_else(messages == "Control", 1, 0),
Hawthorne = if_else(messages == "Hawthorne", 1, 0),
Neighbors = if_else(messages == "Neighbors", 1, 0))
social %>% select(Control, Hawthorne, Neighbors) %>% head()
## Control Hawthorne Neighbors ## 1 0 0 0 ## 2 0 0 0 ## 3 0 1 0 ## 4 0 1 0 ## 5 0 1 0 ## 6 1 0 0
lm(primary2006 ~ Control + Hawthorne + Neighbors, data = social)
## ## Call: ## lm(formula = primary2006 ~ Control + Hawthorne + Neighbors, data = social) ## ## Coefficients: ## (Intercept) Control Hawthorne Neighbors ## 0.314538 -0.017899 0.007837 0.063411
We can compute predicted values with add_predictions() from the modelr package. To do so, we will use data_grid(), also from modelr, to create a data.frame with each unique value of the messages variable. data_grid() creates a data.frame with the unique combinations of the specified columns.
data_grid(social, messages)
## # A tibble: 4 × 1 ## messages ## <chr> ## 1 Civic Duty ## 2 Control ## 3 Hawthorne ## 4 Neighbors
unique_messages <- data_grid(social, messages) %>%
  add_predictions(fit)
unique_messages
## # A tibble: 4 × 2 ## messages pred ## <chr> <dbl> ## 1 Civic Duty 0.315 ## 2 Control 0.297 ## 3 Hawthorne 0.322 ## 4 Neighbors 0.378
social %>% group_by(messages) %>% summarize(mean(primary2006))
## # A tibble: 4 × 2 ## messages `mean(primary2006)` ## <chr> <dbl> ## 1 Civic Duty 0.315 ## 2 Control 0.297 ## 3 Hawthorne 0.322 ## 4 Neighbors 0.378
fit.noint <- lm(primary2006 ~ -1 + messages, data=social)
fit.noint
## ## Call: ## lm(formula = primary2006 ~ -1 + messages, data = social) ## ## Coefficients: ## messagesCivic Duty messagesControl messagesHawthorne messagesNeighbors ## 0.3145 0.2966 0.3224 0.3779
We can use relevel() to change the reference level of the factor.
social.f <- social %>%
mutate(messages.f = as.factor(messages),
mes.C = relevel(messages.f, ref = "Control"),
mes.H = relevel(messages.f, ref = "Hawthorne"),
mes.N = relevel(messages.f, ref = "Neighbors"))
lm(primary2006 ~ mes.C, data = social.f)
## ## Call: ## lm(formula = primary2006 ~ mes.C, data = social.f) ## ## Coefficients: ## (Intercept) mes.CCivic Duty mes.CHawthorne mes.CNeighbors ## 0.29664 0.01790 0.02574 0.08131
lm(primary2006 ~ mes.H, data = social.f)
## ## Call: ## lm(formula = primary2006 ~ mes.H, data = social.f) ## ## Coefficients: ## (Intercept) mes.HCivic Duty mes.HControl mes.HNeighbors ## 0.322375 -0.007837 -0.025736 0.055574
lm(primary2006 ~ mes.N, data = social.f)
## ## Call: ## lm(formula = primary2006 ~ mes.N, data = social.f) ## ## Coefficients: ## (Intercept) mes.NCivic Duty mes.NControl mes.NHawthorne ## 0.37795 -0.06341 -0.08131 -0.05557
Another alternative is to use group_by() and compute the difference relative to the group of interest.
social %>%
group_by(messages) %>%
summarize(primary2006 = mean(primary2006)) %>%
mutate(Control = primary2006[messages == "Control"],
diff = primary2006 - Control)
## # A tibble: 4 × 4 ## messages primary2006 Control diff ## <chr> <dbl> <dbl> <dbl> ## 1 Civic Duty 0.315 0.297 0.0179 ## 2 Control 0.297 0.297 0 ## 3 Hawthorne 0.322 0.297 0.0257 ## 4 Neighbors 0.378 0.297 0.0813
adjR2 <- function(fit) {
resid <- resid(fit)
y <- fitted(fit) + resid
n <- length(y)
TSS.adj <- sum((y - mean(y))^2)/(n - 1)
SSR.adj <- sum(resid^2)/(n - length(coef(fit)))
R2.adj <- 1 - SSR.adj/TSS.adj
return(R2.adj)
}
adjR2(fit)
## [1] 0.003272788
R2(fit)
## [1] 0.003282564
summary(fit)$adj.r.squared
## [1] 0.003272788
ate <- social %>% group_by(primary2004, messages) %>% summarize(primary2006 = mean(primary2006)) %>% pivot_wider(names_from = messages, values_from = primary2006) %>% mutate(ate_Neighbors = Neighbors - Control) %>% select(primary2004, Neighbors, Control, ate_Neighbors) %>% ungroup()
ate
## # A tibble: 2 × 4 ## primary2004 Neighbors Control ate_Neighbors ## <int> <dbl> <dbl> <dbl> ## 1 0 0.306 0.237 0.0693 ## 2 1 0.482 0.386 0.0965
ate.voter <- filter(ate, primary2004 == 1) %>%
  select(ate_Neighbors) %>%
  pull()
ate.nonvoter <- filter(ate, primary2004 == 0) %>%
  select(ate_Neighbors) %>%
  pull()
ate.voter - ate.nonvoter
## [1] 0.02722908
social.neighbor <- social %>%
filter(messages == "Control" | messages == "Neighbors")
fit.int <- lm(primary2006 ~ primary2004 + messages + primary2004:messages,
data = social.neighbor)
fit.int
## ## Call: ## lm(formula = primary2006 ~ primary2004 + messages + primary2004:messages, ## data = social.neighbor) ## ## Coefficients: ## (Intercept) primary2004 ## 0.23711 0.14870 ## messagesNeighbors primary2004:messagesNeighbors ## 0.06930 0.02723
lm(primary2006 ~ primary2004*messages, data = social.neighbor)
## ## Call: ## lm(formula = primary2006 ~ primary2004 * messages, data = social.neighbor) ## ## Coefficients: ## (Intercept) primary2004 ## 0.23711 0.14870 ## messagesNeighbors primary2004:messagesNeighbors ## 0.06930 0.02723
social.neighbor <- social.neighbor %>%
  mutate(age = 2008 - yearofbirth)
summary(social.neighbor$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 22.00 43.00 52.00 51.82 61.00 108.00
fit.age <- lm(primary2006 ~ age*messages, data = social.neighbor)
fit.age
## ## Call: ## lm(formula = primary2006 ~ age * messages, data = social.neighbor) ## ## Coefficients: ## (Intercept) age messagesNeighbors ## 0.0894768 0.0039982 0.0485728 ## age:messagesNeighbors ## 0.0006283
Below, crossing() from the tidyr package is used to create the combinations of age and treatment received.
crossing(age = seq(from=20, to=80, by=20), messages = c("Neighbors", "Control"))
## # A tibble: 8 × 2 ## age messages ## <dbl> <chr> ## 1 20 Control ## 2 20 Neighbors ## 3 40 Control ## 4 40 Neighbors ## 5 60 Control ## 6 60 Neighbors ## 7 80 Control ## 8 80 Neighbors
ate.age <- tidyr::crossing(age = seq(from=20, to=80, by=20),
messages = c("Neighbors", "Control")) %>%
add_predictions(fit.age) %>%
pivot_wider(names_from = messages, values_from = pred) %>%
mutate(diff = Neighbors - Control)
ate.age
## # A tibble: 4 × 4 ## age Control Neighbors diff ## <dbl> <dbl> <dbl> <dbl> ## 1 20 0.169 0.231 0.0611 ## 2 40 0.249 0.323 0.0737 ## 3 60 0.329 0.416 0.0863 ## 4 80 0.409 0.508 0.0988
To include the quadratic term in the formula we use I().
fit.age2 <- lm(primary2006 ~ age + I(age^2) + messages + age:messages + I(age^2):messages,
data = social.neighbor)
fit.age2
## ## Call: ## lm(formula = primary2006 ~ age + I(age^2) + messages + age:messages + ## I(age^2):messages, data = social.neighbor) ## ## Coefficients: ## (Intercept) age ## -9.700e-02 1.172e-02 ## I(age^2) messagesNeighbors ## -7.389e-05 -5.275e-02 ## age:messagesNeighbors I(age^2):messagesNeighbors ## 4.804e-03 -3.961e-05
We use data_grid() from the modelr package to create all possible combinations of age and treatment; these combinations are the scenarios for which we will make predictions.
social.neighbor %>% data_grid(age, messages)
## # A tibble: 172 × 2 ## age messages ## <dbl> <chr> ## 1 22 Control ## 2 22 Neighbors ## 3 23 Control ## 4 23 Neighbors ## 5 24 Control ## 6 24 Neighbors ## 7 25 Control ## 8 25 Neighbors ## 9 26 Control ## 10 26 Neighbors ## # ℹ 162 more rows
y.hat <- social.neighbor %>%
  data_grid(age, messages) %>%
  add_predictions(fit.age2)
head(y.hat)
## # A tibble: 6 × 3 ## age messages pred ## <dbl> <chr> <dbl> ## 1 22 Control 0.125 ## 2 22 Neighbors 0.159 ## 3 23 Control 0.134 ## 4 23 Neighbors 0.170 ## 5 24 Control 0.142 ## 6 24 Neighbors 0.182
ggplot(y.hat, aes(age, pred)) +
geom_line(aes(linetype = messages, color = messages)) +
labs(color = "", linetype="",
y = "Predicited turnout rate", x="Age") +
xlim(20,90) +
theme_classic()
social.neighbor.ate <- y.hat %>%
  pivot_wider(names_from = messages, values_from = pred) %>%
  mutate(ate = Neighbors - Control)
head(social.neighbor.ate)
## # A tibble: 6 × 4 ## age Control Neighbors ate ## <dbl> <dbl> <dbl> <dbl> ## 1 22 0.125 0.159 0.0338 ## 2 23 0.134 0.170 0.0368 ## 3 24 0.142 0.182 0.0397 ## 4 25 0.150 0.192 0.0426 ## 5 26 0.158 0.203 0.0454 ## 6 27 0.166 0.214 0.0481
social.neighbor.ate %>%
ggplot(aes(age, ate)) +
geom_line() +
labs(y="Estimated average treatment effect",
x = "Age") +
xlim(20,90) +
ylim(0,0.1) +
theme_classic()