Data analysis with multivariate linear regression on data about the 2006 and 2010 elections in Brazil for the “Câmara Federal de Deputados”. The data were taken from the TSE portal and cover approximately 7,500 candidates.
eleicoes_data <- readr::read_csv(
here::here('data/eleicoes_2006_e_2010.csv'),
progress = FALSE,
locale = readr::locale("br"),
col_types = cols(
ano = col_integer(),
sequencial_candidato = col_character(),
quantidade_doacoes = col_integer(),
quantidade_doadores = col_integer(),
total_receita = col_double(),
media_receita = col_double(),
recursos_de_outros_candidatos.comites = col_double(),
recursos_de_pessoas_fisicas = col_double(),
recursos_de_pessoas_juridicas = col_double(),
recursos_proprios = col_double(),
recursos_de_partido_politico = col_double(),
quantidade_despesas = col_integer(),
quantidade_fornecedores = col_integer(),
total_despesa = col_double(),
media_despesa = col_double(),
votos = col_integer(),
.default = col_character()))
# Let's put everything in upper case for uniformity
eleicoes_data %>%
mutate(nome = toupper(nome),
sexo = toupper(sexo),
grau = toupper(grau),
cargo = toupper(cargo),
ocupacao = toupper(ocupacao),
partido = toupper(partido),
estado_civil = toupper(estado_civil),
sequencial_candidato = as.numeric(sequencial_candidato)) -> eleicoes_data
# Adding surrogate key to dataframe
eleicoes_data$id <- 1:nrow(eleicoes_data)
eleicoes_data %>%
glimpse()
## Observations: 7,476
## Variables: 25
## $ ano <int> 2006, 2006, 2006, 2006, ...
## $ sequencial_candidato <dbl> 10001, 10002, 10002, 100...
## $ nome <chr> "JOSÉ LUIZ NOGUEIRA DE S...
## $ uf <chr> "AP", "RO", "AP", "MS", ...
## $ partido <chr> "PT", "PT", "PT", "PRONA...
## $ quantidade_doacoes <int> 6, 13, 17, 6, 48, 6, 14,...
## $ quantidade_doadores <int> 6, 13, 16, 6, 48, 6, 7, ...
## $ total_receita <dbl> 16600.00, 22826.00, 1581...
## $ media_receita <dbl> 2766.67, 1755.85, 9301.2...
## $ recursos_de_outros_candidatos.comites <dbl> 0.00, 6625.00, 2250.00, ...
## $ recursos_de_pessoas_fisicas <dbl> 9000.00, 15000.00, 34150...
## $ recursos_de_pessoas_juridicas <dbl> 6300.00, 1000.00, 62220....
## $ recursos_proprios <dbl> 1300.00, 201.00, 59500.0...
## $ recursos_de_partido_politico <dbl> 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quantidade_despesas <int> 14, 24, 123, 8, 133, 9, ...
## $ quantidade_fornecedores <int> 14, 23, 108, 8, 120, 9, ...
## $ total_despesa <dbl> 16583.60, 20325.99, 1460...
## $ media_despesa <dbl> 1184.54, 846.92, 1187.09...
## $ cargo <chr> "DEPUTADO FEDERAL", "DEP...
## $ sexo <chr> "MASCULINO", "FEMININO",...
## $ grau <chr> "ENSINO MÉDIO COMPLETO",...
## $ estado_civil <chr> "CASADO(A)", "SOLTEIRO(A...
## $ ocupacao <chr> "VEREADOR", "SERVIDOR PÚ...
## $ votos <int> 8579, 2757, 17428, 1193,...
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, ...
The response variable is the variable we are interested in measuring and drawing conclusions about.
A predictor variable is a variable used to predict another variable.
Our response variable will be "votos"; we want to study how well the predictor variables help predict its behavior and how they impact the linear regression.
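As a minimal illustration (a sketch, not a model used later in this analysis), a single-predictor regression already exhibits this response/predictor structure:
# Sketch: votos as the response, total_receita as the lone predictor.
baseline <- lm(votos ~ total_receita, data = eleicoes_data)
summary(baseline)$r.squared # fraction of the variability in votos explained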
eleicoes_data %>%
filter(ano == 2006) %>%
group_by(partido) %>%
summarize(n = sum(votos)) %>%
ggplot(aes(reorder(partido,n), n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1)) +
labs(x="Political Party",
title="2006 elections",
y="Number of votes") -> p1
eleicoes_data %>%
filter(ano == 2010) %>%
group_by(partido) %>%
summarize(n = sum(votos)) %>%
ggplot(aes(reorder(partido,n), n)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90,
hjust = 1)) +
labs(x="Political Party",
title="2010 elections",
y="Number of votes") -> p2
gridExtra::grid.arrange(p1, p2, ncol = 1)
eleicoes_data %>%
ggplot(aes(total_receita)) +
geom_histogram(bins = 30) +
labs(x="Total Revenue",
y="Absolute Frequency") +
facet_grid(. ~ ano) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
eleicoes_data %>%
ggplot(aes(media_receita)) +
geom_histogram(bins = 30) +
labs(x="Mean Revenue",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(total_despesa)) +
geom_histogram(bins = 30) +
labs(x="Total Expenditure",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(media_despesa)) +
geom_histogram(bins = 30) +
labs(x="Mean Expenditure",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
ggplot(aes(recursos_proprios)) +
geom_histogram(bins = 30) +
labs(x="Own resources",
y="Absolute Frequency") +
facet_grid(. ~ ano)
eleicoes_data %>%
mutate(ano = as.factor(ano)) %>%
group_by(estado_civil, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(estado_civil,n), n,
fill= ano)) +
geom_bar(stat = "identity",
position = position_dodge(width = 0.5)) +
labs(x="Marital status of candidate",
y="Absolute Frequency") +
guides(fill = guide_legend(title = "year")) +
coord_flip()
Overall, 2010 tops 2006 in the number of candidates of each and every marital status, with one exception visible in the plot above.
eleicoes_data %>%
mutate(ano = as.factor(ano)) %>%
group_by(grau, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(grau,n), n,
fill= ano)) +
geom_bar(stat = "identity",
position = position_dodge(width = 0.5)) +
labs(x="Education level",
y="Absolute Frequency") +
guides(fill = guide_legend(title = "year")) +
coord_flip()
Overall, 2010 tops 2006 in the number of candidates of each and every education level, with one exception visible in the plot above.
eleicoes_data %>%
group_by(sexo, ano) %>%
summarize(n = n()) %>%
ggplot(aes(reorder(sexo,n), n)) +
geom_bar(stat = "identity") +
labs(x="Sex",
y="Absolute Frequency") +
facet_grid(. ~ano)
eleicoes_data %>%
select(id,
quantidade_despesas,
quantidade_fornecedores,
recursos_de_partido_politico,
recursos_de_pessoas_juridicas,
recursos_de_pessoas_fisicas,
recursos_de_outros_candidatos.comites) %>%
melt(id=c("id")) %>% # melt() comes from the reshape2 package
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
ncol = 2,
scales = "free_x") +
labs(x="Predictor",y="Absolute Frequency")
eleicoes_data %>%
select(id,
total_receita,
media_receita,
total_despesa,
media_despesa,
recursos_proprios,
quantidade_doacoes,
quantidade_doadores) %>%
melt(id=c("id")) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
scales = "free_x") +
labs(x="Predictor",y="Absolute Frequency")
The standard method to deal with a positive skew is to apply a logarithmic transformation to the affected predictor. However, the logarithm is undefined at 0, so the transformation can only be applied to predictors that contain no zeros.
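A common alternative when zeros are present is log1p(x), i.e. log(1 + x), which maps 0 to 0; below, however, this analysis applies a square-root transformation to the zero-containing monetary predictors instead. A minimal sketch of the difference:
# Sketch: log10 breaks at zero, log1p does not.
x <- c(0, 10, 1000)
log10(x) # -Inf 1.000000 3.000000
log1p(x) # 0.000000 2.397895 6.908755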
eleicoes_data %>%
select(quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
recursos_de_partido_politico,
quantidade_despesas,
quantidade_fornecedores,
total_despesa,
media_despesa) %>%
sapply(., function(x) 0 %in% x) %>%
as.data.frame(row.names = NULL) %>%
tibble::rownames_to_column() %>%
set_colnames(c("predictor","contains_zero")) %>%
arrange(contains_zero)
# apply a log transformation to the zero-free count predictors and a
# square-root transformation to the zero-containing monetary predictors
eleicoes_data %>%
mutate(log.quantidade_doacoes = log10(quantidade_doacoes),
log.quantidade_doadores = log10(quantidade_doadores),
log.quantidade_despesas = log10(quantidade_despesas),
log.quantidade_fornecedores = log10(quantidade_fornecedores),
sqrt.total_receita = sqrt(total_receita),
sqrt.media_receita = sqrt(media_receita),
sqrt.total_despesa = sqrt(total_despesa),
sqrt.media_despesa = sqrt(media_despesa),
sqrt.recursos_proprios = sqrt(recursos_proprios),
sqrt.recursos_de_pessoas_juridicas = sqrt(recursos_de_pessoas_juridicas),
sqrt.recursos_de_partido_politico = sqrt(recursos_de_partido_politico),
sqrt.recursos_de_outros_candidatos.comites = sqrt(recursos_de_outros_candidatos.comites),
sqrt.recursos_de_pessoas_fisicas = sqrt(recursos_de_pessoas_fisicas)) -> eleicoes_data
# put all quantitative predictors (of interest) on the same scale
eleicoes_data %>%
mutate_at(.vars = vars(quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
sqrt.total_receita,
sqrt.media_receita,
sqrt.recursos_de_outros_candidatos.comites,
sqrt.recursos_de_pessoas_fisicas,
sqrt.recursos_de_pessoas_juridicas,
sqrt.recursos_proprios,
sqrt.recursos_de_partido_politico,
sqrt.total_despesa,
sqrt.media_despesa,
log.quantidade_doacoes,
log.quantidade_doadores,
log.quantidade_despesas,
log.quantidade_fornecedores,
sequencial_candidato,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
recursos_de_partido_politico,
quantidade_despesas,
quantidade_fornecedores,
total_despesa,
media_despesa),
.funs = ~ as.numeric(scale(.))) -> scaled_data # funs() is deprecated; a lambda does the same
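For reference, scale() converts each column to z-scores; a minimal sketch of what it computes on a single numeric vector:
# Sketch: scale() subtracts the mean and divides by the standard deviation.
x <- c(2, 4, 6, 8)
manual <- (x - mean(x)) / sd(x)
all.equal(manual, as.numeric(scale(x))) # TRUE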
eleicoes_data %>%
select(id,
log.quantidade_doacoes,
log.quantidade_doadores,
log.quantidade_despesas,
log.quantidade_fornecedores) %>%
melt(id=c("id")) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
scales = "free_x") +
labs(x="Predictor",y="Absolute Frequency")
eleicoes_data %>%
select(id,
sqrt.total_receita,
sqrt.media_receita,
sqrt.recursos_de_outros_candidatos.comites,
sqrt.recursos_de_pessoas_fisicas,
sqrt.recursos_de_pessoas_juridicas,
sqrt.recursos_proprios,
sqrt.recursos_de_partido_politico,
sqrt.total_despesa,
sqrt.media_despesa) %>%
melt(id=c("id")) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30) +
facet_wrap(. ~ variable,
scales = "free_x") +
labs(x="Predictor",y="Absolute Frequency")
eleicoes_data %>%
filter(ano == 2006) %>%
select(-partido,
-uf,-nome,-id,
-estado_civil,
-ocupacao,-ano,
-total_receita,
-media_receita,
-total_despesa,
-media_despesa,
-recursos_proprios,
-cargo,-grau,-sexo,
-quantidade_doacoes,
-quantidade_doadores,
-quantidade_despesas,
-quantidade_fornecedores,
-recursos_de_pessoas_fisicas,
-recursos_de_partido_politico,
-recursos_de_pessoas_juridicas,
-recursos_de_outros_candidatos.comites) %>%
na.omit() %>%
ggcorr(palette = "RdBu", # ggcorr() comes from the GGally package
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for 2006 elections")
eleicoes_data %>%
filter(ano == 2010) %>%
select(-partido,
-uf,-nome,-id,
-estado_civil,
-ocupacao,-ano,
-total_receita,
-media_receita,
-total_despesa,
-media_despesa,
-recursos_proprios,
-cargo,-grau,-sexo,
-quantidade_doacoes,
-quantidade_doadores,
-quantidade_despesas,
-quantidade_fornecedores,
-recursos_de_pessoas_fisicas,
-recursos_de_partido_politico,
-recursos_de_pessoas_juridicas,
-recursos_de_outros_candidatos.comites) %>%
na.omit() %>%
ggcorr(palette = "RdBu",
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for 2010 elections")
eleicoes_data %>%
select(-partido,
-uf,-nome,-id,
-estado_civil,
-ocupacao,-ano,
-total_receita,
-media_receita,
-total_despesa,
-media_despesa,
-recursos_proprios,
-cargo,-grau,-sexo,
-quantidade_doacoes,
-quantidade_doadores,
-quantidade_despesas,
-quantidade_fornecedores,
-recursos_de_pessoas_fisicas,
-recursos_de_partido_politico,
-recursos_de_pessoas_juridicas,
-recursos_de_outros_candidatos.comites) %>%
na.omit() %>%
ggcorr(palette = "RdBu",
color = "grey50",
label = TRUE, hjust = 1,
label_size = 3, size = 4,
nbreaks = 5, layout.exp = 7) +
ggtitle("Correlation plot for both elections")
For the aforementioned reasons (several predictors are strongly correlated with one another), a multivariate linear regression model built with all variables isn't plausible: the collinearity would inflate coefficient variances and make the individual estimates unstable.
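One way to quantify this collinearity is the variance inflation factor; a minimal sketch, assuming the car package is available (it is not used elsewhere in this analysis):
# Sketch: VIFs for a small candidate model; values above roughly 5-10
# are a common rule of thumb for problematic collinearity.
full_mod <- lm(votos ~ sqrt.total_receita + sqrt.total_despesa +
log.quantidade_fornecedores + log.quantidade_doacoes,
data = scaled_data)
car::vif(full_mod)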
scaled_data %>%
filter(ano == 2006) -> scaled_data_2006
scaled_data_2006 %>%
sample_n(5)
set.seed(11) # set the seed for reproducibility
scaled_data_2006 %>%
dplyr::sample_frac(.5) -> train_data_2006
encoding <- build_encoding(dataSet = train_data_2006, # from the dataPreparation package
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data_2006 <- one_hot_encoder(dataSet = train_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data_2006),
"\n##### Variables: ",ncol(train_data_2006))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(scaled_data_2006,
train_data_2006,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data_2006
test_data_2006 <- one_hot_encoder(dataSet = test_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data_2006),
"\n##### Variables: ",ncol(test_data_2006))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(intermediate_data,
test_data_2006,
by = 'id') -> validate_data_2006
validate_data_2006 <- one_hot_encoder(dataSet = validate_data_2006,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data_2006),
"\n##### Variables: ",ncol(validate_data_2006))
mod_2006 <- lm(votos ~ log.quantidade_fornecedores * partido.PSDB + sqrt.media_despesa * partido.PMDB +
sqrt.total_receita * sqrt.total_despesa + uf.SP * estado.civil.CASADO.A. +
uf.RJ * sqrt.total_despesa + sqrt.total_receita * `grau.SUPERIOR COMPLETO`,
data = train_data_2006)
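A note on the formula syntax: in R, a * b expands to a + b + a:b, that is, both main effects plus their interaction, so each * above contributes three terms. A minimal sketch of the equivalence on the same training data:
# Sketch: a * b is shorthand for a + b + a:b.
m1 <- lm(votos ~ sqrt.total_receita * sqrt.total_despesa, data = train_data_2006)
m2 <- lm(votos ~ sqrt.total_receita + sqrt.total_despesa +
sqrt.total_receita:sqrt.total_despesa, data = train_data_2006)
all.equal(coef(m1), coef(m2)) # TRUE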
broom::glance(mod_2006)
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95)
scaled_data %>%
filter(ano == 2010) -> scaled_data_2010
scaled_data_2010 %>%
sample_n(5)
set.seed(11) # set the seed for reproducibility
scaled_data_2010 %>%
dplyr::sample_frac(.5) -> train_data_2010
encoding <- build_encoding(dataSet = train_data_2010,
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data_2010 <- one_hot_encoder(dataSet = train_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data_2010),
"\n##### Variables: ",ncol(train_data_2010))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(scaled_data_2010,
train_data_2010,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data_2010
test_data_2010 <- one_hot_encoder(dataSet = test_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data_2010),
"\n##### Variables: ",ncol(test_data_2010))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(intermediate_data,
test_data_2010,
by = 'id') -> validate_data_2010
validate_data_2010 <- one_hot_encoder(dataSet = validate_data_2010,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data_2010),
"\n##### Variables: ",ncol(validate_data_2010))
mod_2010 <- lm(votos ~ log.quantidade_fornecedores * partido.PSDB + sqrt.media_despesa * partido.PMDB +
sqrt.total_receita * sqrt.total_despesa + uf.SP * estado.civil.CASADO.A. +
uf.RJ * sqrt.total_despesa + sqrt.total_receita * `grau.SUPERIOR COMPLETO`,
data = train_data_2010)
broom::glance(mod_2010)
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95)
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
arrange(desc(p.value)) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Predictors of of biggest p.value (2006 elections)")
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
arrange(desc(p.value)) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Predictors of biggest p.value (2010 elections)")
In both models we can see that the predictors related to partido perform poorly. We can also see that categorical variables appear prominently among the underperforming predictors.
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
arrange(p.value) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_hline(yintercept = 0.05, colour = "darkred") +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Predictors of smallest p.value (2006 elections)")
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
arrange(p.value) %>%
slice(1:3) %>%
ggplot(aes(reorder(term,p.value), p.value)) +
geom_hline(yintercept = 0.05, colour = "darkred") +
geom_point(size = 2) +
labs(x = "Predictor variable",
y = "Estimated p-value",
title="Predictors of smallest p.value (2010 elections)")
In both models, different combinations of the predictors sqrt.total_receita and sqrt.total_despesa were clearly the best predictors (those that explain the votes the most).
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
ggplot(aes(reorder(term, estimate),
estimate )) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x="Coefficient",y="Predictor")
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95,
sep=":") %>%
filter(term != "(Intercept)") %>%
ggplot(aes(reorder(term, estimate),
estimate )) +
geom_bar(stat = "identity") +
coord_flip() +
labs(x="Coefficient",y="Predictor")
broom::tidy(mod_2006,
conf.int = TRUE,
conf.level = 0.95) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(reorder(term, estimate),
estimate, ymin = conf.low,
ymax = conf.high)) +
geom_errorbar(size = 0.8, width= 0.4) +
geom_point(color = "red", size = 2) +
geom_hline(yintercept = 0, colour = "darkred") +
labs(x = "Predictor",
y = "Estimated coefficient (95% of confidence)") +
coord_flip()
broom::tidy(mod_2010,
conf.int = TRUE,
conf.level = 0.95) %>%
filter(term != "(Intercept)") %>%
ggplot(aes(reorder(term, estimate),
estimate, ymin = conf.low,
ymax = conf.high)) +
geom_errorbar(size = 0.8, width= 0.4) +
geom_point(color = "red", size = 2) +
geom_hline(yintercept = 0, colour = "darkred") +
labs(x = "Predictor",
y = "Estimated coefficient (95% of confidence)") +
coord_flip()
mod_2006 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2006 elections)")
mod_2010 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2010 elections)")
mod_2006 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2006 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2010 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2010 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2006 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2006 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2010 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2010 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2006 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2006 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2010 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2010 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2006 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2006 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
mod_2010 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2010 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
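These ggplot2 panels mirror the standard diagnostics that base R produces directly from an lm object; for comparison, a minimal sketch (shown for mod_2006, and the same call works for mod_2010):
# Sketch: base R equivalents of the six diagnostic panels above.
par(mfrow = c(2, 3))
plot(mod_2006, which = 1:6)
par(mfrow = c(1, 1))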
predictions <- mod_2006 %>% predict(validate_data_2006)
data.frame( R2 = caret::R2(predictions, validate_data_2006$votos),
RMSE = caret::RMSE(predictions, validate_data_2006$votos),
MAE = caret::MAE(predictions, validate_data_2006$votos),
ERR = caret::RMSE(predictions, validate_data_2006$votos)/
mean(validate_data_2006$votos))
Now let’s talk about the results on the validation data for the 2006 elections (more meaningful than the training fit).
We got a decent R² (and adjusted R²) of approximately 0.56 (notice the decrease). This means the model explains approximately 56% of the variability of the response variable.
The prediction error rate (ERR) was 1.321718.
predictions <- mod_2010 %>% predict(validate_data_2010)
data.frame( R2 = caret::R2(predictions, validate_data_2010$votos),
RMSE = caret::RMSE(predictions, validate_data_2010$votos),
MAE = caret::MAE(predictions, validate_data_2010$votos),
ERR = caret::RMSE(predictions, validate_data_2010$votos)/
mean(validate_data_2010$votos))
Now let’s talk about the results on the validation data for the 2010 elections (more meaningful than the training fit).
The average difference between the observed outcome values and the values predicted by the model (RMSE) was approximately 27,600; in other words, the model misses the mark by about 27,600 votes on average, predicting more (or fewer) votes than were observed.
The average absolute difference between observed and predicted outcomes (MAE) was approximately 13,981.
The prediction error rate (ERR) was 1.275393.
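For reference, these metrics are straightforward to compute by hand; a minimal sketch of what the caret calls above actually do (obs and pred are just local names):
# Sketch: the caret metrics written out explicitly.
obs <- validate_data_2010$votos
pred <- predictions
sqrt(mean((obs - pred)^2)) # RMSE: root mean squared error
mean(abs(obs - pred)) # MAE: mean absolute error
sqrt(mean((obs - pred)^2)) / mean(obs) # ERR: RMSE relative to the mean vote count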
These are signs of a decent, though not brilliant, fit of the model for the 2006 elections. There is no clear evidence of overfitting, as the train, validation and test sets rendered consistent results.
predictions <- mod_2006 %>% predict(test_data_2006)
data.frame( R2 = caret::R2(predictions, test_data_2006$votos),
RMSE = caret::RMSE(predictions, test_data_2006$votos),
MAE = caret::MAE(predictions, test_data_2006$votos),
ERR = caret::RMSE(predictions, test_data_2006$votos)/
mean(test_data_2006$votos))
Now let’s talk about the results on the test data for the 2006 elections (most meaningful).
We got a decent R² (and adjusted R²) of approximately 0.58 (notice the decrease). This means the model explains approximately 58% of the variability of the response variable (smaller than for the previous model).
The prediction error rate (ERR) was 1.236634.
predictions <- mod_2010 %>% predict(test_data_2010)
data.frame( R2 = caret::R2(predictions, test_data_2010$votos),
RMSE = caret::RMSE(predictions, test_data_2010$votos),
MAE = caret::MAE(predictions, test_data_2010$votos),
ERR = caret::RMSE(predictions, test_data_2010$votos)/
mean(test_data_2010$votos))
Now let’s talk about the results on the test data for the 2010 elections (most meaningful).
We got a decent R² (and adjusted R²) of approximately 0.59 (notice the decrease). This means the model explains approximately 59% of the variability of the response variable (smaller than for the previous model).
The prediction error rate (ERR) was 1.347477.
The model for the 2010 elections also showed signs of a decent, though not brilliant, fit. There is no clear evidence of overfitting, as the train, validation and test sets rendered consistent results.
mod_2006 <- lm(votos ~ sqrt.total_receita * sqrt.total_despesa +
log.quantidade_fornecedores,
data = train_data_2006)
broom::glance(mod_2006)
mod_2006 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2006 elections)")
mod_2006 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2006 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2006 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2006 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2006 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2006 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2006 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2006 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod_2006 %>% predict(validate_data_2006)
data.frame( R2 = caret::R2(predictions, validate_data_2006$votos),
RMSE = caret::RMSE(predictions, validate_data_2006$votos),
MAE = caret::MAE(predictions, validate_data_2006$votos),
ERR = caret::RMSE(predictions, validate_data_2006$votos)/
mean(validate_data_2006$votos))
Now let’s talk about the results on the validation data (more meaningful).
predictions <- mod_2006 %>% predict(test_data_2006)
data.frame( R2 = caret::R2(predictions, test_data_2006$votos),
RMSE = caret::RMSE(predictions, test_data_2006$votos),
MAE = caret::MAE(predictions, test_data_2006$votos),
ERR = caret::RMSE(predictions, test_data_2006$votos)/
mean(test_data_2006$votos))
Now let’s talk about the results on the test data (most meaningful).
Our trimmed-down model fared quite similarly to the original model, especially in cross-validation.
mod_2010 <- lm(votos ~ sqrt.total_receita * sqrt.total_despesa,
data = train_data_2010)
broom::glance(mod_2010)
mod_2010 %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot (2010 elections)")
mod_2010 %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q (2010 elections)", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod_2010 %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location (2010 elections)",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod_2010 %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot (2010 elections)") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod_2010 %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii) (2010 elections)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod_2010 %>% predict(validate_data_2010)
data.frame( R2 = caret::R2(predictions, validate_data_2010$votos),
RMSE = caret::RMSE(predictions, validate_data_2010$votos),
MAE = caret::MAE(predictions, validate_data_2010$votos),
ERR = caret::RMSE(predictions, validate_data_2010$votos)/
mean(validate_data_2010$votos))
Now let’s talk about the results on the validation data (more meaningful).
predictions <- mod_2010 %>% predict(test_data_2010)
data.frame( R2 = caret::R2(predictions, test_data_2010$votos),
RMSE = caret::RMSE(predictions, test_data_2010$votos),
MAE = caret::MAE(predictions, test_data_2010$votos),
ERR = caret::RMSE(predictions, test_data_2010$votos)/
mean(test_data_2010$votos))
Now let’s talk about the results on the test data (most meaningful).
We got a decent R² (and adjusted R²) of approximately 0.60 (notice the decrease). This means the model explains approximately 60% of the variability of the response variable.
The prediction error rate (ERR) was 1.329999.
The trimmed-down model fared quite similarly to the original model in cross-validation. Furthermore, we got rid of the overly influential outlier.
set.seed(11) # set the seed for reproducibility
scaled_data %>%
dplyr::sample_frac(.5) -> train_data
encoding <- build_encoding(dataSet = train_data,
cols = c("uf","sexo","grau",
"partido","estado_civil"),
verbose = F)
train_data <- one_hot_encoder(dataSet = train_data,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Data ",
"\n##### Observations: ",nrow(train_data),
"\n##### Variables: ",ncol(train_data))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(scaled_data,
train_data,
by = 'id') -> intermediate_data
intermediate_data %>%
dplyr::sample_frac(.5) -> test_data
test_data <- one_hot_encoder(dataSet = test_data,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data ",
"\n##### Observations: ",nrow(test_data),
"\n##### Variables: ",ncol(test_data))
set.seed(11) # set the seed for reproducibility
dplyr::anti_join(intermediate_data,
test_data,
by = 'id') -> validate_data
validate_data <- one_hot_encoder(dataSet = validate_data,
encoding = encoding,
drop = TRUE,
verbose = F)
rm(intermediate_data)
cat("#### Validate Data ",
"\n##### Observations: ",nrow(validate_data),
"\n##### Variables: ",ncol(validate_data))
mod <- lm(votos ~ sqrt.total_receita * sqrt.total_despesa +
log.quantidade_fornecedores,
data = train_data)
broom::glance(mod)
mod %>%
ggplot(aes(.fitted, .resid)) +
geom_point() +
stat_smooth(method="loess") +
geom_hline(col="red",
yintercept=0,
linetype="dashed") +
labs(y="Residuals",
x="Fitted values",
title="Residual vs Fitted Plot")
mod %>%
ggplot(aes(sample=rstandard(.))) +
stat_qq(na.rm = TRUE,
shape=1,size=3) + # open circles
labs(title="Normal Q-Q", # plot title
x="Theoretical Quantiles", # x-axis label
y="Standardized Residuals") + # y-axis label +
geom_abline(color = "red",
size = 0.8,
linetype="dashed") # dashed reference line
mod %>%
ggplot(aes(.fitted,
sqrt(abs(.stdresid)))) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess",
na.rm = TRUE) +
labs(title = "Scale-Location",
x= "Fitted Value",
y = expression(sqrt("|Standardized residuals|")))
mod %>%
ggplot(aes(.hat, .stdresid)) +
geom_point(aes(size=.cooksd), na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage")+ylab("Standardized Residuals") +
ggtitle("Residual vs Leverage Plot") +
scale_size_continuous("Cook's Distance", range=c(1,5)) +
theme(legend.position="bottom")
mod %>%
ggplot(aes(.hat, .cooksd)) +
geom_point(na.rm=TRUE) +
stat_smooth(method="loess", na.rm=TRUE) +
xlab("Leverage hii")+ylab("Cook's Distance") +
ggtitle("Cook's dist vs Leverage hii/(1-hii)") +
geom_abline(slope=seq(0,3,0.5), color="gray", linetype="dashed")
predictions <- mod %>% predict(validate_data)
data.frame( R2 = caret::R2(predictions, validate_data$votos),
RMSE = caret::RMSE(predictions, validate_data$votos),
MAE = caret::MAE(predictions, validate_data$votos),
ERR = caret::RMSE(predictions, validate_data$votos)/
mean(validate_data$votos))
Now let’s talk about the results on the validation data (more meaningful).
predictions <- mod %>% predict(test_data)
data.frame( R2 = caret::R2(predictions, test_data$votos),
RMSE = caret::RMSE(predictions, test_data$votos),
MAE = caret::MAE(predictions, test_data$votos),
ERR = caret::RMSE(predictions, test_data$votos)/
mean(test_data$votos))
Now let’s talk about the results on the test data (most meaningful).
In the model for both elections combined we see signs of the model struggling to fit the data. As expected at the cross-validation stage, the combined model fared worse than the year-specific models, especially on the test data.