Data Analysis and Classification with Neural Networks on a subset of data about polls for the 2006 and 2010 elections in Brazil for the "Câmara Federal de Deputados". Data was taken from the TSE portal which originally encompassed approximately 7300 candidates.
The response variable is the variable that you are interesting in making measurements and conclusions on.
A predictor variable is a variable used to predict another variable.
Our response variable will be "situacao", we want to study how well the predictor variables can help predict its behavior and how they impact in the linear regression.
library(dataPreparation)
library(tidyverse)
library(janitor)
library(magrittr)
library(GGally)
library(caret)
library(keras)
library(ROSE)
library(here)
theme_set(theme_bw())
readr::read_csv(here::here('../input/train_class.csv'),
progress = FALSE,
local=readr::locale("br"),
col_types = cols(ano = col_integer(),
sequencial_candidato = col_character(),
quantidade_doacoes = col_integer(),
quantidade_doadores = col_integer(),
total_receita = col_double(),
media_receita = col_double(),
recursos_de_outros_candidatos.comites = col_double(),
recursos_de_pessoas_fisicas = col_double(),
recursos_de_pessoas_juridicas = col_double(),
recursos_proprios = col_double(),
`recursos_de_partido_politico` = col_double(),
quantidade_despesas = col_integer(),
quantidade_fornecedores = col_integer(),
total_despesa = col_double(),
media_despesa = col_double(),
situacao = col_character(),
.default = col_character())) %>%
mutate(sequencial_candidato = as.numeric(sequencial_candidato),
estado_civil = as.factor(estado_civil),
ocupacao = as.factor(ocupacao),
partido = as.factor(partido),
cargo = as.factor(cargo),
nome = as.factor(nome),
grau = as.factor(grau),
sexo = as.factor(sexo),
uf = as.factor(uf)) -> data
data %>%
glimpse()
data %>%
map_df(function(x) sum(is.na(x))) %>%
gather(feature, num_nulls) %>%
arrange(desc(num_nulls))
data %>%
ggplot(aes(situacao)) +
geom_bar() +
labs(x="Situation", y="Absolute Frequency")
data %>%
group_by(situacao) %>%
summarise(num = n()) %>%
ungroup() %>%
mutate(total = sum(num),
proportion = num/total)
br>
data %>%
select(-ano,
-sequencial_candidato,
-nome) %>%
select(
quantidade_doacoes,
quantidade_doadores,
total_receita,
media_receita,
recursos_de_outros_candidatos.comites,
recursos_de_pessoas_fisicas,
recursos_de_pessoas_juridicas,
recursos_proprios,
`recursos_de_partido_politico`) %>%
na.omit() %>%
ggcorr(palette = "RdBu", label = TRUE,
hjust = 0.95, label_size = 3,size = 3,
nbreaks = 5, layout.exp = 5) +
ggtitle("Correlation plot for employed variables")
set.seed(107)
data$id <- 1:nrow(data)
data %>%
dplyr::sample_frac(.8) -> train
cat("#### Train Shape",
"\n##### Observations: ",nrow(train),
"\n##### Variables: ",ncol(train))
dplyr::anti_join(data,
train,
by = 'id') -> test
cat("#### Test Shape",
"\n##### Observations: ",nrow(test),
"\n##### Variables: ",ncol(test))
train %>%
select(-ano,-nome,-id,-sequencial_candidato) -> train
test %>%
select(-ano,-nome,-id,-sequencial_candidato) -> test
train %>%
dplyr::select_if(.,is.numeric) -> train.numeric
train %>%
dplyr::select_if(.,negate(is.numeric)) -> train.categorical
test %>%
dplyr::select_if(.,is.numeric) -> test.numeric
test %>%
dplyr::select_if(.,negate(is.numeric)) -> test.categorical
train.numeric %>%
preProcess(.,method = c("center","scale")) -> processParams
processParams %>%
predict(.,train.numeric) -> train.numeric
processParams %>%
predict(.,test.numeric) -> test.numeric
processParams
train.numeric %>%
dplyr::bind_cols(train.categorical) -> train
test.numeric %>%
dplyr::bind_cols(test.categorical) -> test
train %>%
clean_names() %>%
ROSE(situacao ~ .,
data =.,
seed = 107) %$%
data -> train.rose
cat("#### Train Shape",
"\n##### Observations: ",nrow(train.rose),
"\n##### Variables: ",ncol(train.rose))
train.rose %>%
group_by(situacao) %>%
summarise(num = n()) %>%
ungroup() %>%
mutate(total = sum(num),
proportion = num/total)
encoding <- build_encoding(dataSet = train,
cols = c("uf","sexo","grau","ocupacao",
"partido","estado_civil"),
verbose = F)
train <- one_hot_encoder(dataSet = train,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Train Shape",
"\n##### Observations: ",nrow(train),
"\n##### Variables: ",ncol(train))
train.rose <- one_hot_encoder(dataSet = train.rose,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Balanced Data Shape",
"\n##### Observations: ",nrow(train.rose),
"\n##### Variables: ",ncol(train.rose))
test <- one_hot_encoder(dataSet = test,
encoding = encoding,
drop = TRUE,
verbose = F)
cat("#### Test Data Shape",
"\n##### Observations: ",nrow(test),
"\n##### Variables: ",ncol(test))
train %>%
nearZeroVar(saveMetrics = TRUE) %>%
tibble::rownames_to_column("variable") %>%
filter(nzv == T) %>%
pull(variable) -> near_zero_vars
train %>%
select(-one_of(near_zero_vars)) -> train
train.rose %>%
select(-one_of(near_zero_vars)) -> train.rose
test %>%
select(-one_of(near_zero_vars)) -> test
near_zero_vars %>%
glimpse()
split_target_predictors <- function(df) {
df %>%
select(-situacao) %>%
as.matrix() -> x_data
df %>%
select(situacao) %>%
dummyVars(" ~ situacao", data = .,levelsOnly = TRUE) -> dmy
df %>%
select(situacao) %>%
data.frame(predict(dmy, newdata = .)) %>%
select(-situacao) %>%
as.matrix() -> y_data
dimnames(x_data) <- NULL
dimnames(y_data) <- NULL
newData <- list("predictors" = x_data, "target" = y_data)
return(newData)
}
test %>%
split_target_predictors() -> x_test
# extract target and predictors
y_test <- x_test$target
x_test <- x_test$predictors
readr::read_csv(here::here('../input/test_class.csv'),
progress = FALSE,
local=readr::locale("br"),
col_types = cols(ano = col_integer(),
sequencial_candidato = col_character(),
quantidade_doacoes = col_integer(),
quantidade_doadores = col_integer(),
total_receita = col_double(),
media_receita = col_double(),
recursos_de_outros_candidatos.comites = col_double(),
recursos_de_pessoas_fisicas = col_double(),
recursos_de_pessoas_juridicas = col_double(),
recursos_proprios = col_double(),
`recursos_de_partido_politico` = col_double(),
quantidade_despesas = col_integer(),
quantidade_fornecedores = col_integer(),
total_despesa = col_double(),
media_despesa = col_double(),
.default = col_character())) %>%
mutate(sequencial_candidato = as.numeric(sequencial_candidato),
estado_civil = as.factor(estado_civil),
ocupacao = as.factor(ocupacao),
partido = as.factor(partido),
cargo = as.factor(cargo),
nome = as.factor(nome),
grau = as.factor(grau),
sexo = as.factor(sexo),
uf = as.factor(uf)) -> submit_data
submit_data %$%
sequencial_candidato -> Id
submit_data %>%
glimpse()
submit_data %>%
select(-ano,-nome,-sequencial_candidato) -> submit_data
submit_data %>%
dplyr::select_if(.,is.numeric) -> submit_data.numeric
submit_data %>%
dplyr::select_if(.,negate(is.numeric)) -> submit_data.categorical
processParams %>%
predict(.,submit_data.numeric) -> submit_data.numeric
submit_data.numeric %>%
dplyr::bind_cols(submit_data.categorical) -> submit_data
submit_data <- one_hot_encoder(dataSet = submit_data,
encoding = encoding,
drop = TRUE,
verbose = F)
submit_data %>%
select(-one_of(near_zero_vars)) -> submit_data
submit_data %>%
glimpse()
submit_data %>%
as.matrix() -> x_submit
dimnames(x_submit) <- NULL
# Initialize a sequential model
model.simple <- keras_model_sequential()
# Add layers to the model
model.simple %>%
layer_dense(units = 8, activation = 'relu', input_shape = c(38)) %>%
layer_dense(units = 2, activation = 'softmax')
summary(model.simple)
# Initialize a sequential model
model.complex <- keras_model_sequential()
# Add layers to the model
model.complex %>%
layer_dense(units = 8, activation = 'relu', input_shape = c(38)) %>%
layer_dense(units = 6, activation = 'relu') %>%
layer_dense(units = 10) %>%
layer_dense(units = 2, activation = 'softmax')
summary(model.complex)
neuralNetCV <- function(df,
model,
k=5,
loss_method = 'binary_crossentropy',
optim_approach = 'adam',
summary_metrics='accuracy',
epochs = 200,
batch_size = 5,
validation_split = 0.2) {
model %>%
compile(
loss = loss_method,
optimizer = optim_approach,
metrics = summary_metrics)
df %>%
mutate(folds = sample(rep_len(1:k, nrow(.)))) -> df
result <- data.frame("loss"=c(),metrics=c())
for(f in unique(df$folds)){
# split into train/validation
df %>%
filter(folds == f) -> train_df
df %>%
filter(folds != f) -> valid_df
# Remove auxiliary column
train_df %>%
select(-folds) -> train_df
valid_df %>%
select(-folds) -> valid_df
# create matrices
train_df %>%
split_target_predictors() -> x_train
valid_df %>%
split_target_predictors() -> x_valid
# extract target and predictors
y_train <- x_train$target
x_train <- x_train$predictors
y_valid <- x_valid$target
x_valid <- x_valid$predictors
# Train model
history <- model %>% fit(
x_train, y_train,
epochs = epochs,
batch_size = batch_size,
validation_split = validation_split)
# Evaluate the model
model %>%
evaluate(x_valid,
y_valid,
batch_size = 128) -> score
score %>%
as.data.frame() -> temporary
result <- rbind(result,temporary)
}
cvsummary <- list("result"=result,
"history"=history,
"model"=model)
return(cvsummary);
}
tuneNeuralNetwork <- function(model,data,paramsGrid,
target,k=5) {
environment(neuralNetCV) <- environment()
best_accuracy <- 0
best_loss <- 0
best_tune <- NULL
best_history <- NULL
best_model <- NULL
optmizer <- NULL
losses <- c()
acc <- c()
for(i in 1:nrow(paramsGrid)) {
row <- paramsGrid[i,]
if (row$optim_approach == "sgd")
optmizer <- optimizer_sgd(lr = 0.01)
else if (row$optim_approach == "rmsprop")
optmizer <- optimizer_rmsprop(lr = 0.001, rho = 0.9)
else if (row$optim_approach == "adam")
optmizer <- optimizer_adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999)
# apply CV Kfold to particular set of params
neuralNetCV(data,
model=model,
k=k,
loss_method = paste0(row$loss_method),
optim_approach = optmizer,
summary_metrics= paste0(row$summary_metrics),
epochs = paste0(row$epochs),
batch_size = paste0(row$batch_size),
validation_split = paste0(row$validation_split)) -> cvsummary
cvsummary %$%
result %$%
acc %>%
mean() -> mean_accuracy
cvsummary %$%
result %$%
loss %>%
mean() -> mean_loss
losses <- append(losses,mean_loss)
acc <- append(acc, mean_accuracy)
if(mean_accuracy > best_accuracy) {
best_accuracy <- mean_accuracy
best_history <- cvsummary$history
best_model <- cvsummary$model
best_result <- mean_accuracy
best_loss <- mean_loss
best_tune <- row
}
}
paramsGrid$loss <- losses
paramsGrid$accuracy <- acc
best_tune$accuracy <- best_accuracy
best_tune$loss <- best_loss
result <- list("history"=best_history,
"best_tune"=best_tune,
"iterations"=paramsGrid,
"best_model"=best_model)
return(result)
}
optm <- c("sgd", "adam")
validation_split <- c(0.2,0.4)
paramsGrid <- expand.grid(optim_approach=optm,
loss_method = 'binary_crossentropy',
summary_metrics='accuracy',
epochs = 200,
batch_size = 5,
validation_split = validation_split)
paramsGrid
tuneNeuralNetwork(model.simple,
train,paramsGrid,
k=5) -> tunedSimple
tunedSimple %$%
best_model %>%
evaluate(x_test,
y_test,
batch_size = 128) -> score.simple
score.simple
tunedSimple %$%
history %>%
plot()
tunedSimple %$%
best_model %>%
predict_classes(x_submit, batch_size = 128) -> classes.simple
data.frame(Id=Id,
Predicted=classes.simple) -> submission.simple
submission.simple %>%
mutate(Id = as.character(Id)) %>%
write_csv("nn_submission_simple.csv")
tuneNeuralNetwork(model.simple,
train.rose,paramsGrid,
k=5) -> tunedSimpleRose
tunedSimpleRose %$%
best_model %>%
evaluate(x_test,
y_test,
batch_size = 128) -> score.simple.rose
score.simple.rose
tunedSimpleRose %$%
history %>%
plot()
tunedSimpleRose %$%
best_model %>%
predict_classes(x_submit, batch_size = 128) -> classes.simple.rose
data.frame(Id=Id,
Predicted=classes.simple.rose) -> submission.simple.rose
submission.simple.rose %>%
mutate(Id = as.character(Id)) %>%
write_csv("nn_submission_simple_rose.csv")
tuneNeuralNetwork(model.complex,
train,paramsGrid,
k=5) -> tunedComplex
tunedComplex %$%
best_model %>%
evaluate(x_test,
y_test,
batch_size = 128) -> score.complex
score.complex
tunedComplex %$%
history %>%
plot()
tunedComplex %$%
best_model %>%
predict_classes(x_submit, batch_size = 128) -> classes.complex
data.frame(Id=Id,
Predicted=classes.complex) -> submission.complex
submission.complex %>%
mutate(Id = as.character(Id)) %>%
write_csv("nn_submission_complex.csv")
tuneNeuralNetwork(model.complex,
train.rose,paramsGrid,
k=5) -> tunedComplexRose
tunedComplexRose %$%
best_model %>%
evaluate(x_test,
y_test,
batch_size = 128) -> score.complex.rose
score.complex.rose
tunedComplexRose %$%
history %>%
plot()
tunedComplexRose %$%
best_model %>%
predict_classes(x_submit, batch_size = 128) -> classes.complex.rose
data.frame(Id=Id,
Predicted=classes.complex.rose) -> submission.complex.rose
submission.complex.rose %>%
mutate(Id = as.character(Id)) %>%
write_csv("nn_submission_complex_rose.csv")