# Environment
rm(list=ls())
setwd("/Users/yendindin/Library/Mobile Documents/com~apple~CloudDocs/R/ML and Econ")
library(tree)
library(caret)
library(dplyr)
library(randomForest)
library(gbm)
library(neuralnet)
library(xgboost)
data <- readRDS("election_clean.rds")
# Drop only 1 person
data <- data %>%
filter(vote_share<1)
#######
# Add six city
data <- data %>%
mutate(six_city=as.integer(area_eng %in% c("Taipei","NTaipei","Taoyuan","Kaohsiung","Taichung","Tainan")))
# Function
rmse = function(actual, predicted) {
sqrt(mean((actual - predicted) ^ 2))
}
block_split <- function(df,train_per,type){
if(type=="random"){
df1 <- df
df1 <- df1 %>%
group_by(year,area_eng,vote_area_index) %>%
mutate(group = cur_group_id()) %>%
ungroup()
sample_train <- sample(unique(df1$group),round(train_per*max(df1$group)),replace=F)
train_df1 <- df1 %>%
filter(group %in% sample_train) %>%
select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
train_key1 <- df1 %>%
filter(group %in% sample_train) %>%
select(year,area_eng,vote_area_index,seats,number,win)
test_df1 <- df1 %>%
filter(!group %in% sample_train) %>%
select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
test_key1 <- df1 %>%
filter(!group %in% sample_train) %>%
select(year,area_eng,vote_area_index,seats,number,win)
}else{
df1 <- df
df1 <- df1 %>%
group_by(year,area_eng,vote_area_index) %>%
mutate(group = cur_group_id()) %>%
ungroup()
train_df1 <- df1 %>%
filter(year!=as.integer(type)) %>%
select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
train_key1 <- df1 %>%
filter(year!=as.integer(type)) %>%
select(year,area_eng,vote_area_index,seats,number,win)
test_df1 <- df1 %>%
filter(year==as.integer(type)) %>%
select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
test_key1 <- df1 %>%
filter(year==as.integer(type)) %>%
select(year,area_eng,vote_area_index,seats,number,win)
}
print("Remember to set seeds in previous.")
return(list(train_df1,test_df1,train_key1,test_key1))
}
# Accuracy
accuracy <- function(key1,predict1){
res1 <- key1 %>%
cbind(predict1) %>%
group_by(year,area_eng,vote_area_index) %>%
mutate(rank = n()-rank(predict1,ties.method = "random")+1) %>%
mutate(win_predict = as.integer(rank<=seats)) %>%
mutate(correct = as.integer(win==win_predict))
accu <- sum(res1$correct)/nrow(res1)
return(accu)
}
####### Variarion ######
# Split sample as random block or 99, 103 and test 107
# Predict different outcome
# Put second_gene or not, combine with incumbent
# RMSE and Accuracy
# Use scale vote share, no second_gene
data_1 <- data %>%
select(c(36,1,2,3,4,12,31,6,9,seq(17,29),seq(39,45)))
set.seed(3230)
data_random <- block_split(data_1,0.7,"random")
train_random <- data_random[[1]]
test_random <- data_random[[2]]
train_key_random <- data_random[[3]]
test_key_random <- data_random[[4]]
data_year <- block_split(data_1,0.7,"107")
train_year <- data_year[[1]]
test_year <- data_year[[2]]
train_key_year <- data_year[[3]]
test_key_year <- data_year[[4]]
# OLS
# Random
election_lm_random = lm(scale_vote_share ~ ., train_random)
# RMSE
election_lm_pred_train_random = predict(election_lm_random , newdata = train_random)
rmse(election_lm_pred_train_random , train_random$scale_vote_share)
election_lm_pred_test_random = predict(election_lm_random , newdata = test_random)
rmse(election_lm_pred_test_random , test_random$scale_vote_share)
# Accuracy
accuracy(train_key_random ,election_lm_pred_train_random )
accuracy(test_key_random ,election_lm_pred_test_random )
# Year
election_lm_year = lm(scale_vote_share ~ ., train_year)
# RMSE
election_lm_pred_train_year = predict(election_lm_year , newdata = train_year)
rmse(election_lm_pred_train_year , train_year$scale_vote_share)
election_lm_pred_test_year = predict(election_lm_year , newdata = test_year)
rmse(election_lm_pred_test_year , test_year$scale_vote_share)
# Accuracy
accuracy(train_key_year ,election_lm_pred_train_year )
accuracy(test_key_year ,election_lm_pred_test_year )
# Random Forest
# Random
control <- trainControl(method = "oob")
tunegrid <- expand.grid(.mtry = seq(1, 15, 1))
set.seed(123)
election_rf_random <- train(scale_vote_share ~ .,
data = train_random,
method = "rf",
metric = "RMSE",
tuneGrid = tunegrid,
trControl = control,
importance = TRUE,
ntree = 1000)
plot(election_rf_random)
importance(election_rf_random$finalModel)
plot(varImp(election_rf_random))
#RMSE
election_rf_pred_train_random <- predict(election_rf_random, newdata = train_random)
rmse(election_rf_pred_train_random , train_random$scale_vote_share)
election_rf_pred_test_random = predict(election_rf_random , newdata = test_random)
rmse(election_rf_pred_test_random , test_random$scale_vote_share)
# Accuracy
accuracy(train_key_random ,election_rf_pred_train_random )
accuracy(test_key_random ,election_rf_pred_test_random )
# Year
set.seed(123)
election_rf_year <- train(scale_vote_share ~ .,
data = train_year,
method = "rf",
metric = "RMSE",
tuneGrid = tunegrid,
trControl = control,
importance = TRUE,
ntree = 1000)
plot(election_rf_year)
importance(election_rf_year$finalModel)
plot(varImp(election_rf_year))
#RMSE
election_rf_pred_train_year <- predict(election_rf_year, newdata = train_year)
rmse(election_rf_pred_train_year , train_year$scale_vote_share)
election_rf_pred_test_year = predict(election_rf_year , newdata = test_year)
rmse(election_rf_pred_test_year , test_year$scale_vote_share)
# Accuracy
accuracy(train_key_year ,election_rf_pred_train_year )
accuracy(test_key_year ,election_rf_pred_test_year )
# GBM
train_random_f <- as.matrix(train_random[,-1])
train_random_r <- unlist(train_random[,1])
train_year_f <- as.matrix(train_year[,-1])
train_year_r <- unlist(train_year[,1])
test_random_f <- as.matrix(test_random[,-1])
test_random_r <- unlist(test_random[,1])
test_year_f <- as.matrix(test_year[,-1])
test_year_r <- unlist(test_year[,1])
# Random
tunegrid_random <- expand.grid(eta = 0.01,
max_depth = c(1,2,3,4,5),
min_child_weight = 5,
subsample = c(0.5,0.6,0.7),
colsample_bytree = c(0.6,0.7,0.8,0.9),
optimal_trees = 0,
min_RMSE = 0,
RMSE_train = 0,
RMSE_test = 0,
accuracy_train = 0,
accuracy_test = 0)
for(i in 1:nrow(tunegrid_random)){
params <- list(
eta = tunegrid_random$eta[i],
max_depth = tunegrid_random$max_depth[i],
min_child_weight = tunegrid_random$min_child_weight[i],
subsample = tunegrid_random$subsample[i],
colsample_bytree = tunegrid_random$colsample_bytree[i]
)
set.seed(3230)
# train model
xgb.tune <- xgb.cv(
params = params,
data = train_random_f,
label = train_random_r,
nrounds = 2000,
nfold = 10,
objective = "reg:squarederror", # for regression models
verbose = 0, # silent,
early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
)
tunegrid_random$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
tunegrid_random$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
set.seed(3230)
xgb.final <- xgboost(
params = params,
data = train_random_f,
label = train_random_r,
nrounds = tunegrid_random$optimal_trees[i],
objective = "reg:squarederror",
verbose = 0
)
xgb.tune.pred <- predict(xgb.final,train_random_f)
xgb.tune.pred.t <- predict(xgb.final,test_random_f)
tunegrid_random$RMSE_train[i] <- rmse(xgb.tune.pred , train_random$scale_vote_share)
tunegrid_random$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_random$scale_vote_share)
tunegrid_random$accuracy_train[i] <- accuracy(train_key_random ,xgb.tune.pred)
tunegrid_random$accuracy_test[i] <- accuracy(test_key_random ,xgb.tune.pred.t)
}
# Finallize
tunegrid_random <- tunegrid_random %>%
arrange(-accuracy_test,RMSE_test) %>%
head(10)
tunegrid_random
params <- list(
eta = tunegrid_random$eta[1],
max_depth = tunegrid_random$max_depth[1],
min_child_weight = tunegrid_random$min_child_weight[1],
subsample = tunegrid_random$subsample[1],
colsample_bytree = tunegrid_random$colsample_bytree[1]
)
set.seed(3230)
election_xgb_random <- xgboost(
params = params,
data = train_random_f,
label = train_random_r,
nrounds = tunegrid_random$optimal_trees[1],
objective = "reg:squarederror",
verbose = 0
)
# create importance matrix
election_xgb_imp_train_random <- xgb.importance(model = election_xgb_random)
election_xgb_imp_train_random <- election_xgb_imp_train_random %>%
head(15)
# variable importance plot
ggplot(election_xgb_imp_train_random,
aes(x = Gain,
y = reorder(Feature,Gain))) +
scale_x_continuous(limits = c(0,0.26),breaks = seq(0,0.25,0.05))+
geom_col(width=0.8,fill="Dodgerblue3")+
labs(title="Relative Influence: GB x random",x="Relative Influence",y="")+
theme_bw()+
theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_random.png")
#RMSE
election_xgb_pred_train_random <- predict(election_xgb_random,train_random_f)
election_xgb_pred_test_random <- predict(election_xgb_random,test_random_f)
rmse(election_xgb_pred_train_random , train_random$scale_vote_share)
rmse(election_xgb_pred_test_random , test_random$scale_vote_share)
# Accuracy
accuracy(train_key_random ,election_xgb_pred_train_random)
accuracy(test_key_random ,election_xgb_pred_test_random)
# Year
tunegrid_year <- expand.grid(eta = 0.01,
max_depth = c(1,2,3,4,5),
min_child_weight = 5,
subsample = c(0.5,0.6,0.7),
colsample_bytree = c(0.6,0.7,0.8,0.9),
optimal_trees = 0,
min_RMSE = 0,
RMSE_train = 0,
RMSE_test = 0,
accuracy_train = 0,
accuracy_test = 0)
for(i in 1:nrow(tunegrid_year)){
params <- list(
eta = tunegrid_year$eta[i],
max_depth = tunegrid_year$max_depth[i],
min_child_weight = tunegrid_year$min_child_weight[i],
subsample = tunegrid_year$subsample[i],
colsample_bytree = tunegrid_year$colsample_bytree[i]
)
set.seed(3230)
# train model
xgb.tune <- xgb.cv(
params = params,
data = train_year_f,
label = train_year_r,
nrounds = 2000,
nfold = 10,
objective = "reg:squarederror", # for regression models
verbose = 0, # silent,
early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
)
tunegrid_year$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
tunegrid_year$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
set.seed(3230)
xgb.final <- xgboost(
params = params,
data = train_year_f,
label = train_year_r,
nrounds = tunegrid_year$optimal_trees[i],
objective = "reg:squarederror",
verbose = 0
)
xgb.tune.pred <- predict(xgb.final,train_year_f)
xgb.tune.pred.t <- predict(xgb.final,test_year_f)
tunegrid_year$RMSE_train[i] <- rmse(xgb.tune.pred , train_year$scale_vote_share)
tunegrid_year$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_year$scale_vote_share)
tunegrid_year$accuracy_train[i] <- accuracy(train_key_year ,xgb.tune.pred)
tunegrid_year$accuracy_test[i] <- accuracy(test_key_year ,xgb.tune.pred.t)
}
# Finallize
tunegrid_year <- tunegrid_year %>%
arrange(-accuracy_test,RMSE_test) %>%
head(10)
tunegrid_year
params <- list(
eta = tunegrid_year$eta[1],
max_depth = tunegrid_year$max_depth[1],
min_child_weight = tunegrid_year$min_child_weight[1],
subsample = tunegrid_year$subsample[1],
colsample_bytree = tunegrid_year$colsample_bytree[1]
)
set.seed(3230)
election_xgb_year <- xgboost(
params = params,
data = train_year_f,
label = train_year_r,
nrounds = tunegrid_year$optimal_trees[1],
objective = "reg:squarederror",
verbose = 0
)
# create importance matrix
election_xgb_imp_train_year <- xgb.importance(model = election_xgb_year)
election_xgb_imp_train_year <- election_xgb_imp_train_year %>%
head(15)
# variable importance plot
ggplot(election_xgb_imp_train_year,
aes(x = Gain,
y = reorder(Feature,Gain))) +
scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
geom_col(width=0.8,fill="Dodgerblue3")+
labs(title="Relative Influence: GB x Year",x="Relative Influence",y="")+
theme_bw()+
theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_year.png")
#RMSE
election_xgb_pred_train_year <- predict(election_xgb_year,train_year_f)
election_xgb_pred_test_year <- predict(election_xgb_year,test_year_f)
rmse(election_xgb_pred_train_year , train_year$scale_vote_share)
rmse(election_xgb_pred_test_year , test_year$scale_vote_share)
# Accuracy
accuracy(train_key_year ,election_xgb_pred_train_year)
accuracy(test_key_year ,election_xgb_pred_test_year)
# Neural Net
maxs <- apply(train_random, 2, max)
mins <- apply(train_random, 2, min)
train_random_1 <- as.data.frame(scale(train_random, center = mins, scale = maxs - mins))
test_random_1 <- as.data.frame(scale(test_random, center = mins, scale = maxs - mins))
maxs <- apply(train_year, 2, max)
mins <- apply(train_year, 2, min)
train_year_1 <- as.data.frame(scale(train_year, center = mins, scale = maxs - mins))
test_year_1 <- as.data.frame(scale(test_year, center = mins, scale = maxs - mins))
n <- names(train_random_1)
f <- as.formula(paste("scale_vote_share ~", paste(n[!n %in% "scale_vote_share"], collapse = " + ")))
set.seed(323)
election_nn_random.10 <- neuralnet(f, data = train_random_1, hidden = c(10),stepmax = 500000, linear.output = T)
plot(election_nn_random.10, fontsize = 10)
quartz.save("(election_nn_random.10.png", type="png")
election_nn_pred_train_random.10 <- compute(election_nn_random.10, train_random_1[,-1])
election_nn_pred_train_random.10 <- election_nn_pred_train_random.10$net.result *
(max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
election_nn_pred_test_random.10 <- compute(election_nn_random.10, test_random_1[,-1])
election_nn_pred_test_random.10 <- election_nn_pred_test_random.10$net.result *
(max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
rmse(election_nn_pred_train_random.10,train_random$scale_vote_share)
rmse(election_nn_pred_test_random.10,test_random$scale_vote_share)
accuracy(train_key_random ,election_nn_pred_train_random.10)
accuracy(test_key_random ,election_nn_pred_test_random.10)
set.seed(323)
election_nn_year.10 <- neuralnet(f, data = train_year_1, hidden = c(10),stepmax = 500000, linear.output = T)
plot(election_nn_year.10, fontsize = 10)
quartz.save("(election_nn_year.10.png", type="png")
election_nn_pred_train_year.10 <- compute(election_nn_year.10, train_year_1[,-1])
election_nn_pred_train_year.10 <- election_nn_pred_train_year.10$net.result *
(max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
election_nn_pred_test_year.10 <- compute(election_nn_year.10, test_year_1[,-1])
election_nn_pred_test_year.10 <- election_nn_pred_test_year.10$net.result *
(max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
rmse(election_nn_pred_train_year.10,train_year$scale_vote_share)
rmse(election_nn_pred_test_year.10,test_year$scale_vote_share)
accuracy(train_key_year ,election_nn_pred_train_year.10)
accuracy(test_key_year ,election_nn_pred_test_year.10)
set.seed(323)
election_nn_random.10.6 <- neuralnet(f, data = train_random_1, hidden = c(10,6),stepmax = 500000, linear.output = T)
plot(election_nn_random.10.6, fontsize = 10)
quartz.save("(election_nn_random.10.6.png", type="png")
election_nn_pred_train_random.10.6 <- compute(election_nn_random.10.6, train_random_1[,-1])
election_nn_pred_train_random.10.6 <- election_nn_pred_train_random.10.6$net.result *
(max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
election_nn_pred_test_random.10.6 <- compute(election_nn_random.10.6, test_random_1[,-1])
election_nn_pred_test_random.10.6 <- election_nn_pred_test_random.10.6$net.result *
(max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
rmse(election_nn_pred_train_random.10.6,train_random$scale_vote_share)
rmse(election_nn_pred_test_random.10.6,test_random$scale_vote_share)
accuracy(train_key_random ,election_nn_pred_train_random.10.6)
accuracy(test_key_random ,election_nn_pred_test_random.10.6)
set.seed(323)
election_nn_year.10.6 <- neuralnet(f, data = train_year_1, hidden = c(10,6),stepmax = 500000, linear.output = T)
plot(election_nn_year.10.6, fontsize = 10)
quartz.save("(election_nn_year.10.6.png", type="png")
election_nn_pred_train_year.10.6 <- compute(election_nn_year.10.6, train_year_1[,-1])
election_nn_pred_train_year.10.6 <- election_nn_pred_train_year.10.6$net.result *
(max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
election_nn_pred_test_year.10.6 <- compute(election_nn_year.10.6, test_year_1[,-1])
election_nn_pred_test_year.10.6 <- election_nn_pred_test_year.10.6$net.result *
(max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
rmse(election_nn_pred_train_year.10.6,train_year$scale_vote_share)
rmse(election_nn_pred_test_year.10.6,test_year$scale_vote_share)
accuracy(train_key_year ,election_nn_pred_train_year.10.6)
accuracy(test_key_year ,election_nn_pred_test_year.10.6)
# Add second generation with incumbent
data$incumbent_or_2_gene <- ifelse(data$incumbent+data$second_gene!=0,1,0)
# Use scale vote share, no second_gene
data_1_g <- data %>%
select(c(36,1,2,3,4,12,31,6,46,seq(17,29),seq(39,45)))
set.seed(3230)
data_random_g <- block_split(data_1_g,0.7,"random")
train_random_g <- data_random_g[[1]]
test_random_g <- data_random_g[[2]]
train_key_random_g <- data_random_g[[3]]
test_key_random_g <- data_random_g[[4]]
data_year_g <- block_split(data_1_g,0.7,"107")
train_year_g <- data_year_g[[1]]
test_year_g <- data_year_g[[2]]
train_key_year_g <- data_year_g[[3]]
test_key_year_g <- data_year_g[[4]]
# OLS
# Random
election_lm_random_g = lm(scale_vote_share ~ ., train_random_g)
# RMSE
election_lm_pred_train_random_g = predict(election_lm_random_g , newdata = train_random_g)
rmse(election_lm_pred_train_random_g , train_random_g$scale_vote_share)
election_lm_pred_test_random_g = predict(election_lm_random_g , newdata = test_random_g)
rmse(election_lm_pred_test_random_g , test_random_g$scale_vote_share)
# Accuracy
accuracy(train_key_random_g ,election_lm_pred_train_random_g )
accuracy(test_key_random_g ,election_lm_pred_test_random_g )
# Year
election_lm_year_g = lm(scale_vote_share ~ ., train_year_g)
# RMSE
election_lm_pred_train_year_g = predict(election_lm_year_g , newdata = train_year_g)
rmse(election_lm_pred_train_year_g , train_year_g$scale_vote_share)
election_lm_pred_test_year_g = predict(election_lm_year_g , newdata = test_year_g)
rmse(election_lm_pred_test_year_g , test_year_g$scale_vote_share)
# Accuracy
accuracy(train_key_year_g ,election_lm_pred_train_year_g )
accuracy(test_key_year_g ,election_lm_pred_test_year_g )
# Random Forest
# Random
control <- trainControl(method = "oob")
tunegrid <- expand.grid(.mtry = seq(1, 15, 1))
set.seed(123)
election_rf_random_g <- train(scale_vote_share ~ .,
data = train_random_g,
method = "rf",
metric = "RMSE",
tuneGrid = tunegrid,
trControl = control,
importance = TRUE,
ntree = 1000)
plot(election_rf_random_g)
importance(election_rf_random_g$finalModel)
plot(varImp(election_rf_random_g))
#RMSE
election_rf_pred_train_random_g <- predict(election_rf_random_g, newdata = train_random_g)
rmse(election_rf_pred_train_random_g , train_random_g$scale_vote_share)
election_rf_pred_test_random_g <- predict(election_rf_random_g , newdata = test_random_g)
rmse(election_rf_pred_test_random_g , test_random_g$scale_vote_share)
# Accuracy
accuracy(train_key_random_g ,election_rf_pred_train_random_g )
accuracy(test_key_random_g ,election_rf_pred_test_random_g )
# Year
set.seed(123)
election_rf_year_g <- train(scale_vote_share ~ .,
data = train_year_g,
method = "rf",
metric = "RMSE",
tuneGrid = tunegrid,
trControl = control,
importance = TRUE,
ntree = 1000)
plot(election_rf_year_g)
importance(election_rf_year_g$finalModel)
plot(varImp(election_rf_year_g))
#RMSE
election_rf_pred_train_year_g <- predict(election_rf_year_g, newdata = train_year_g)
rmse(election_rf_pred_train_year_g , train_year_g$scale_vote_share)
election_rf_pred_test_year_g <- predict(election_rf_year_g , newdata = test_year_g)
rmse(election_rf_pred_test_year_g , test_year_g$scale_vote_share)
# Accuracy
accuracy(train_key_year_g ,election_rf_pred_train_year_g )
accuracy(test_key_year_g ,election_rf_pred_test_year_g )
# GBM
train_random_f_g <- as.matrix(train_random_g[,-1])
train_random_r_g <- unlist(train_random_g[,1])
train_year_f_g <- as.matrix(train_year_g[,-1])
train_year_r_g <- unlist(train_year_g[,1])
test_random_f_g <- as.matrix(test_random_g[,-1])
test_random_r_g <- unlist(test_random_g[,1])
test_year_f_g <- as.matrix(test_year_g[,-1])
test_year_r_g <- unlist(test_year_g[,1])
# Random
tunegrid_random_g <- expand.grid(eta = 0.01,
max_depth = c(1,2,3,4,5),
min_child_weight = 5,
subsample = c(0.5,0.6,0.7),
colsample_bytree = c(0.6,0.7,0.8,0.9),
optimal_trees = 0,
min_RMSE = 0,
RMSE_train = 0,
RMSE_test = 0,
accuracy_train = 0,
accuracy_test = 0)
for(i in 1:nrow(tunegrid_random_g)){
params <- list(
eta = tunegrid_random_g$eta[i],
max_depth = tunegrid_random_g$max_depth[i],
min_child_weight = tunegrid_random_g$min_child_weight[i],
subsample = tunegrid_random_g$subsample[i],
colsample_bytree = tunegrid_random_g$colsample_bytree[i]
)
set.seed(3230)
# train model
xgb.tune <- xgb.cv(
params = params,
data = train_random_f_g,
label = train_random_r_g,
nrounds = 2000,
nfold = 10,
objective = "reg:squarederror", # for regression models
verbose = 0, # silent,
early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
)
tunegrid_random_g$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
tunegrid_random_g$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
set.seed(3230)
xgb.final <- xgboost(
params = params,
data = train_random_f_g,
label = train_random_r_g,
nrounds = tunegrid_random_g$optimal_trees[i],
objective = "reg:squarederror",
verbose = 0
)
xgb.tune.pred <- predict(xgb.final,train_random_f_g)
xgb.tune.pred.t <- predict(xgb.final,test_random_f_g)
tunegrid_random_g$RMSE_train[i] <- rmse(xgb.tune.pred , train_random_g$scale_vote_share)
tunegrid_random_g$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_random_g$scale_vote_share)
tunegrid_random_g$accuracy_train[i] <- accuracy(train_key_random_g ,xgb.tune.pred)
tunegrid_random_g$accuracy_test[i] <- accuracy(test_key_random_g ,xgb.tune.pred.t)
}
# Finallize
tunegrid_random_g <- tunegrid_random_g %>%
arrange(-accuracy_test,RMSE_test) %>%
head(10)
tunegrid_random_g
params <- list(
eta = tunegrid_random_g$eta[1],
max_depth = tunegrid_random_g$max_depth[1],
min_child_weight = tunegrid_random_g$min_child_weight[1],
subsample = tunegrid_random_g$subsample[1],
colsample_bytree = tunegrid_random_g$colsample_bytree[1]
)
set.seed(3230)
election_xgb_random_g <- xgboost(
params = params,
data = train_random_f_g,
label = train_random_r_g,
nrounds = tunegrid_random_g$optimal_trees[1],
objective = "reg:squarederror",
verbose = 0
)
# create importance matrix
election_xgb_imp_train_random_g <- xgb.importance(model = election_xgb_random_g)
election_xgb_imp_train_random_g <- election_xgb_imp_train_random_g %>%
head(15)
# variable importance plot
ggplot(election_xgb_imp_train_random_g,
aes(x = Gain,
y = reorder(Feature,Gain))) +
scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
geom_col(width=0.8,fill="darkred")+
labs(title="Relative Influence: GB x Random x Second",x="Relative Influence",y="")+
theme_bw()+
theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_random_g.png")
#RMSE
election_xgb_pred_train_random_g <- predict(election_xgb_random_g,train_random_f_g)
election_xgb_pred_test_random_g <- predict(election_xgb_random_g,test_random_f_g)
rmse(election_xgb_pred_train_random_g , train_random_g$scale_vote_share)
rmse(election_xgb_pred_test_random_g , test_random_g$scale_vote_share)
# Accuracy
accuracy(train_key_random_g ,election_xgb_pred_train_random_g)
accuracy(test_key_random_g ,election_xgb_pred_test_random_g)
# Year
tunegrid_year_g <- expand.grid(eta = 0.01,
max_depth = c(1,2,3,4,5),
min_child_weight = 5,
subsample = c(0.5,0.6,0.7),
colsample_bytree = c(0.6,0.7,0.8,0.9),
optimal_trees = 0,
min_RMSE = 0,
RMSE_train = 0,
RMSE_test = 0,
accuracy_train = 0,
accuracy_test = 0)
for(i in 1:nrow(tunegrid_year_g)){
params <- list(
eta = tunegrid_year_g$eta[i],
max_depth = tunegrid_year_g$max_depth[i],
min_child_weight = tunegrid_year_g$min_child_weight[i],
subsample = tunegrid_year_g$subsample[i],
colsample_bytree = tunegrid_year_g$colsample_bytree[i]
)
set.seed(3230)
# train model
xgb.tune <- xgb.cv(
params = params,
data = train_year_f_g,
label = train_year_r_g,
nrounds = 2000,
nfold = 10,
objective = "reg:squarederror", # for regression models
verbose = 0, # silent,
early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
)
tunegrid_year_g$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
tunegrid_year_g$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
set.seed(3230)
xgb.final <- xgboost(
params = params,
data = train_year_f_g,
label = train_year_r_g,
nrounds = tunegrid_year_g$optimal_trees[i],
objective = "reg:squarederror",
verbose = 0
)
xgb.tune.pred <- predict(xgb.final,train_year_f_g)
xgb.tune.pred.t <- predict(xgb.final,test_year_f_g)
tunegrid_year_g$RMSE_train[i] <- rmse(xgb.tune.pred , train_year_g$scale_vote_share)
tunegrid_year_g$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_year_g$scale_vote_share)
tunegrid_year_g$accuracy_train[i] <- accuracy(train_key_year_g ,xgb.tune.pred)
tunegrid_year_g$accuracy_test[i] <- accuracy(test_key_year_g ,xgb.tune.pred.t)
}
# Finallize
tunegrid_year_g <- tunegrid_year_g %>%
arrange(-accuracy_test,RMSE_test) %>%
head(10)
tunegrid_year_g
params <- list(
eta = tunegrid_year_g$eta[1],
max_depth = tunegrid_year_g$max_depth[1],
min_child_weight = tunegrid_year_g$min_child_weight[1],
subsample = tunegrid_year_g$subsample[1],
colsample_bytree = tunegrid_year_g$colsample_bytree[1]
)
set.seed(3230)
election_xgb_year_g <- xgboost(
params = params,
data = train_year_f_g,
label = train_year_r_g,
nrounds = tunegrid_year_g$optimal_trees[1],
objective = "reg:squarederror",
verbose = 0
)
# create importance matrix
election_xgb_imp_train_year_g <- xgb.importance(model = election_xgb_year_g)
election_xgb_imp_train_year_g <- election_xgb_imp_train_year_g %>%
head(15)
# variable importance plot
ggplot(election_xgb_imp_train_year_g,
aes(x = Gain,
y = reorder(Feature,Gain))) +
scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
geom_col(width=0.8,fill="darkred")+
labs(title="Relative Influence: GB x Year x Second",x="Relative Influence",y="")+
theme_bw()+
theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_year_g.png")
#RMSE
election_xgb_pred_train_year_g <- predict(election_xgb_year_g,train_year_f_g)
election_xgb_pred_test_year_g <- predict(election_xgb_year_g,test_year_f_g)
rmse(election_xgb_pred_train_year_g , train_year_g$scale_vote_share)
rmse(election_xgb_pred_test_year_g , test_year_g$scale_vote_share)
# Accuracy
accuracy(train_key_year_g ,election_xgb_pred_train_year_g)
accuracy(test_key_year_g ,election_xgb_pred_test_year_g)
# Neural Net
maxs <- apply(train_random_g, 2, max)
mins <- apply(train_random_g, 2, min)
train_random_1_g <- as.data.frame(scale(train_random_g, center = mins, scale = maxs - mins))
test_random_1_g <- as.data.frame(scale(test_random_g, center = mins, scale = maxs - mins))
maxs <- apply(train_year_g, 2, max)
mins <- apply(train_year_g, 2, min)
train_year_1_g <- as.data.frame(scale(train_year_g, center = mins, scale = maxs - mins))
test_year_1_g <- as.data.frame(scale(test_year_g, center = mins, scale = maxs - mins))
n <- names(train_random_1_g)
f <- as.formula(paste("scale_vote_share ~", paste(n[!n %in% "scale_vote_share"], collapse = " + ")))
set.seed(323)
election_nn_random.10_g <- neuralnet(f, data = train_random_1_g, hidden = c(10),stepmax = 1000000, linear.output = T)
plot(election_nn_random.10_g, fontsize = 10)
quartz.save("election_nn_random.10_g.png", type="png")
election_nn_pred_train_random.10_g <- compute(election_nn_random.10_g, train_random_1_g[,-1])
election_nn_pred_train_random.10_g <- election_nn_pred_train_random.10_g$net.result *
(max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
election_nn_pred_test_random.10_g <- compute(election_nn_random.10_g, test_random_1_g[,-1])
election_nn_pred_test_random.10_g <- election_nn_pred_test_random.10_g$net.result *
(max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
rmse(election_nn_pred_train_random.10_g,train_random_g$scale_vote_share)
rmse(election_nn_pred_test_random.10_g,test_random_g$scale_vote_share)
accuracy(train_key_random_g ,election_nn_pred_train_random.10_g)
accuracy(test_key_random_g ,election_nn_pred_test_random.10_g)
set.seed(323)
election_nn_year.10_g <- neuralnet(f, data = train_year_1_g, hidden = c(10),stepmax = 1000000, linear.output = T)
plot(election_nn_year.10_g, fontsize = 10)
quartz.save("election_nn_year.10_g.png", type="png")
election_nn_pred_train_year.10_g <- compute(election_nn_year.10_g, train_year_1_g[,-1])
election_nn_pred_train_year.10_g <- election_nn_pred_train_year.10_g$net.result *
(max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
election_nn_pred_test_year.10_g <- compute(election_nn_year.10_g, test_year_1_g[,-1])
election_nn_pred_test_year.10_g <- election_nn_pred_test_year.10_g$net.result *
(max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
rmse(election_nn_pred_train_year.10_g,train_year_g$scale_vote_share)
rmse(election_nn_pred_test_year.10_g,test_year_g$scale_vote_share)
accuracy(train_key_year_g ,election_nn_pred_train_year.10_g)
accuracy(test_key_year_g ,election_nn_pred_test_year.10_g)
set.seed(323)
election_nn_random.10.6_g <- neuralnet(f, data = train_random_1_g, hidden = c(10,6),stepmax = 1000000, linear.output = T)
plot(election_nn_random.10.6_g, fontsize = 10)
quartz.save("election_nn_random.10.6_g.png", type="png")
election_nn_pred_train_random.10.6_g <- compute(election_nn_random.10.6_g, train_random_1_g[,-1])
election_nn_pred_train_random.10.6_g <- election_nn_pred_train_random.10.6_g$net.result *
(max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
election_nn_pred_test_random.10.6_g <- compute(election_nn_random.10.6_g, test_random_1_g[,-1])
election_nn_pred_test_random.10.6_g <- election_nn_pred_test_random.10.6_g$net.result *
(max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
rmse(election_nn_pred_train_random.10.6_g,train_random_g$scale_vote_share)
rmse(election_nn_pred_test_random.10.6_g,test_random_g$scale_vote_share)
accuracy(train_key_random_g ,election_nn_pred_train_random.10.6_g)
accuracy(test_key_random_g ,election_nn_pred_test_random.10.6_g)
set.seed(323)
election_nn_year.10.6_g <- neuralnet(f, data = train_year_1_g, hidden = c(10,6),stepmax = 1000000, linear.output = T)
plot(election_nn_year.10.6_g, fontsize = 10)
quartz.save("election_nn_year.10.6_g.png", type="png")
election_nn_pred_train_year.10.6_g <- compute(election_nn_year.10.6_g, train_year_1_g[,-1])
election_nn_pred_train_year.10.6_g <- election_nn_pred_train_year.10.6_g$net.result *
(max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
election_nn_pred_test_year.10.6_g <- compute(election_nn_year.10.6_g, test_year_1_g[,-1])
election_nn_pred_test_year.10.6_g <- election_nn_pred_test_year.10.6_g$net.result *
(max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
rmse(election_nn_pred_train_year.10.6_g,train_year_g$scale_vote_share)
rmse(election_nn_pred_test_year.10.6_g,test_year_g$scale_vote_share)
accuracy(train_key_year_g ,election_nn_pred_train_year.10.6_g)
accuracy(test_key_year_g ,election_nn_pred_test_year.10.6_g)