# Environment
rm(list=ls())
setwd("/Users/yendindin/Library/Mobile Documents/com~apple~CloudDocs/R/ML and Econ")

library(tree)
library(caret)
library(dplyr)
library(randomForest)
library(gbm)
library(neuralnet)
library(xgboost)

data <- readRDS("election_clean.rds")

# Drop only 1 person
data <- data %>%
  filter(vote_share<1)
#######

# Add six city
data <- data %>%
  mutate(six_city=as.integer(area_eng %in% c("Taipei","NTaipei","Taoyuan","Kaohsiung","Taichung","Tainan")))

# Function
rmse = function(actual, predicted) {
  sqrt(mean((actual - predicted) ^ 2))
}

block_split <- function(df,train_per,type){
  if(type=="random"){
    df1 <- df
    df1 <- df1 %>%
      group_by(year,area_eng,vote_area_index) %>%
      mutate(group = cur_group_id()) %>% 
      ungroup()
    sample_train <- sample(unique(df1$group),round(train_per*max(df1$group)),replace=F)
    
    train_df1 <- df1 %>%
      filter(group %in% sample_train) %>%
      select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
    train_key1 <- df1 %>%
      filter(group %in% sample_train) %>%
      select(year,area_eng,vote_area_index,seats,number,win)
    test_df1 <- df1 %>%
      filter(!group %in% sample_train) %>%
      select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
    test_key1 <- df1 %>%
      filter(!group %in% sample_train) %>%
      select(year,area_eng,vote_area_index,seats,number,win)
  }else{
    df1 <- df
    df1 <- df1 %>%
      group_by(year,area_eng,vote_area_index) %>%
      mutate(group = cur_group_id()) %>% 
      ungroup()
    train_df1 <- df1 %>%
      filter(year!=as.integer(type)) %>%
      select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
    train_key1 <- df1 %>%
      filter(year!=as.integer(type)) %>%
      select(year,area_eng,vote_area_index,seats,number,win)
    test_df1 <- df1 %>%
      filter(year==as.integer(type)) %>%
      select(-year,-area_eng,-vote_area_index,-group,-seats,-number,-win)
    test_key1 <- df1 %>%
      filter(year==as.integer(type)) %>%
      select(year,area_eng,vote_area_index,seats,number,win)
  }
  print("Remember to set seeds in previous.")
  return(list(train_df1,test_df1,train_key1,test_key1))
}
# Accuracy
accuracy <- function(key1,predict1){
  res1 <- key1 %>%
    cbind(predict1) %>%
    group_by(year,area_eng,vote_area_index) %>%
    mutate(rank = n()-rank(predict1,ties.method = "random")+1) %>%
    mutate(win_predict = as.integer(rank<=seats)) %>%
    mutate(correct = as.integer(win==win_predict))
  accu <- sum(res1$correct)/nrow(res1)
  return(accu)
}

####### Variarion ######
# Split sample as random block or 99, 103 and test 107
# Predict different outcome
# Put second_gene or not, combine with incumbent
# RMSE and Accuracy


# Use scale vote share, no second_gene

data_1 <- data %>%
  select(c(36,1,2,3,4,12,31,6,9,seq(17,29),seq(39,45)))
set.seed(3230)
data_random <- block_split(data_1,0.7,"random")
train_random <- data_random[[1]]
test_random <- data_random[[2]]
train_key_random <- data_random[[3]]
test_key_random  <- data_random[[4]]

data_year <- block_split(data_1,0.7,"107")
train_year <- data_year[[1]]
test_year <- data_year[[2]]
train_key_year <- data_year[[3]]
test_key_year  <- data_year[[4]]

# OLS
# Random
election_lm_random = lm(scale_vote_share ~ ., train_random)
# RMSE
election_lm_pred_train_random  = predict(election_lm_random , newdata = train_random)
rmse(election_lm_pred_train_random , train_random$scale_vote_share)
election_lm_pred_test_random  = predict(election_lm_random , newdata = test_random)
rmse(election_lm_pred_test_random , test_random$scale_vote_share)
# Accuracy
accuracy(train_key_random ,election_lm_pred_train_random )
accuracy(test_key_random ,election_lm_pred_test_random )

# Year
election_lm_year = lm(scale_vote_share ~ ., train_year)
# RMSE
election_lm_pred_train_year  = predict(election_lm_year , newdata = train_year)
rmse(election_lm_pred_train_year , train_year$scale_vote_share)
election_lm_pred_test_year  = predict(election_lm_year , newdata = test_year)
rmse(election_lm_pred_test_year , test_year$scale_vote_share)
# Accuracy
accuracy(train_key_year ,election_lm_pred_train_year )
accuracy(test_key_year ,election_lm_pred_test_year )

# Random Forest
# Random
control <- trainControl(method = "oob")
tunegrid <- expand.grid(.mtry = seq(1, 15, 1))
set.seed(123)
election_rf_random <- train(scale_vote_share ~ .,
                            data = train_random,
                            method = "rf",
                            metric = "RMSE", 
                            tuneGrid = tunegrid,
                            trControl = control,
                            importance = TRUE,
                            ntree = 1000)
plot(election_rf_random)
importance(election_rf_random$finalModel)
plot(varImp(election_rf_random))

#RMSE
election_rf_pred_train_random <- predict(election_rf_random, newdata = train_random)
rmse(election_rf_pred_train_random , train_random$scale_vote_share)
election_rf_pred_test_random  = predict(election_rf_random , newdata = test_random)
rmse(election_rf_pred_test_random , test_random$scale_vote_share)

# Accuracy
accuracy(train_key_random ,election_rf_pred_train_random )
accuracy(test_key_random ,election_rf_pred_test_random )

# Year
set.seed(123)
election_rf_year <- train(scale_vote_share ~ .,
                            data = train_year,
                            method = "rf",
                            metric = "RMSE", 
                            tuneGrid = tunegrid,
                            trControl = control,
                            importance = TRUE,
                            ntree = 1000)
plot(election_rf_year)
importance(election_rf_year$finalModel)
plot(varImp(election_rf_year))

#RMSE
election_rf_pred_train_year <- predict(election_rf_year, newdata = train_year)
rmse(election_rf_pred_train_year , train_year$scale_vote_share)
election_rf_pred_test_year  = predict(election_rf_year , newdata = test_year)
rmse(election_rf_pred_test_year , test_year$scale_vote_share)

# Accuracy
accuracy(train_key_year ,election_rf_pred_train_year )
accuracy(test_key_year ,election_rf_pred_test_year )


# GBM
train_random_f <- as.matrix(train_random[,-1])
train_random_r <- unlist(train_random[,1])
train_year_f <- as.matrix(train_year[,-1])
train_year_r <- unlist(train_year[,1])

test_random_f <- as.matrix(test_random[,-1])
test_random_r <- unlist(test_random[,1])
test_year_f <- as.matrix(test_year[,-1])
test_year_r <- unlist(test_year[,1])

# Random
tunegrid_random <- expand.grid(eta = 0.01,
                        max_depth = c(1,2,3,4,5),
                        min_child_weight = 5,
                        subsample = c(0.5,0.6,0.7),
                        colsample_bytree = c(0.6,0.7,0.8,0.9),
                        optimal_trees = 0,
                        min_RMSE = 0,
                        RMSE_train = 0,
                        RMSE_test = 0,
                        accuracy_train = 0,
                        accuracy_test = 0)

for(i in 1:nrow(tunegrid_random)){
  params <- list(
    eta = tunegrid_random$eta[i],
    max_depth = tunegrid_random$max_depth[i],
    min_child_weight = tunegrid_random$min_child_weight[i],
    subsample = tunegrid_random$subsample[i],
    colsample_bytree = tunegrid_random$colsample_bytree[i]
  )
  set.seed(3230)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train_random_f,
    label = train_random_r,
    nrounds = 2000,
    nfold = 10,
    objective = "reg:squarederror",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
  )
  tunegrid_random$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  tunegrid_random$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
  set.seed(3230)
  xgb.final <- xgboost(
    params = params,
    data = train_random_f,
    label = train_random_r,
    nrounds = tunegrid_random$optimal_trees[i],
    objective = "reg:squarederror",
    verbose = 0
  )
  xgb.tune.pred <- predict(xgb.final,train_random_f)
  xgb.tune.pred.t <- predict(xgb.final,test_random_f)
  tunegrid_random$RMSE_train[i] <- rmse(xgb.tune.pred , train_random$scale_vote_share)
  tunegrid_random$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_random$scale_vote_share)
  tunegrid_random$accuracy_train[i] <- accuracy(train_key_random ,xgb.tune.pred)
  tunegrid_random$accuracy_test[i] <- accuracy(test_key_random ,xgb.tune.pred.t)
}
# Finallize
tunegrid_random <- tunegrid_random %>%
  arrange(-accuracy_test,RMSE_test) %>%
  head(10)
tunegrid_random
params <- list(
  eta = tunegrid_random$eta[1],
  max_depth = tunegrid_random$max_depth[1],
  min_child_weight = tunegrid_random$min_child_weight[1],
  subsample = tunegrid_random$subsample[1],
  colsample_bytree = tunegrid_random$colsample_bytree[1]
)
set.seed(3230)
election_xgb_random <- xgboost(
  params = params,
  data = train_random_f,
  label = train_random_r,
  nrounds = tunegrid_random$optimal_trees[1],
  objective = "reg:squarederror",
  verbose = 0
)

# create importance matrix
election_xgb_imp_train_random <- xgb.importance(model = election_xgb_random)
election_xgb_imp_train_random <- election_xgb_imp_train_random %>%
  head(15)
# variable importance plot
ggplot(election_xgb_imp_train_random,
       aes(x = Gain,
           y = reorder(Feature,Gain))) +
  scale_x_continuous(limits = c(0,0.26),breaks = seq(0,0.25,0.05))+
  geom_col(width=0.8,fill="Dodgerblue3")+
  labs(title="Relative Influence: GB x random",x="Relative Influence",y="")+
  theme_bw()+
  theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
        plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_random.png")
#RMSE
election_xgb_pred_train_random <- predict(election_xgb_random,train_random_f)
election_xgb_pred_test_random <- predict(election_xgb_random,test_random_f)
rmse(election_xgb_pred_train_random , train_random$scale_vote_share)
rmse(election_xgb_pred_test_random , test_random$scale_vote_share)
# Accuracy
accuracy(train_key_random ,election_xgb_pred_train_random)
accuracy(test_key_random ,election_xgb_pred_test_random)

# Year
tunegrid_year <- expand.grid(eta = 0.01,
                             max_depth = c(1,2,3,4,5),
                             min_child_weight = 5,
                             subsample = c(0.5,0.6,0.7),
                             colsample_bytree = c(0.6,0.7,0.8,0.9),
                             optimal_trees = 0,
                             min_RMSE = 0,
                             RMSE_train = 0,
                             RMSE_test = 0,
                             accuracy_train = 0,
                             accuracy_test = 0)

for(i in 1:nrow(tunegrid_year)){
  params <- list(
    eta = tunegrid_year$eta[i],
    max_depth = tunegrid_year$max_depth[i],
    min_child_weight = tunegrid_year$min_child_weight[i],
    subsample = tunegrid_year$subsample[i],
    colsample_bytree = tunegrid_year$colsample_bytree[i]
  )
  set.seed(3230)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train_year_f,
    label = train_year_r,
    nrounds = 2000,
    nfold = 10,
    objective = "reg:squarederror",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
  )
  tunegrid_year$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  tunegrid_year$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
  set.seed(3230)
  xgb.final <- xgboost(
    params = params,
    data = train_year_f,
    label = train_year_r,
    nrounds = tunegrid_year$optimal_trees[i],
    objective = "reg:squarederror",
    verbose = 0
  )
  xgb.tune.pred <- predict(xgb.final,train_year_f)
  xgb.tune.pred.t <- predict(xgb.final,test_year_f)
  tunegrid_year$RMSE_train[i] <- rmse(xgb.tune.pred , train_year$scale_vote_share)
  tunegrid_year$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_year$scale_vote_share)
  tunegrid_year$accuracy_train[i] <- accuracy(train_key_year ,xgb.tune.pred)
  tunegrid_year$accuracy_test[i] <- accuracy(test_key_year ,xgb.tune.pred.t)
}
# Finallize
tunegrid_year <- tunegrid_year %>%
  arrange(-accuracy_test,RMSE_test) %>%
  head(10)
tunegrid_year
params <- list(
  eta = tunegrid_year$eta[1],
  max_depth = tunegrid_year$max_depth[1],
  min_child_weight = tunegrid_year$min_child_weight[1],
  subsample = tunegrid_year$subsample[1],
  colsample_bytree = tunegrid_year$colsample_bytree[1]
)
set.seed(3230)
election_xgb_year <- xgboost(
  params = params,
  data = train_year_f,
  label = train_year_r,
  nrounds = tunegrid_year$optimal_trees[1],
  objective = "reg:squarederror",
  verbose = 0
)
# create importance matrix
election_xgb_imp_train_year <- xgb.importance(model = election_xgb_year)
election_xgb_imp_train_year <- election_xgb_imp_train_year %>%
  head(15)
# variable importance plot
ggplot(election_xgb_imp_train_year,
       aes(x = Gain,
           y = reorder(Feature,Gain))) +
  scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
  geom_col(width=0.8,fill="Dodgerblue3")+
  labs(title="Relative Influence: GB x Year",x="Relative Influence",y="")+
  theme_bw()+
  theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
        plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_year.png")
#RMSE
election_xgb_pred_train_year <- predict(election_xgb_year,train_year_f)
election_xgb_pred_test_year <- predict(election_xgb_year,test_year_f)
rmse(election_xgb_pred_train_year , train_year$scale_vote_share)
rmse(election_xgb_pred_test_year , test_year$scale_vote_share)
# Accuracy
accuracy(train_key_year ,election_xgb_pred_train_year)
accuracy(test_key_year ,election_xgb_pred_test_year)


# Neural Net

maxs <- apply(train_random, 2, max) 
mins <- apply(train_random, 2, min)
train_random_1 <- as.data.frame(scale(train_random, center = mins, scale = maxs - mins))
test_random_1 <- as.data.frame(scale(test_random, center = mins, scale = maxs - mins))

maxs <- apply(train_year, 2, max) 
mins <- apply(train_year, 2, min)
train_year_1 <- as.data.frame(scale(train_year, center = mins, scale = maxs - mins))
test_year_1 <- as.data.frame(scale(test_year, center = mins, scale = maxs - mins))

n <- names(train_random_1)
f <- as.formula(paste("scale_vote_share ~", paste(n[!n %in% "scale_vote_share"], collapse = " + ")))

set.seed(323)
election_nn_random.10 <- neuralnet(f, data = train_random_1, hidden = c(10),stepmax = 500000, linear.output = T)

plot(election_nn_random.10, fontsize = 10)
quartz.save("(election_nn_random.10.png", type="png")

election_nn_pred_train_random.10 <- compute(election_nn_random.10, train_random_1[,-1])
election_nn_pred_train_random.10 <- election_nn_pred_train_random.10$net.result *
  (max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
election_nn_pred_test_random.10 <- compute(election_nn_random.10, test_random_1[,-1])
election_nn_pred_test_random.10 <- election_nn_pred_test_random.10$net.result *
  (max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)

rmse(election_nn_pred_train_random.10,train_random$scale_vote_share)
rmse(election_nn_pred_test_random.10,test_random$scale_vote_share)
accuracy(train_key_random ,election_nn_pred_train_random.10)
accuracy(test_key_random ,election_nn_pred_test_random.10)

set.seed(323)
election_nn_year.10 <- neuralnet(f, data = train_year_1, hidden = c(10),stepmax = 500000, linear.output = T)

plot(election_nn_year.10, fontsize = 10)
quartz.save("(election_nn_year.10.png", type="png")

election_nn_pred_train_year.10 <- compute(election_nn_year.10, train_year_1[,-1])
election_nn_pred_train_year.10 <- election_nn_pred_train_year.10$net.result *
  (max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
election_nn_pred_test_year.10 <- compute(election_nn_year.10, test_year_1[,-1])
election_nn_pred_test_year.10 <- election_nn_pred_test_year.10$net.result *
  (max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)

rmse(election_nn_pred_train_year.10,train_year$scale_vote_share)
rmse(election_nn_pred_test_year.10,test_year$scale_vote_share)
accuracy(train_key_year ,election_nn_pred_train_year.10)
accuracy(test_key_year ,election_nn_pred_test_year.10)

set.seed(323)
election_nn_random.10.6 <- neuralnet(f, data = train_random_1, hidden = c(10,6),stepmax = 500000, linear.output = T)

plot(election_nn_random.10.6, fontsize = 10)
quartz.save("(election_nn_random.10.6.png", type="png")

election_nn_pred_train_random.10.6 <- compute(election_nn_random.10.6, train_random_1[,-1])
election_nn_pred_train_random.10.6 <- election_nn_pred_train_random.10.6$net.result *
  (max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)
election_nn_pred_test_random.10.6 <- compute(election_nn_random.10.6, test_random_1[,-1])
election_nn_pred_test_random.10.6 <- election_nn_pred_test_random.10.6$net.result *
  (max(train_random$scale_vote_share) - min(train_random$scale_vote_share)) + min(train_random$scale_vote_share)

rmse(election_nn_pred_train_random.10.6,train_random$scale_vote_share)
rmse(election_nn_pred_test_random.10.6,test_random$scale_vote_share)
accuracy(train_key_random ,election_nn_pred_train_random.10.6)
accuracy(test_key_random ,election_nn_pred_test_random.10.6)

set.seed(323)
election_nn_year.10.6 <- neuralnet(f, data = train_year_1, hidden = c(10,6),stepmax = 500000, linear.output = T)

plot(election_nn_year.10.6, fontsize = 10)
quartz.save("(election_nn_year.10.6.png", type="png")

election_nn_pred_train_year.10.6 <- compute(election_nn_year.10.6, train_year_1[,-1])
election_nn_pred_train_year.10.6 <- election_nn_pred_train_year.10.6$net.result *
  (max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)
election_nn_pred_test_year.10.6 <- compute(election_nn_year.10.6, test_year_1[,-1])
election_nn_pred_test_year.10.6 <- election_nn_pred_test_year.10.6$net.result *
  (max(train_year$scale_vote_share) - min(train_year$scale_vote_share)) + min(train_year$scale_vote_share)

rmse(election_nn_pred_train_year.10.6,train_year$scale_vote_share)
rmse(election_nn_pred_test_year.10.6,test_year$scale_vote_share)
accuracy(train_key_year ,election_nn_pred_train_year.10.6)
accuracy(test_key_year ,election_nn_pred_test_year.10.6)

# Add second generation with incumbent
data$incumbent_or_2_gene <- ifelse(data$incumbent+data$second_gene!=0,1,0)
# Use scale vote share, no second_gene

data_1_g <- data %>%
  select(c(36,1,2,3,4,12,31,6,46,seq(17,29),seq(39,45)))
set.seed(3230)
data_random_g <- block_split(data_1_g,0.7,"random")
train_random_g <- data_random_g[[1]]
test_random_g <- data_random_g[[2]]
train_key_random_g <- data_random_g[[3]]
test_key_random_g  <- data_random_g[[4]]

data_year_g <- block_split(data_1_g,0.7,"107")
train_year_g <- data_year_g[[1]]
test_year_g <- data_year_g[[2]]
train_key_year_g <- data_year_g[[3]]
test_key_year_g  <- data_year_g[[4]]

# OLS
# Random
election_lm_random_g = lm(scale_vote_share ~ ., train_random_g)
# RMSE
election_lm_pred_train_random_g  = predict(election_lm_random_g , newdata = train_random_g)
rmse(election_lm_pred_train_random_g , train_random_g$scale_vote_share)
election_lm_pred_test_random_g  = predict(election_lm_random_g , newdata = test_random_g)
rmse(election_lm_pred_test_random_g , test_random_g$scale_vote_share)
# Accuracy
accuracy(train_key_random_g ,election_lm_pred_train_random_g )
accuracy(test_key_random_g ,election_lm_pred_test_random_g )

# Year
election_lm_year_g = lm(scale_vote_share ~ ., train_year_g)
# RMSE
election_lm_pred_train_year_g  = predict(election_lm_year_g , newdata = train_year_g)
rmse(election_lm_pred_train_year_g , train_year_g$scale_vote_share)
election_lm_pred_test_year_g  = predict(election_lm_year_g , newdata = test_year_g)
rmse(election_lm_pred_test_year_g , test_year_g$scale_vote_share)
# Accuracy
accuracy(train_key_year_g ,election_lm_pred_train_year_g )
accuracy(test_key_year_g ,election_lm_pred_test_year_g )

# Random Forest
# Random
control <- trainControl(method = "oob")
tunegrid <- expand.grid(.mtry = seq(1, 15, 1))
set.seed(123)
election_rf_random_g <- train(scale_vote_share ~ .,
                            data = train_random_g,
                            method = "rf",
                            metric = "RMSE", 
                            tuneGrid = tunegrid,
                            trControl = control,
                            importance = TRUE,
                            ntree = 1000)
plot(election_rf_random_g)
importance(election_rf_random_g$finalModel)
plot(varImp(election_rf_random_g))

#RMSE
election_rf_pred_train_random_g <- predict(election_rf_random_g, newdata = train_random_g)
rmse(election_rf_pred_train_random_g , train_random_g$scale_vote_share)
election_rf_pred_test_random_g  <- predict(election_rf_random_g , newdata = test_random_g)
rmse(election_rf_pred_test_random_g , test_random_g$scale_vote_share)

# Accuracy
accuracy(train_key_random_g ,election_rf_pred_train_random_g )
accuracy(test_key_random_g ,election_rf_pred_test_random_g )

# Year
set.seed(123)
election_rf_year_g <- train(scale_vote_share ~ .,
                            data = train_year_g,
                            method = "rf",
                            metric = "RMSE", 
                            tuneGrid = tunegrid,
                            trControl = control,
                            importance = TRUE,
                            ntree = 1000)
plot(election_rf_year_g)
importance(election_rf_year_g$finalModel)
plot(varImp(election_rf_year_g))

#RMSE
election_rf_pred_train_year_g <- predict(election_rf_year_g, newdata = train_year_g)
rmse(election_rf_pred_train_year_g , train_year_g$scale_vote_share)
election_rf_pred_test_year_g <- predict(election_rf_year_g , newdata = test_year_g)
rmse(election_rf_pred_test_year_g , test_year_g$scale_vote_share)

# Accuracy
accuracy(train_key_year_g ,election_rf_pred_train_year_g )
accuracy(test_key_year_g ,election_rf_pred_test_year_g )

# GBM
train_random_f_g <- as.matrix(train_random_g[,-1])
train_random_r_g <- unlist(train_random_g[,1])
train_year_f_g <- as.matrix(train_year_g[,-1])
train_year_r_g <- unlist(train_year_g[,1])

test_random_f_g <- as.matrix(test_random_g[,-1])
test_random_r_g <- unlist(test_random_g[,1])
test_year_f_g <- as.matrix(test_year_g[,-1])
test_year_r_g <- unlist(test_year_g[,1])

# Random
tunegrid_random_g <- expand.grid(eta = 0.01,
                                 max_depth = c(1,2,3,4,5),
                                 min_child_weight = 5,
                                 subsample = c(0.5,0.6,0.7),
                                 colsample_bytree = c(0.6,0.7,0.8,0.9),
                                 optimal_trees = 0,
                                 min_RMSE = 0,
                                 RMSE_train = 0,
                                 RMSE_test = 0,
                                 accuracy_train = 0,
                                 accuracy_test = 0)

for(i in 1:nrow(tunegrid_random_g)){
  params <- list(
    eta = tunegrid_random_g$eta[i],
    max_depth = tunegrid_random_g$max_depth[i],
    min_child_weight = tunegrid_random_g$min_child_weight[i],
    subsample = tunegrid_random_g$subsample[i],
    colsample_bytree = tunegrid_random_g$colsample_bytree[i]
  )
  set.seed(3230)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train_random_f_g,
    label = train_random_r_g,
    nrounds = 2000,
    nfold = 10,
    objective = "reg:squarederror",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
  )
  tunegrid_random_g$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  tunegrid_random_g$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
  set.seed(3230)
  xgb.final <- xgboost(
    params = params,
    data = train_random_f_g,
    label = train_random_r_g,
    nrounds = tunegrid_random_g$optimal_trees[i],
    objective = "reg:squarederror",
    verbose = 0
  )
  xgb.tune.pred <- predict(xgb.final,train_random_f_g)
  xgb.tune.pred.t <- predict(xgb.final,test_random_f_g)
  tunegrid_random_g$RMSE_train[i] <- rmse(xgb.tune.pred , train_random_g$scale_vote_share)
  tunegrid_random_g$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_random_g$scale_vote_share)
  tunegrid_random_g$accuracy_train[i] <- accuracy(train_key_random_g ,xgb.tune.pred)
  tunegrid_random_g$accuracy_test[i] <- accuracy(test_key_random_g ,xgb.tune.pred.t)
}
# Finallize
tunegrid_random_g <- tunegrid_random_g %>%
  arrange(-accuracy_test,RMSE_test) %>%
  head(10)
tunegrid_random_g
params <- list(
  eta = tunegrid_random_g$eta[1],
  max_depth = tunegrid_random_g$max_depth[1],
  min_child_weight = tunegrid_random_g$min_child_weight[1],
  subsample = tunegrid_random_g$subsample[1],
  colsample_bytree = tunegrid_random_g$colsample_bytree[1]
)
set.seed(3230)
election_xgb_random_g <- xgboost(
  params = params,
  data = train_random_f_g,
  label = train_random_r_g,
  nrounds = tunegrid_random_g$optimal_trees[1],
  objective = "reg:squarederror",
  verbose = 0
)
# create importance matrix
election_xgb_imp_train_random_g <- xgb.importance(model = election_xgb_random_g)
election_xgb_imp_train_random_g <- election_xgb_imp_train_random_g %>%
  head(15)
# variable importance plot
ggplot(election_xgb_imp_train_random_g,
       aes(x = Gain,
           y = reorder(Feature,Gain))) +
  scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
  geom_col(width=0.8,fill="darkred")+
  labs(title="Relative Influence: GB x Random x Second",x="Relative Influence",y="")+
  theme_bw()+
  theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
        plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_random_g.png")
#RMSE
election_xgb_pred_train_random_g <- predict(election_xgb_random_g,train_random_f_g)
election_xgb_pred_test_random_g <- predict(election_xgb_random_g,test_random_f_g)
rmse(election_xgb_pred_train_random_g , train_random_g$scale_vote_share)
rmse(election_xgb_pred_test_random_g , test_random_g$scale_vote_share)
# Accuracy
accuracy(train_key_random_g ,election_xgb_pred_train_random_g)
accuracy(test_key_random_g ,election_xgb_pred_test_random_g)

# Year
tunegrid_year_g <- expand.grid(eta = 0.01,
                               max_depth = c(1,2,3,4,5),
                               min_child_weight = 5,
                               subsample = c(0.5,0.6,0.7),
                               colsample_bytree = c(0.6,0.7,0.8,0.9),
                               optimal_trees = 0,
                               min_RMSE = 0,
                               RMSE_train = 0,
                               RMSE_test = 0,
                               accuracy_train = 0,
                               accuracy_test = 0)

for(i in 1:nrow(tunegrid_year_g)){
  params <- list(
    eta = tunegrid_year_g$eta[i],
    max_depth = tunegrid_year_g$max_depth[i],
    min_child_weight = tunegrid_year_g$min_child_weight[i],
    subsample = tunegrid_year_g$subsample[i],
    colsample_bytree = tunegrid_year_g$colsample_bytree[i]
  )
  set.seed(3230)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train_year_f_g,
    label = train_year_r_g,
    nrounds = 2000,
    nfold = 10,
    objective = "reg:squarederror",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 20 # stop if no improvement for 10 consecutive trees
  )
  tunegrid_year_g$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  tunegrid_year_g$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
  set.seed(3230)
  xgb.final <- xgboost(
    params = params,
    data = train_year_f_g,
    label = train_year_r_g,
    nrounds = tunegrid_year_g$optimal_trees[i],
    objective = "reg:squarederror",
    verbose = 0
  )
  xgb.tune.pred <- predict(xgb.final,train_year_f_g)
  xgb.tune.pred.t <- predict(xgb.final,test_year_f_g)
  tunegrid_year_g$RMSE_train[i] <- rmse(xgb.tune.pred , train_year_g$scale_vote_share)
  tunegrid_year_g$RMSE_test[i] <- rmse(xgb.tune.pred.t , test_year_g$scale_vote_share)
  tunegrid_year_g$accuracy_train[i] <- accuracy(train_key_year_g ,xgb.tune.pred)
  tunegrid_year_g$accuracy_test[i] <- accuracy(test_key_year_g ,xgb.tune.pred.t)
}
# Finallize
tunegrid_year_g <- tunegrid_year_g %>%
  arrange(-accuracy_test,RMSE_test) %>%
  head(10)
tunegrid_year_g
params <- list(
  eta = tunegrid_year_g$eta[1],
  max_depth = tunegrid_year_g$max_depth[1],
  min_child_weight = tunegrid_year_g$min_child_weight[1],
  subsample = tunegrid_year_g$subsample[1],
  colsample_bytree = tunegrid_year_g$colsample_bytree[1]
)
set.seed(3230)
election_xgb_year_g <- xgboost(
  params = params,
  data = train_year_f_g,
  label = train_year_r_g,
  nrounds = tunegrid_year_g$optimal_trees[1],
  objective = "reg:squarederror",
  verbose = 0
)
# create importance matrix
election_xgb_imp_train_year_g <- xgb.importance(model = election_xgb_year_g)
election_xgb_imp_train_year_g <- election_xgb_imp_train_year_g %>%
  head(15)
# variable importance plot
ggplot(election_xgb_imp_train_year_g,
       aes(x = Gain,
           y = reorder(Feature,Gain))) +
  scale_x_continuous(limits = c(0,0.3),breaks = seq(0,0.3,0.05))+
  geom_col(width=0.8,fill="darkred")+
  labs(title="Relative Influence: GB x Year x Second",x="Relative Influence",y="")+
  theme_bw()+
  theme(text = element_text(size=14, face="bold" ,family="Helvetica"),
        plot.title = element_text(size = 16, hjust = 0.5))
ggsave("election_xgb_imp_train_year_g.png")
#RMSE
election_xgb_pred_train_year_g <- predict(election_xgb_year_g,train_year_f_g)
election_xgb_pred_test_year_g <- predict(election_xgb_year_g,test_year_f_g)
rmse(election_xgb_pred_train_year_g , train_year_g$scale_vote_share)
rmse(election_xgb_pred_test_year_g , test_year_g$scale_vote_share)
# Accuracy
accuracy(train_key_year_g ,election_xgb_pred_train_year_g)
accuracy(test_key_year_g ,election_xgb_pred_test_year_g)


# Neural Net

maxs <- apply(train_random_g, 2, max) 
mins <- apply(train_random_g, 2, min)
train_random_1_g <- as.data.frame(scale(train_random_g, center = mins, scale = maxs - mins))
test_random_1_g <- as.data.frame(scale(test_random_g, center = mins, scale = maxs - mins))

maxs <- apply(train_year_g, 2, max) 
mins <- apply(train_year_g, 2, min)
train_year_1_g <- as.data.frame(scale(train_year_g, center = mins, scale = maxs - mins))
test_year_1_g <- as.data.frame(scale(test_year_g, center = mins, scale = maxs - mins))

n <- names(train_random_1_g)
f <- as.formula(paste("scale_vote_share ~", paste(n[!n %in% "scale_vote_share"], collapse = " + ")))

set.seed(323)
election_nn_random.10_g <- neuralnet(f, data = train_random_1_g, hidden = c(10),stepmax = 1000000, linear.output = T)

plot(election_nn_random.10_g, fontsize = 10)
quartz.save("election_nn_random.10_g.png", type="png")

election_nn_pred_train_random.10_g <- compute(election_nn_random.10_g, train_random_1_g[,-1])
election_nn_pred_train_random.10_g <- election_nn_pred_train_random.10_g$net.result *
  (max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
election_nn_pred_test_random.10_g <- compute(election_nn_random.10_g, test_random_1_g[,-1])
election_nn_pred_test_random.10_g <- election_nn_pred_test_random.10_g$net.result *
  (max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)

rmse(election_nn_pred_train_random.10_g,train_random_g$scale_vote_share)
rmse(election_nn_pred_test_random.10_g,test_random_g$scale_vote_share)
accuracy(train_key_random_g ,election_nn_pred_train_random.10_g)
accuracy(test_key_random_g ,election_nn_pred_test_random.10_g)

set.seed(323)
election_nn_year.10_g <- neuralnet(f, data = train_year_1_g, hidden = c(10),stepmax = 1000000, linear.output = T)

plot(election_nn_year.10_g, fontsize = 10)
quartz.save("election_nn_year.10_g.png", type="png")

election_nn_pred_train_year.10_g <- compute(election_nn_year.10_g, train_year_1_g[,-1])
election_nn_pred_train_year.10_g <- election_nn_pred_train_year.10_g$net.result *
  (max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
election_nn_pred_test_year.10_g <- compute(election_nn_year.10_g, test_year_1_g[,-1])
election_nn_pred_test_year.10_g <- election_nn_pred_test_year.10_g$net.result *
  (max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)

rmse(election_nn_pred_train_year.10_g,train_year_g$scale_vote_share)
rmse(election_nn_pred_test_year.10_g,test_year_g$scale_vote_share)
accuracy(train_key_year_g ,election_nn_pred_train_year.10_g)
accuracy(test_key_year_g ,election_nn_pred_test_year.10_g)

set.seed(323)
election_nn_random.10.6_g <- neuralnet(f, data = train_random_1_g, hidden = c(10,6),stepmax = 1000000, linear.output = T)

plot(election_nn_random.10.6_g, fontsize = 10)
quartz.save("election_nn_random.10.6_g.png", type="png")

election_nn_pred_train_random.10.6_g <- compute(election_nn_random.10.6_g, train_random_1_g[,-1])
election_nn_pred_train_random.10.6_g <- election_nn_pred_train_random.10.6_g$net.result *
  (max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)
election_nn_pred_test_random.10.6_g <- compute(election_nn_random.10.6_g, test_random_1_g[,-1])
election_nn_pred_test_random.10.6_g <- election_nn_pred_test_random.10.6_g$net.result *
  (max(train_random_g$scale_vote_share) - min(train_random_g$scale_vote_share)) + min(train_random_g$scale_vote_share)

rmse(election_nn_pred_train_random.10.6_g,train_random_g$scale_vote_share)
rmse(election_nn_pred_test_random.10.6_g,test_random_g$scale_vote_share)
accuracy(train_key_random_g ,election_nn_pred_train_random.10.6_g)
accuracy(test_key_random_g ,election_nn_pred_test_random.10.6_g)

set.seed(323)
election_nn_year.10.6_g <- neuralnet(f, data = train_year_1_g, hidden = c(10,6),stepmax = 1000000, linear.output = T)

plot(election_nn_year.10.6_g, fontsize = 10)
quartz.save("election_nn_year.10.6_g.png", type="png")

election_nn_pred_train_year.10.6_g <- compute(election_nn_year.10.6_g, train_year_1_g[,-1])
election_nn_pred_train_year.10.6_g <- election_nn_pred_train_year.10.6_g$net.result *
  (max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)
election_nn_pred_test_year.10.6_g <- compute(election_nn_year.10.6_g, test_year_1_g[,-1])
election_nn_pred_test_year.10.6_g <- election_nn_pred_test_year.10.6_g$net.result *
  (max(train_year_g$scale_vote_share) - min(train_year_g$scale_vote_share)) + min(train_year_g$scale_vote_share)

rmse(election_nn_pred_train_year.10.6_g,train_year_g$scale_vote_share)
rmse(election_nn_pred_test_year.10.6_g,test_year_g$scale_vote_share)
accuracy(train_key_year_g ,election_nn_pred_train_year.10.6_g)
accuracy(test_key_year_g ,election_nn_pred_test_year.10.6_g)