1 Data Preprocession

# 读取名为"数据.xlsx"的Excel文件
data <- read_xlsx("数据.xlsx")

# 读取名为"优劣解距离法计算结果.csv"的CSV文件
data.response <- read.csv("优劣解距离法计算结果博弈_ec1de02e-9ba6-4fa8-aece-d52128d9e13a.csv")

# 将"data.response"数据框中的"综合得分指数"列添加到"data"数据框中,并命名为"Y"
data %>% mutate(Y = data.response$综合得分指数) -> data

# 从"data"数据框中移除"编码"和"ID"列
data %>% select(-`编码`, -ID) -> data

# 从"data"数据框中移除"Y"列,并将结果转换为矩阵
data %>% select(-Y) %>% as.matrix() -> X

# 对矩阵X进行标准化处理
X <- scale(X)

# 从"data"数据框中选择"Y"列,并将结果转换为矩阵
data %>% select(Y) %>% as.matrix() -> Y
# 设置随机数种子,以确保结果的可重复性
set.seed(1)

# 获取矩阵X的行数,即样本数量
ns <- nrow(X)

# 生成一个随机的索引,用于随机抽样数据
idx <- sample(1:ns, ns, replace = F)

# 计算训练集的样本数量,占总样本的70%
ntr <- floor(ns * 0.7)

# 计算验证集的样本数量,占总样本的10%
nval <- floor(ns * 0.1)

# 计算测试集的样本数量,占总样本的剩余部分
nte <- ns - ntr - nval

# 根据计算出的样本数量,从随机索引中分割出训练集的索引
idx.tr <- idx[1:ntr]

# 从随机索引中分割出验证集的索引
idx.val <- idx[(ntr+1):(ntr+nval)]

# 从随机索引中分割出测试集的索引
idx.te <- idx[(ntr+nval+1):(ntr+nval+nte)]

# 根据训练集的索引,从X矩阵中选取对应的特征数据
X.tr <- X[idx.tr,]

# 根据验证集的索引,从X矩阵中选取对应的特征数据
X.val <- X[idx.val,]

# 根据测试集的索引,从X矩阵中选取对应的特征数据
X.te <- X[idx.te,]

# 根据训练集的索引,从Y矩阵中选取对应的标签数据
Y.tr <- Y[idx.tr,]

# 根据验证集的索引,从Y矩阵中选取对应的标签数据
Y.val <- Y[idx.val,]

# 根据测试集的索引,从Y矩阵中选取对应的标签数据
Y.te <- Y[idx.te,]

2 Define objective function to minimize

2.1 utility kernel function

# 定义一个径向基函数(RBF)核
rbf_kernel <- function(x1, x2, sigma){
  # 计算x1和x2之间的欧几里得距离的平方,然后除以2倍的sigma的平方
  # 最后计算指数函数,得到RBF核的值
  exp(-sum((x1 - x2) ** 2) / (2 * sigma ** 2))
}

# 定义一个多项式核
poly_kernel <- function(x1, x2, p){
  # 将x1和x2转换为列向量,计算它们的点积,然后加上1
  # 最后将结果的p次幂,得到多项式核的值
  (matrix(x1, nrow = 1) %*% matrix(x2, ncol = 1) + 1) ** p
}

# 定义一个加权核,它是RBF核和多项式核的加权组合
weighted_kernel <- function(x1, x2, mu, sigma, p){
  # 计算多项式核的值
  poly_value <- poly_kernel(x1, x2, p)
  # 计算RBF核的值
  rbf_value <- rbf_kernel(x1, x2, sigma)
  # 将多项式核的值乘以权重mu,RBF核的值乘以(1-mu)
  # 然后将两者相加,得到加权核的值
  mu * poly_value + (1-mu) * rbf_value
}

2.2 objective function

obj <- function(x, return_seq = F){
  # 从参数x中提取mu, sigma和p
  mu <- x[1]
  sigma <- x[2]
  # p取整,因为多项式度数通常为整数
  p <- round(x[3])
  # 获取训练集的样本数量
  ns <- nrow(X.tr)
  # 获取训练集的特征数量
  nf <- ncol(X.tr)
  set.seed(1)
  W1 <- rnorm(nL * nf) %>% matrix(nrow = nf, ncol = nL)
  g <- function(X, W1){
    H <- matrix(NA, nrow = dim(X)[1], ncol = dim(W1)[2])
    for (i in 1:dim(X)[1]){
      for (j in 1:dim(W1)[2]){
        x1 <- X[i,]
        x2 <- W1[,j]
        H[i,j] <- weighted_kernel(x1, x2, mu, sigma, p)
      }
    }
    H
  }
  H.tr <- g(X.tr, W1)
  H.tr.pinv <- pinv(H.tr)
  W2 <- H.tr.pinv %*% Y.tr
  
  # Prediction
  H.val <- g(X.val, W1)
  H.te <- g(X.te, W1)
  pred.tr <- H.tr %*% W2
  pred.val <- H.val %*% W2
  pred.te <- H.te %*% W2
  
  # Evaluation
  RMSE.tr <- mean((Y.tr - pred.tr) ** 2) ** (1/2)
  RMSE.val <- mean((Y.val - pred.val) ** 2) ** (1/2)
  RMSE.te <- mean((Y.te - pred.te) ** 2) ** (1/2)
  
  if (return_seq){
    res.list <- list()
    data.frame(real = as.numeric(Y.tr), 
               pred = as.numeric(pred.tr)) -> res.list[["train"]]
    data.frame(real = as.numeric(Y.val), 
               pred = as.numeric(pred.val)) -> res.list[["validation"]]
    data.frame(real = as.numeric(Y.te), 
               pred = as.numeric(pred.te)) -> res.list[["test"]]
    return(res.list)
  } else{
    return(RMSE.val)
  }
}

3 Optimization

lower_bound <- c(0.01, 1e-1, 1)
upper_bound <- c(0.99, 1e+2, 5)
initial_sol <- c(0.5, 1, 2)
nL <- 60

3.1 Particle Swarm Optimization (PSO)

3.1.1 PSO-MK-ELM

# 设置随机数种子,以确保结果+的可重复性
set.seed(1)

# 初始化粒子的位置,这里使用一个包含3个NA值的向量
# 这表示粒子的初始位置是未知的,需要优化算法来确定
psoptim(rep(NA, 3), obj, 
        # 设置粒子位置的下界和上界
        lower = lower_bound, upper = upper_bound, 
        # 设置优化算法的控制参数
        control = list(maxf = 100)) -> res.PSO

# 将优化结果保存到RDS文件中
saveRDS(res.PSO, "results/res.PSO.RDS")
# 从RDS文件中读取PSO优化结果
res.PSO <- readRDS("results/res.PSO.RDS")

# 从结果对象中提取最优参数
par.PSO <- res.PSO$par

# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.PSO)

# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.PSO, return_seq = T)

idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  # 处理测试集的结果,添加索引并按索引排序
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.PSO
  # 将处理后的测试集结果保存为CSV文件
  res.df.PSO %>% write.csv(sprintf("results/PSO-MK-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.1.2 PSO-ELM

obj2 <- function(x, return_seq=F){
  W2 <- x %>% matrix(ncol = 1)
  # 获取训练集的样本数量
  ns <- nrow(X.tr)
  # 获取训练集的特征数量
  nf <- ncol(X.tr)
  set.seed(1)
  W1 <- rnorm(nL * nf) %>% matrix(nrow = nf, ncol = nL)
  # H.tr <- g(X.tr, W1)
  H.tr <- sigmoid(X.tr %*% W1)
  H.tr.pinv <- pinv(H.tr)
  # W2 <- H.tr.pinv %*% Y.tr
  
  # Prediction
  H.val <- sigmoid(X.val %*% W1)
  H.te <- sigmoid(X.te %*% W1)
  pred.tr <- H.tr %*% W2
  pred.val <- H.val %*% W2
  pred.te <- H.te %*% W2
  
  # Evaluation
  RMSE.tr <- mean((Y.tr - pred.tr) ** 2) ** (1/2)
  RMSE.val <- mean((Y.val - pred.val) ** 2) ** (1/2)
  RMSE.te <- mean((Y.te - pred.te) ** 2) ** (1/2)
  
  if (return_seq){
    res.list <- list()
    data.frame(real = as.numeric(Y.tr), 
               pred = as.numeric(pred.tr)) -> res.list[["train"]]
    data.frame(real = as.numeric(Y.val), 
               pred = as.numeric(pred.val)) -> res.list[["validation"]]
    data.frame(real = as.numeric(Y.te), 
               pred = as.numeric(pred.te)) -> res.list[["test"]]
    return(res.list)
  } else{
    return(RMSE.tr)
  }
}
set.seed(1)
psoptim(rep(NA, nL), obj2, 
        # 设置粒子位置的下界和上界
        lower = rep(-0.1, nL), upper = rep(0.1, nL), 
        # 设置优化算法的控制参数
        control = list(maxf = 10000)) -> res.PSO2
saveRDS(res.PSO2, "results/res.PSO2.RDS")
res.PSO2 <- readRDS("results/res.PSO2.RDS")
par.PSO2 <- res.PSO2$par
res.list <- obj2(par.PSO2, return_seq = T)
idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.PSO2
  res.df.PSO2 %>% write.csv(sprintf("results/PSO-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.2 Genetic Algorithms (GA)

3.2.1 GA-MK-ELM

# 设置随机数种子,以确保结果的可重复性
set.seed(1)

# 调用遗传算法函数,进行优化
ga(type = "real-valued", 
   fitness = function(x){-obj(x)}, 
   lower = lower_bound, upper = upper_bound, 
   maxiter = 10, monitor = F) -> res.GA

# 将遗传算法的结果保存到RDS文件中
saveRDS(res.GA, "results/res.GA.RDS")
# 从RDS文件中读取遗传算法优化结果
res.GA <- readRDS("results/res.GA.RDS")

# 从结果对象中提取最优参数
par.GA <- res.GA@solution

# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.GA)

# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.GA, return_seq = T)

for (ds_type in c("train", "validation", "test")){
  # 处理测试集的结果,添加索引并按索引排序
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.GA
  # 将处理后的测试集结果保存为CSV文件
  res.df.GA %>% write.csv(sprintf("results/GA-MK-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.2.2 GA-ELM

set.seed(1)
ga(type = "real-valued", 
   fitness = function(x){-obj2(x)}, 
   lower = rep(-0.1, nL), upper = rep(0.1, nL), 
   maxiter = 1000, monitor = F) -> res.GA2

saveRDS(res.GA2, "results/res.GA2.RDS")
res.GA2 <- readRDS("results/res.GA2.RDS")
par.GA2 <- res.GA2@solution
res.list <- obj2(par.GA2, return_seq = T)
idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) %>% 
    write.csv(sprintf("results/GA-ELM_%s_dataset.csv", 
                      ds_type), row.names = F)
}

3.3 Simulated Annealing (SA)

3.3.1 SA-MK-ELM

# 设置随机数种子,以确保结果的可重复性
set.seed(1)

# 调用模拟退火算法函数,进行优化
res.SA <- optim_sa(obj, start = initial_sol, 
                   lower = lower_bound, 
                   upper = upper_bound, control = list(nlimit = 10))

# 将模拟退火算法的结果保存到RDS文件中
saveRDS(res.SA, "results/res.SA.RDS")
# 从RDS文件中读取模拟退火算法优化结果
res.SA <- readRDS("results/res.SA.RDS")

# 从结果对象中提取最优参数
par.SA <- res.SA$par

# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.SA)

# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.SA, return_seq = T)

for (ds_type in c("train", "validation", "test")){
  # 处理测试集的结果,添加索引并按索引排序
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.SA
  # 将处理后的测试集结果保存为CSV文件
  res.df.SA %>% write.csv(sprintf("results/SA-MK-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.3.2 SA-ELM

set.seed(1)
res.SA2 <- optim_sa(obj2, start = rep(0, nL), 
                    lower = rep(-0.1, nL), upper = rep(0.1, nL), 
                    control = list(nlimit = 1000))
saveRDS(res.SA2, "results/res.SA2.RDS")
res.SA2 <- readRDS("results/res.SA2.RDS")
par.SA2 <- res.SA2$par
res.list <- obj2(par.SA2, return_seq = T)
idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) %>% 
    write.csv(sprintf("results/SA-ELM_%s_dataset.csv", 
                      ds_type), row.names = F)
}

3.4 Artificial Bee Colony (ABC)

3.4.1 ABC-MK-ELM

# 设置随机数种子,以确保结果的可重复性
set.seed(1)

# 调用人工蜂群算法函数,进行优化
res.ABC <- abc_optim(par = initial_sol, fn = obj, 
                     lb = lower_bound, ub = upper_bound, maxCycle = 10)

# 将人工蜂群算法的结果保存到RDS文件中
saveRDS(res.ABC, "results/res.ABC.RDS")
# 从RDS文件中读取人工蜂群算法优化结果
res.ABC <- readRDS("results/res.ABC.RDS")

# 从结果对象中提取最优参数
par.ABC <- res.ABC$par

# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.ABC)

# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.ABC, return_seq = T)

for (ds_type in c("train", "validation", "test")){
  # 处理测试集的结果,添加索引并按索引排序
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.ABC
  # 将处理后的测试集结果保存为CSV文件
  res.df.ABC %>% write.csv(sprintf("results/ABC-MK-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.4.2 ABC-ELM

set.seed(1)
res.ABC2 <- abc_optim(par = rep(0, nL), fn = obj2, 
                      lb = rep(-0.1, nL), ub = rep(0.1, nL), maxCycle = 1000)
saveRDS(res.ABC2, "results/res.ABC2.RDS")
res.ABC2 <- readRDS("results/res.ABC2.RDS")
par.ABC2 <- res.ABC2$par
res.list <- obj2(par.ABC2, return_seq = T)
idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) %>% 
    write.csv(sprintf("results/ABC-ELM_%s_dataset.csv", 
                      ds_type), row.names = F)
}

3.5 Differential Evolution (DE)

3.5.1 DE-MK-ELM

# 设置随机数种子,以确保结果的可重复性
set.seed(1)

# 调用差分进化算法函数,进行优化
DEoptim(obj, lower = lower_bound, upper = upper_bound, 
        control = DEoptim.control(itermax = 10, trace = F)) -> res.DE

# 将差分进化算法的结果保存到RDS文件中
saveRDS(res.DE, "results/res.DE.RDS")
# 从RDS文件中读取差分进化算法优化结果
res.DE <- readRDS("results/res.DE.RDS")

# 从结果对象中提取最优参数
par.DE <- res.DE$optim$bestmem

# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.DE)

# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.DE, return_seq = T)

for (ds_type in c("train", "validation", "test")){
  # 处理测试集的结果,添加索引并按索引排序
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) -> res.df.DE
  # 将处理后的测试集结果保存为CSV文件
  res.df.DE %>% write.csv(sprintf("results/DE-MK-ELM_%s_dataset.csv", 
                                   ds_type), row.names = F)
}

3.5.2 DE-ELM

set.seed(1)
DEoptim(obj2, lower = rep(-0.1, nL), upper = rep(0.1, nL), 
        control = DEoptim.control(itermax = 1000, trace = F)) -> res.DE2
saveRDS(res.DE2, "results/res.DE2.RDS")
res.DE2 <- readRDS("results/res.DE2.RDS")
par.DE2 <- res.DE2$optim$bestmem
res.list <- obj2(par.DE2, return_seq = T)
idx_list <- list(test = idx.te, 
                 train = idx.tr, 
                 validation = idx.val)
for (ds_type in c("train", "validation", "test")){
  res.list[[ds_type]] %>% 
    mutate(Index = idx_list[[ds_type]]) %>% 
    arrange(Index) %>% 
    write.csv(sprintf("results/DE-ELM_%s_dataset.csv", 
                      ds_type), row.names = F)
}

4 Algorithm Comparison

method_vec <- c("PSO-MK-ELM", "PSO-ELM", "GA-MK-ELM", "GA-ELM", 
                "SA-MK-ELM", "SA-ELM", 
                "ABC-MK-ELM", "ABC-ELM", "DE-MK-ELM", "DE-ELM")
ds_type <- "test_dataset"
for(i in 1:length(method_vec)){
  read.csv(paste0("results/", method_vec[i], "_", ds_type, ".csv")) %>% 
    mutate(method = method_vec[i]) -> df
  if(i == 1){
    df.all <- df
  } else{
    df.all <- rbind(df.all, df)
  }
}
df.all %>% 
  mutate(residual = real - pred) %>% 
  ggplot() + 
  geom_density(aes(x = residual, color = method)) + 
  facet_wrap(~method) + 
  theme_pander()

df.all %>% 
  mutate(residual = real - pred) %>% 
  ggplot() + 
  geom_boxplot(aes(x = method, y = residual, color = method)) + 
  theme_pander() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

df.all %>% 
  group_by(method) %>% 
  summarise(R2 = cor(real, pred) ** 2 %>% round(6), 
            MSE = mean((real - pred) ** 2) %>% round(6), 
            RMSE = mean((real - pred) ** 2) ** (1/2) %>% round(6), 
            MAE = mean(abs(real-pred)) %>% round(6), 
            MAPE = mean(abs(real-pred)/real) %>% round(6)) -> res.df

# 将汇总的性能指标结果保存为CSV文件
res.df %>% 
  write.csv("results/Model_Performance.csv", row.names = F)


# 使用knitr包的kable函数将结果以表格形式展示
res.df %>% 
  knitr::kable()
method R2 MSE RMSE MAE MAPE
ABC-ELM 0.008436 0.007067 0.084063 0.070171 0.177515
ABC-MK-ELM 0.976624 0.000145 0.012051 0.009435 0.024532
DE-ELM 0.197946 0.013031 0.114152 0.094355 0.234603
DE-MK-ELM 0.976532 0.000145 0.012055 0.009381 0.024371
GA-ELM 0.545719 0.003181 0.056401 0.042779 0.105124
GA-MK-ELM 0.976444 0.000146 0.012076 0.009375 0.024356
PSO-ELM 0.592687 0.003291 0.057369 0.046564 0.118945
PSO-MK-ELM 0.977048 0.000142 0.011916 0.009414 0.024421
SA-ELM 0.361038 0.009269 0.096275 0.077307 0.200009
SA-MK-ELM 0.976243 0.000147 0.012133 0.009481 0.024656
df.all %>% 
  mutate(method = factor(method, levels = unique(df.all$method))) -> df.all

df.all %>% 
  group_by(method) %>% 
  summarise(R2 = cor(pred, real) ** 2) %>% 
  pull(R2) -> R2

df.all %>% 
  group_by(method) %>% 
  summarise(MSE = mean((pred - real) ** 2)) %>% 
  pull(MSE) -> MSE

# 确保r_squared是数值类型
if(is.numeric(R2)) {
  # 创建图表并注释R²值
  df.all %>% 
    mutate(method_label = factor(method, levels = levels(df.all$method), 
                                 labels = sprintf("%s\nR2: %.3f\n MSE: %.3e", 
                                                  levels(df.all$method), R2, MSE))) %>% 
    ggplot() + 
    geom_point(aes(x = real, y = pred), color = "gray") + 
    geom_smooth(aes(x = real, y = pred), method = "lm", formula = "y ~ x") + 
    geom_line(aes(x = real, y = real), color = "black", 
              linetype = "dashed", linewidth = 1) + 
    facet_wrap(~method_label) + 
    theme_pander() + 
    theme(legend.position = "none")
  
  ggsave("plot1.png", width = 8, height = 6, dpi = 300, bg = "white")
} else {
  stop("r_squared is not a numeric value.")
}