Data Preprocession
# 读取名为"数据.xlsx"的Excel文件
data <- read_xlsx("数据.xlsx")
# 读取名为"优劣解距离法计算结果.csv"的CSV文件
data.response <- read.csv("优劣解距离法计算结果博弈_ec1de02e-9ba6-4fa8-aece-d52128d9e13a.csv")
# 将"data.response"数据框中的"综合得分指数"列添加到"data"数据框中,并命名为"Y"
data %>% mutate(Y = data.response$综合得分指数) -> data
# 从"data"数据框中移除"编码"和"ID"列
data %>% select(-`编码`, -ID) -> data
# 从"data"数据框中移除"Y"列,并将结果转换为矩阵
data %>% select(-Y) %>% as.matrix() -> X
# 对矩阵X进行标准化处理
X <- scale(X)
# 从"data"数据框中选择"Y"列,并将结果转换为矩阵
data %>% select(Y) %>% as.matrix() -> Y
# 设置随机数种子,以确保结果的可重复性
set.seed(1)
# 获取矩阵X的行数,即样本数量
ns <- nrow(X)
# 生成一个随机的索引,用于随机抽样数据
idx <- sample(1:ns, ns, replace = F)
# 计算训练集的样本数量,占总样本的70%
ntr <- floor(ns * 0.7)
# 计算验证集的样本数量,占总样本的10%
nval <- floor(ns * 0.1)
# 计算测试集的样本数量,占总样本的剩余部分
nte <- ns - ntr - nval
# 根据计算出的样本数量,从随机索引中分割出训练集的索引
idx.tr <- idx[1:ntr]
# 从随机索引中分割出验证集的索引
idx.val <- idx[(ntr+1):(ntr+nval)]
# 从随机索引中分割出测试集的索引
idx.te <- idx[(ntr+nval+1):(ntr+nval+nte)]
# 根据训练集的索引,从X矩阵中选取对应的特征数据
X.tr <- X[idx.tr,]
# 根据验证集的索引,从X矩阵中选取对应的特征数据
X.val <- X[idx.val,]
# 根据测试集的索引,从X矩阵中选取对应的特征数据
X.te <- X[idx.te,]
# 根据训练集的索引,从Y矩阵中选取对应的标签数据
Y.tr <- Y[idx.tr,]
# 根据验证集的索引,从Y矩阵中选取对应的标签数据
Y.val <- Y[idx.val,]
# 根据测试集的索引,从Y矩阵中选取对应的标签数据
Y.te <- Y[idx.te,]
Define objective
function to minimize
utility kernel
function
# 定义一个径向基函数(RBF)核
rbf_kernel <- function(x1, x2, sigma){
# 计算x1和x2之间的欧几里得距离的平方,然后除以2倍的sigma的平方
# 最后计算指数函数,得到RBF核的值
exp(-sum((x1 - x2) ** 2) / (2 * sigma ** 2))
}
# 定义一个多项式核
poly_kernel <- function(x1, x2, p){
# 将x1和x2转换为列向量,计算它们的点积,然后加上1
# 最后将结果的p次幂,得到多项式核的值
(matrix(x1, nrow = 1) %*% matrix(x2, ncol = 1) + 1) ** p
}
# 定义一个加权核,它是RBF核和多项式核的加权组合
weighted_kernel <- function(x1, x2, mu, sigma, p){
# 计算多项式核的值
poly_value <- poly_kernel(x1, x2, p)
# 计算RBF核的值
rbf_value <- rbf_kernel(x1, x2, sigma)
# 将多项式核的值乘以权重mu,RBF核的值乘以(1-mu)
# 然后将两者相加,得到加权核的值
mu * poly_value + (1-mu) * rbf_value
}
objective
function
obj <- function(x, return_seq = F){
# 从参数x中提取mu, sigma和p
mu <- x[1]
sigma <- x[2]
# p取整,因为多项式度数通常为整数
p <- round(x[3])
# 获取训练集的样本数量
ns <- nrow(X.tr)
# 获取训练集的特征数量
nf <- ncol(X.tr)
set.seed(1)
W1 <- rnorm(nL * nf) %>% matrix(nrow = nf, ncol = nL)
g <- function(X, W1){
H <- matrix(NA, nrow = dim(X)[1], ncol = dim(W1)[2])
for (i in 1:dim(X)[1]){
for (j in 1:dim(W1)[2]){
x1 <- X[i,]
x2 <- W1[,j]
H[i,j] <- weighted_kernel(x1, x2, mu, sigma, p)
}
}
H
}
H.tr <- g(X.tr, W1)
H.tr.pinv <- pinv(H.tr)
W2 <- H.tr.pinv %*% Y.tr
# Prediction
H.val <- g(X.val, W1)
H.te <- g(X.te, W1)
pred.tr <- H.tr %*% W2
pred.val <- H.val %*% W2
pred.te <- H.te %*% W2
# Evaluation
RMSE.tr <- mean((Y.tr - pred.tr) ** 2) ** (1/2)
RMSE.val <- mean((Y.val - pred.val) ** 2) ** (1/2)
RMSE.te <- mean((Y.te - pred.te) ** 2) ** (1/2)
if (return_seq){
res.list <- list()
data.frame(real = as.numeric(Y.tr),
pred = as.numeric(pred.tr)) -> res.list[["train"]]
data.frame(real = as.numeric(Y.val),
pred = as.numeric(pred.val)) -> res.list[["validation"]]
data.frame(real = as.numeric(Y.te),
pred = as.numeric(pred.te)) -> res.list[["test"]]
return(res.list)
} else{
return(RMSE.val)
}
}
Optimization
lower_bound <- c(0.01, 1e-1, 1)
upper_bound <- c(0.99, 1e+2, 5)
initial_sol <- c(0.5, 1, 2)
nL <- 60
Particle Swarm
Optimization (PSO)
PSO-MK-ELM
# 设置随机数种子,以确保结果+的可重复性
set.seed(1)
# 初始化粒子的位置,这里使用一个包含3个NA值的向量
# 这表示粒子的初始位置是未知的,需要优化算法来确定
psoptim(rep(NA, 3), obj,
# 设置粒子位置的下界和上界
lower = lower_bound, upper = upper_bound,
# 设置优化算法的控制参数
control = list(maxf = 100)) -> res.PSO
# 将优化结果保存到RDS文件中
saveRDS(res.PSO, "results/res.PSO.RDS")
# 从RDS文件中读取PSO优化结果
res.PSO <- readRDS("results/res.PSO.RDS")
# 从结果对象中提取最优参数
par.PSO <- res.PSO$par
# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.PSO)
# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.PSO, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
# 处理测试集的结果,添加索引并按索引排序
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.PSO
# 将处理后的测试集结果保存为CSV文件
res.df.PSO %>% write.csv(sprintf("results/PSO-MK-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
PSO-ELM
obj2 <- function(x, return_seq=F){
W2 <- x %>% matrix(ncol = 1)
# 获取训练集的样本数量
ns <- nrow(X.tr)
# 获取训练集的特征数量
nf <- ncol(X.tr)
set.seed(1)
W1 <- rnorm(nL * nf) %>% matrix(nrow = nf, ncol = nL)
# H.tr <- g(X.tr, W1)
H.tr <- sigmoid(X.tr %*% W1)
H.tr.pinv <- pinv(H.tr)
# W2 <- H.tr.pinv %*% Y.tr
# Prediction
H.val <- sigmoid(X.val %*% W1)
H.te <- sigmoid(X.te %*% W1)
pred.tr <- H.tr %*% W2
pred.val <- H.val %*% W2
pred.te <- H.te %*% W2
# Evaluation
RMSE.tr <- mean((Y.tr - pred.tr) ** 2) ** (1/2)
RMSE.val <- mean((Y.val - pred.val) ** 2) ** (1/2)
RMSE.te <- mean((Y.te - pred.te) ** 2) ** (1/2)
if (return_seq){
res.list <- list()
data.frame(real = as.numeric(Y.tr),
pred = as.numeric(pred.tr)) -> res.list[["train"]]
data.frame(real = as.numeric(Y.val),
pred = as.numeric(pred.val)) -> res.list[["validation"]]
data.frame(real = as.numeric(Y.te),
pred = as.numeric(pred.te)) -> res.list[["test"]]
return(res.list)
} else{
return(RMSE.tr)
}
}
set.seed(1)
psoptim(rep(NA, nL), obj2,
# 设置粒子位置的下界和上界
lower = rep(-0.1, nL), upper = rep(0.1, nL),
# 设置优化算法的控制参数
control = list(maxf = 10000)) -> res.PSO2
saveRDS(res.PSO2, "results/res.PSO2.RDS")
res.PSO2 <- readRDS("results/res.PSO2.RDS")
par.PSO2 <- res.PSO2$par
res.list <- obj2(par.PSO2, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.PSO2
res.df.PSO2 %>% write.csv(sprintf("results/PSO-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
Genetic Algorithms
(GA)
GA-MK-ELM
# 设置随机数种子,以确保结果的可重复性
set.seed(1)
# 调用遗传算法函数,进行优化
ga(type = "real-valued",
fitness = function(x){-obj(x)},
lower = lower_bound, upper = upper_bound,
maxiter = 10, monitor = F) -> res.GA
# 将遗传算法的结果保存到RDS文件中
saveRDS(res.GA, "results/res.GA.RDS")
# 从RDS文件中读取遗传算法优化结果
res.GA <- readRDS("results/res.GA.RDS")
# 从结果对象中提取最优参数
par.GA <- res.GA@solution
# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.GA)
# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.GA, return_seq = T)
for (ds_type in c("train", "validation", "test")){
# 处理测试集的结果,添加索引并按索引排序
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.GA
# 将处理后的测试集结果保存为CSV文件
res.df.GA %>% write.csv(sprintf("results/GA-MK-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
GA-ELM
set.seed(1)
ga(type = "real-valued",
fitness = function(x){-obj2(x)},
lower = rep(-0.1, nL), upper = rep(0.1, nL),
maxiter = 1000, monitor = F) -> res.GA2
saveRDS(res.GA2, "results/res.GA2.RDS")
res.GA2 <- readRDS("results/res.GA2.RDS")
par.GA2 <- res.GA2@solution
res.list <- obj2(par.GA2, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) %>%
write.csv(sprintf("results/GA-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
Simulated Annealing
(SA)
SA-MK-ELM
# 设置随机数种子,以确保结果的可重复性
set.seed(1)
# 调用模拟退火算法函数,进行优化
res.SA <- optim_sa(obj, start = initial_sol,
lower = lower_bound,
upper = upper_bound, control = list(nlimit = 10))
# 将模拟退火算法的结果保存到RDS文件中
saveRDS(res.SA, "results/res.SA.RDS")
# 从RDS文件中读取模拟退火算法优化结果
res.SA <- readRDS("results/res.SA.RDS")
# 从结果对象中提取最优参数
par.SA <- res.SA$par
# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.SA)
# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.SA, return_seq = T)
for (ds_type in c("train", "validation", "test")){
# 处理测试集的结果,添加索引并按索引排序
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.SA
# 将处理后的测试集结果保存为CSV文件
res.df.SA %>% write.csv(sprintf("results/SA-MK-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
SA-ELM
set.seed(1)
res.SA2 <- optim_sa(obj2, start = rep(0, nL),
lower = rep(-0.1, nL), upper = rep(0.1, nL),
control = list(nlimit = 1000))
saveRDS(res.SA2, "results/res.SA2.RDS")
res.SA2 <- readRDS("results/res.SA2.RDS")
par.SA2 <- res.SA2$par
res.list <- obj2(par.SA2, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) %>%
write.csv(sprintf("results/SA-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
Artificial Bee Colony
(ABC)
ABC-MK-ELM
# 设置随机数种子,以确保结果的可重复性
set.seed(1)
# 调用人工蜂群算法函数,进行优化
res.ABC <- abc_optim(par = initial_sol, fn = obj,
lb = lower_bound, ub = upper_bound, maxCycle = 10)
# 将人工蜂群算法的结果保存到RDS文件中
saveRDS(res.ABC, "results/res.ABC.RDS")
# 从RDS文件中读取人工蜂群算法优化结果
res.ABC <- readRDS("results/res.ABC.RDS")
# 从结果对象中提取最优参数
par.ABC <- res.ABC$par
# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.ABC)
# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.ABC, return_seq = T)
for (ds_type in c("train", "validation", "test")){
# 处理测试集的结果,添加索引并按索引排序
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.ABC
# 将处理后的测试集结果保存为CSV文件
res.df.ABC %>% write.csv(sprintf("results/ABC-MK-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
ABC-ELM
set.seed(1)
res.ABC2 <- abc_optim(par = rep(0, nL), fn = obj2,
lb = rep(-0.1, nL), ub = rep(0.1, nL), maxCycle = 1000)
saveRDS(res.ABC2, "results/res.ABC2.RDS")
res.ABC2 <- readRDS("results/res.ABC2.RDS")
par.ABC2 <- res.ABC2$par
res.list <- obj2(par.ABC2, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) %>%
write.csv(sprintf("results/ABC-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
Differential
Evolution (DE)
DE-MK-ELM
# 设置随机数种子,以确保结果的可重复性
set.seed(1)
# 调用差分进化算法函数,进行优化
DEoptim(obj, lower = lower_bound, upper = upper_bound,
control = DEoptim.control(itermax = 10, trace = F)) -> res.DE
# 将差分进化算法的结果保存到RDS文件中
saveRDS(res.DE, "results/res.DE.RDS")
# 从RDS文件中读取差分进化算法优化结果
res.DE <- readRDS("results/res.DE.RDS")
# 从结果对象中提取最优参数
par.DE <- res.DE$optim$bestmem
# 使用最优参数评估验证集的RMSE
RMSE.val <- obj(par.DE)
# 使用最优参数并请求返回序列,以获取训练集、验证集和测试集的详细结果
res.list <- obj(par.DE, return_seq = T)
for (ds_type in c("train", "validation", "test")){
# 处理测试集的结果,添加索引并按索引排序
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) -> res.df.DE
# 将处理后的测试集结果保存为CSV文件
res.df.DE %>% write.csv(sprintf("results/DE-MK-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
DE-ELM
set.seed(1)
DEoptim(obj2, lower = rep(-0.1, nL), upper = rep(0.1, nL),
control = DEoptim.control(itermax = 1000, trace = F)) -> res.DE2
saveRDS(res.DE2, "results/res.DE2.RDS")
res.DE2 <- readRDS("results/res.DE2.RDS")
par.DE2 <- res.DE2$optim$bestmem
res.list <- obj2(par.DE2, return_seq = T)
idx_list <- list(test = idx.te,
train = idx.tr,
validation = idx.val)
for (ds_type in c("train", "validation", "test")){
res.list[[ds_type]] %>%
mutate(Index = idx_list[[ds_type]]) %>%
arrange(Index) %>%
write.csv(sprintf("results/DE-ELM_%s_dataset.csv",
ds_type), row.names = F)
}
Algorithm
Comparison
method_vec <- c("PSO-MK-ELM", "PSO-ELM", "GA-MK-ELM", "GA-ELM",
"SA-MK-ELM", "SA-ELM",
"ABC-MK-ELM", "ABC-ELM", "DE-MK-ELM", "DE-ELM")
ds_type <- "test_dataset"
for(i in 1:length(method_vec)){
read.csv(paste0("results/", method_vec[i], "_", ds_type, ".csv")) %>%
mutate(method = method_vec[i]) -> df
if(i == 1){
df.all <- df
} else{
df.all <- rbind(df.all, df)
}
}
df.all %>%
mutate(residual = real - pred) %>%
ggplot() +
geom_density(aes(x = residual, color = method)) +
facet_wrap(~method) +
theme_pander()

df.all %>%
mutate(residual = real - pred) %>%
ggplot() +
geom_boxplot(aes(x = method, y = residual, color = method)) +
theme_pander() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

df.all %>%
group_by(method) %>%
summarise(R2 = cor(real, pred) ** 2 %>% round(6),
MSE = mean((real - pred) ** 2) %>% round(6),
RMSE = mean((real - pred) ** 2) ** (1/2) %>% round(6),
MAE = mean(abs(real-pred)) %>% round(6),
MAPE = mean(abs(real-pred)/real) %>% round(6)) -> res.df
# 将汇总的性能指标结果保存为CSV文件
res.df %>%
write.csv("results/Model_Performance.csv", row.names = F)
# 使用knitr包的kable函数将结果以表格形式展示
res.df %>%
knitr::kable()
| ABC-ELM |
0.008436 |
0.007067 |
0.084063 |
0.070171 |
0.177515 |
| ABC-MK-ELM |
0.976624 |
0.000145 |
0.012051 |
0.009435 |
0.024532 |
| DE-ELM |
0.197946 |
0.013031 |
0.114152 |
0.094355 |
0.234603 |
| DE-MK-ELM |
0.976532 |
0.000145 |
0.012055 |
0.009381 |
0.024371 |
| GA-ELM |
0.545719 |
0.003181 |
0.056401 |
0.042779 |
0.105124 |
| GA-MK-ELM |
0.976444 |
0.000146 |
0.012076 |
0.009375 |
0.024356 |
| PSO-ELM |
0.592687 |
0.003291 |
0.057369 |
0.046564 |
0.118945 |
| PSO-MK-ELM |
0.977048 |
0.000142 |
0.011916 |
0.009414 |
0.024421 |
| SA-ELM |
0.361038 |
0.009269 |
0.096275 |
0.077307 |
0.200009 |
| SA-MK-ELM |
0.976243 |
0.000147 |
0.012133 |
0.009481 |
0.024656 |
df.all %>%
mutate(method = factor(method, levels = unique(df.all$method))) -> df.all
df.all %>%
group_by(method) %>%
summarise(R2 = cor(pred, real) ** 2) %>%
pull(R2) -> R2
df.all %>%
group_by(method) %>%
summarise(MSE = mean((pred - real) ** 2)) %>%
pull(MSE) -> MSE
# 确保r_squared是数值类型
if(is.numeric(R2)) {
# 创建图表并注释R²值
df.all %>%
mutate(method_label = factor(method, levels = levels(df.all$method),
labels = sprintf("%s\nR2: %.3f\n MSE: %.3e",
levels(df.all$method), R2, MSE))) %>%
ggplot() +
geom_point(aes(x = real, y = pred), color = "gray") +
geom_smooth(aes(x = real, y = pred), method = "lm", formula = "y ~ x") +
geom_line(aes(x = real, y = real), color = "black",
linetype = "dashed", linewidth = 1) +
facet_wrap(~method_label) +
theme_pander() +
theme(legend.position = "none")
ggsave("plot1.png", width = 8, height = 6, dpi = 300, bg = "white")
} else {
stop("r_squared is not a numeric value.")
}