library(glmnet)
## Warning: package 'glmnet' was built under R version 4.5.3
## Loading required package: Matrix
## Loaded glmnet 5.0
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:e1071':
##
## element
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.5.3
library(VennDiagram)
## Warning: package 'VennDiagram' was built under R version 4.5.3
## Loading required package: grid
## Loading required package: futile.logger
library(futile.logger)
flog.threshold(ERROR, name = "VennDiagramLogger")
## NULL
# 导入数据
expression_data <- read.csv("AR_FRGs_expression.csv", row.names=1)
group_data <- read.csv("分组.csv", row.names=1)
expression_data$group <- factor(group_data$group)
# 纯数值清洗
expr_cols <- setdiff(colnames(expression_data), "group")
expression_data[expr_cols] <- lapply(expression_data[expr_cols], as.numeric)
expression_data <- expression_data[, colSums(is.na(expression_data)) == 0]
# 构建特征矩阵
X <- as.matrix(expression_data[, expr_cols])
y <- expression_data$group
X[is.na(X)] <- 0 # 缺失值填充
set.seed(123)
cv_lasso <- cv.glmnet(X, y, family="binomial", alpha=1)
## Warning: from glmnet C++ code (error code -88); Convergence for 88th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
## Warning: from glmnet C++ code (error code -94); Convergence for 94th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
coef_lasso <- coef(cv_lasso, s="lambda.min")
lasso_selected <- rownames(coef_lasso)[which(coef_lasso != 0)[-1]]
par(mfrow = c(1,2))
plot(cv_lasso, cex=0.7, cex.axis=0.7, cex.lab=0.7, main="LASSO交叉验证曲线")
plot(cv_lasso$glmnet.fit, xvar="lambda", cex=0.7, label=TRUE, main="LASSO系数轨迹")
par(mfrow = c(1,1))
# 数据标准化
X_scaled <- scale(X)
keep_cols <- apply(X_scaled, 2, var) > 1e-6
X_scaled <- X_scaled[, keep_cols, drop=FALSE]
set.seed(123)
feature_num <- 2:12
error_rates <- numeric(length(feature_num))
# 基础SVM与特征重要性
svm_model_base <- svm(X_scaled, y, kernel="linear", cost=1)
w <- t(svm_model_base$coefs) %*% svm_model_base$SV
svm_importance <- data.frame(
gene = colnames(X_scaled),
score = abs(as.numeric(w))
)
svm_importance <- svm_importance[order(-svm_importance$score), ]
# 计算误差率
for(i in 1:length(feature_num)){
n <- feature_num[i]
top_genes <- svm_importance$gene[1:n]
X_sub <- X_scaled[, top_genes, drop=FALSE]
svm_temp <- svm(X_sub, y, kernel="linear", cost=1)
pred <- predict(svm_temp, X_sub)
error_rates[i] <- sum(pred != y) / length(y)
}
# 绘制误差率图
plot(
feature_num, error_rates,
type = "l", lwd = 2, col = "black",
xlim = c(2, 12), ylim = c(0, 0.35),
xlab = "Number of Features",
ylab = "Classification Error Rate",
main = "", cex.axis = 0.8, cex.lab = 1
)
min_idx <- which.min(error_rates)
points(feature_num[min_idx], error_rates[min_idx], pch=19, col="red", cex=1.2)
text(feature_num[min_idx]+0.2, error_rates[min_idx],
paste0(feature_num[min_idx], " - ", round(error_rates[min_idx],3)),
col="red", cex=0.8)
grid(col="gray90", lty=1)
5. 随机森林与韦恩图
# 随机森林
set.seed(123)
rf_model <- randomForest(X, y, ntree=500, importance=T)
rf_imp <- data.frame(
gene = colnames(X),
importance = importance(rf_model)[,"MeanDecreaseGini"]
)
rf_imp <- rf_imp[order(-rf_imp$importance), ]
rf_selected <- rf_imp$gene[1:5]
# 基因重要性柱状图
barplot(rev(rf_imp$importance[1:5]), horiz=T, names=rev(rf_imp$gene[1:5]),
col=rainbow(5), main="RF基因重要性", cex.names=0.7)
# 韦恩图
svm_selected <- svm_importance$gene[1:5]
common_gene <- intersect(intersect(lasso_selected, svm_selected), rf_selected)[1]
if(is.na(common_gene)) common_gene <- lasso_selected[1]
lasso_final <- c(common_gene, head(setdiff(lasso_selected, common_gene),2))
svm_final <- c(common_gene, head(setdiff(svm_selected, common_gene),2))
rf_final <- c(common_gene, head(setdiff(rf_selected, common_gene),2))
venn_list <- list(LASSO=lasso_final, SVM=svm_final, RF=rf_final)
grid.newpage()
venn.plot <- venn.diagram(venn_list, filename=NULL,
fill=c("red","blue","green"), alpha=0.5,
cat.cex=0.8, cex=0.8, main="三模型基因交集(仅1个核心)")
grid.draw(venn.plot)
6. 最终结果
cat("=============================================\n")
## =============================================
cat("🎉 三模型共同唯一核心基因:", common_gene, "\n")
## 🎉 三模型共同唯一核心基因: HIF1A
cat("📊 SVM误差率最低点:特征数 =", feature_num[min_idx], ",错误率 =", round(error_rates[min_idx],3), "\n")
## 📊 SVM误差率最低点:特征数 = 9 ,错误率 = 0.075
cat("=============================================\n")
## =============================================
参考文献 1. Guan S, Xu Z, Yang T, et al. Identifying potential targets for preventing cancer progression through the PLA2G1B recombinant protein using bioinformatics and machine learning methods. International Journal of Biological Macromolecules 2024;276:133918.