This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
#================ Project 1 ==============
# 项目 1:OPC 患者关键性体重减轻 (CWL) 预测优化代码
# 数据集:OPC Critical Weight Loss 数据
# 目标:预测治疗后是否发生 >20% 体重减轻(cwl)
# 安装并加载必要包
library(caret)
## 载入需要的程序包:ggplot2
## 载入需要的程序包:lattice
library(glmnet)
## 载入需要的程序包:Matrix
## Loaded glmnet 4.1-8
library(ranger)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## 载入程序包:'randomForest'
## The following object is masked from 'package:ranger':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## 载入程序包:'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(dplyr)
##
## 载入程序包:'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corrplot)
## corrplot 0.95 loaded
library(ggplot2)
library(tibble)
# SMOTE
if(!requireNamespace("themis", quietly=TRUE)) install.packages("themis")
library(themis)
## 载入需要的程序包:recipes
##
## 载入程序包:'recipes'
## The following object is masked from 'package:Matrix':
##
## update
## The following object is masked from 'package:stats':
##
## step
# 数据导入与标签处理
opc <- read.csv(file.choose(), header=TRUE, check.names=FALSE)
opc$cwl <- factor(opc$cwl, levels=c(0,1), labels=c("no_loss","crit_loss"))
cat("样本数:", nrow(opc), " 特征数:", ncol(opc)-1, "
")
## 样本数: 253 特征数: 1807
# 数据预处理:去除近零方差并标准化
nzv <- nearZeroVar(opc, saveMetrics=TRUE)
opc2 <- opc[, !nzv$nzv]
pp <- preProcess(opc2 %>% select(-Patient_ID, -cwl), method=c("center","scale"))
X <- predict(pp, opc2 %>% select(-Patient_ID, -cwl))
opc_scaled <- cbind(X, Patient_ID=opc2$Patient_ID, cwl=opc2$cwl)
# 特征筛选:单变量Wilcoxon过滤(p<0.05) + 相关性过滤(ρ>0.9)
pv <- sapply(X, function(col) wilcox.test(col[opc_scaled$cwl=="no_loss"], col[opc_scaled$cwl=="crit_loss"])$p.value)
sig <- names(pv)[pv<0.05]
rem <- findCorrelation(cor(X[,sig]), cutoff=0.9)
feats <- sig[-rem]
# 划分训练/测试集(80%/20%,分层抽样)
set.seed(2025)
train_idx <- createDataPartition(opc_scaled$cwl, p=0.8, list=FALSE)
train <- opc_scaled[train_idx, c(feats,'cwl')]
test <- opc_scaled[-train_idx, c(feats,'cwl')]
# 交叉验证设置:重复5折×3次 + SMOTE过采样
ctrl <- trainControl(
method = "repeatedcv", number = 5, repeats = 3,
classProbs = TRUE, summaryFunction = twoClassSummary,
sampling = "smote", savePredictions = TRUE
)
# 模型训练与调优
set.seed(2025)
lasso <- train(cwl~., data=train, method="glmnet", metric="ROC",
trControl=ctrl,
tuneGrid=expand.grid(alpha=1, lambda=10^seq(-3,0,length=20)))
set.seed(2025)
rf <- train(cwl~., data=train, method="rf", metric="ROC",
trControl=ctrl,
tuneGrid=expand.grid(mtry=floor(sqrt(length(feats)))),
ntree=500)
set.seed(2025)
rngr <- train(cwl~., data=train, method="ranger", metric="ROC",
trControl=ctrl,
tuneGrid=expand.grid(mtry=10, splitrule="gini", min.node.size=1),
num.trees=500, importance="impurity")
# 新增模型:k近邻 (kNN)
set.seed(2025)
knn <- train(cwl~., data=train, method="knn", metric="ROC",
trControl=ctrl,
tuneGrid=expand.grid(k=c(3,5,7,9)))
# 性能评估:测试集ROC/AUC比较
roc_l <- roc(test$cwl, predict(lasso, test, type="prob")[,"crit_loss"])
## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases
roc_rf <- roc(test$cwl, predict(rf, test, type="prob")[,"crit_loss"])
## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases
roc_rg <- roc(test$cwl, predict(rngr, test, type="prob")[,"crit_loss"])
## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases
roc_knn<- roc(test$cwl, predict(knn, test, type="prob")[,"crit_loss"])
## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases
plot(roc_l, col="blue", main="测试集ROC曲线比较", legacy.axes=TRUE)
lines(roc_rf, col="red")
lines(roc_rg, col="green")
lines(roc_knn, col="purple")
legend("bottomright", legend=c("LASSO","RF","Ranger","kNN"), col=c("blue","red","green","purple"), lty=1)
cat(sprintf("LASSO Test AUC: %.3f
", auc(roc_l)))
## LASSO Test AUC: 0.610
cat(sprintf("RF Test AUC: %.3f
", auc(roc_rf)))
## RF Test AUC: 0.659
cat(sprintf("Ranger Test AUC: %.3f
", auc(roc_rg)))
## Ranger Test AUC: 0.657
cat(sprintf("kNN Test AUC: %.3f
", auc(roc_knn)))
## kNN Test AUC: 0.566
# 混淆矩阵展示
models <- list(LASSO=lasso, RF=rf, Ranger=rngr)
for(nm in names(models)){
pred <- predict(models[[nm]], test)
cm <- confusionMatrix(pred, test$cwl, positive="crit_loss")
cat(sprintf("\n=== 混淆矩阵: %s ===\n", nm)); print(cm)
}
##
## === 混淆矩阵: LASSO ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction no_loss crit_loss
## no_loss 12 8
## crit_loss 18 12
##
## Accuracy : 0.48
## 95% CI : (0.3366, 0.6258)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.96859
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.07756
##
## Sensitivity : 0.60
## Specificity : 0.40
## Pos Pred Value : 0.40
## Neg Pred Value : 0.60
## Prevalence : 0.40
## Detection Rate : 0.24
## Detection Prevalence : 0.60
## Balanced Accuracy : 0.50
##
## 'Positive' Class : crit_loss
##
##
## === 混淆矩阵: RF ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction no_loss crit_loss
## no_loss 19 8
## crit_loss 11 12
##
## Accuracy : 0.62
## 95% CI : (0.4717, 0.7535)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.4465
##
## Kappa : 0.2276
##
## Mcnemar's Test P-Value : 0.6464
##
## Sensitivity : 0.6000
## Specificity : 0.6333
## Pos Pred Value : 0.5217
## Neg Pred Value : 0.7037
## Prevalence : 0.4000
## Detection Rate : 0.2400
## Detection Prevalence : 0.4600
## Balanced Accuracy : 0.6167
##
## 'Positive' Class : crit_loss
##
##
## === 混淆矩阵: Ranger ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction no_loss crit_loss
## no_loss 20 8
## crit_loss 10 12
##
## Accuracy : 0.64
## 95% CI : (0.4919, 0.7708)
## No Information Rate : 0.6
## P-Value [Acc > NIR] : 0.3356
##
## Kappa : 0.2623
##
## Mcnemar's Test P-Value : 0.8137
##
## Sensitivity : 0.6000
## Specificity : 0.6667
## Pos Pred Value : 0.5455
## Neg Pred Value : 0.7143
## Prevalence : 0.4000
## Detection Rate : 0.2400
## Detection Prevalence : 0.4400
## Balanced Accuracy : 0.6333
##
## 'Positive' Class : crit_loss
##
# 特征重要性可视化:前三模型各取前10,kNN无Importance列则取第二列
get_imp <- function(fit){
imp <- varImp(fit)$importance
df <- rownames_to_column(imp, "Feature")
second <- colnames(df)[2]
df %>% rename(Importance=all_of(second)) %>% arrange(desc(Importance)) %>% head(10)
}
imp_las <- get_imp(lasso)
imp_rf <- get_imp(rf)
imp_rg <- get_imp(rngr)
imp_knn <- get_imp(knn)
ggplot(imp_las, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#B7A99A") + coord_flip() + labs(title="LASSO前10特征", x="特征", y="重要性") + theme_minimal()
ggplot(imp_rf, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#8D918F") + coord_flip() + labs(title="RF前10特征", x="特征", y="重要性") + theme_minimal()
ggplot(imp_rg, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#A3A497") + coord_flip() + labs(title="Ranger前10特征", x="特征", y="重要性") + theme_minimal()
ggplot(imp_knn, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#857E7B") + coord_flip() + labs(title="kNN前10特征", x="特征", y="重要性") + theme_minimal()
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.