Project-1.knit

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

#================ Project 1 ==============
# 项目 1：OPC 患者关键性体重减轻 (CWL) 预测优化代码
# 数据集：OPC Critical Weight Loss 数据
# 目标：预测治疗后是否发生 >20% 体重减轻（cwl）

# 安装并加载必要包
library(caret)

## 载入需要的程序包：ggplot2

## 载入需要的程序包：lattice

library(glmnet)

## 载入需要的程序包：Matrix

## Loaded glmnet 4.1-8

library(ranger)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## 载入程序包：'randomForest'

## The following object is masked from 'package:ranger':
## 
##     importance

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 载入程序包：'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(dplyr)

## 
## 载入程序包：'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corrplot)

## corrplot 0.95 loaded

library(ggplot2)
library(tibble)
# SMOTE
if(!requireNamespace("themis", quietly=TRUE)) install.packages("themis")
library(themis)

## 载入需要的程序包：recipes

## 
## 载入程序包：'recipes'

## The following object is masked from 'package:Matrix':
## 
##     update

## The following object is masked from 'package:stats':
## 
##     step

# 数据导入与标签处理
opc <- read.csv(file.choose(), header=TRUE, check.names=FALSE)
opc$cwl <- factor(opc$cwl, levels=c(0,1), labels=c("no_loss","crit_loss"))
cat("样本数：", nrow(opc), "  特征数：", ncol(opc)-1, "
")

## 样本数： 253   特征数： 1807

# 数据预处理：去除近零方差并标准化
nzv <- nearZeroVar(opc, saveMetrics=TRUE)
opc2 <- opc[, !nzv$nzv]
pp <- preProcess(opc2 %>% select(-Patient_ID, -cwl), method=c("center","scale"))
X <- predict(pp, opc2 %>% select(-Patient_ID, -cwl))
opc_scaled <- cbind(X, Patient_ID=opc2$Patient_ID, cwl=opc2$cwl)

# 特征筛选：单变量Wilcoxon过滤(p<0.05) + 相关性过滤(ρ>0.9)
pv <- sapply(X, function(col) wilcox.test(col[opc_scaled$cwl=="no_loss"], col[opc_scaled$cwl=="crit_loss"])$p.value)
sig <- names(pv)[pv<0.05]
rem <- findCorrelation(cor(X[,sig]), cutoff=0.9)
feats <- sig[-rem]

# 划分训练/测试集（80%/20%，分层抽样）
set.seed(2025)
train_idx <- createDataPartition(opc_scaled$cwl, p=0.8, list=FALSE)
train <- opc_scaled[train_idx, c(feats,'cwl')]
test  <- opc_scaled[-train_idx, c(feats,'cwl')]

# 交叉验证设置：重复5折×3次 + SMOTE过采样
ctrl <- trainControl(
  method = "repeatedcv", number = 5, repeats = 3,
  classProbs = TRUE, summaryFunction = twoClassSummary,
  sampling = "smote", savePredictions = TRUE
)

# 模型训练与调优
set.seed(2025)
lasso <- train(cwl~., data=train, method="glmnet", metric="ROC",
               trControl=ctrl,
               tuneGrid=expand.grid(alpha=1, lambda=10^seq(-3,0,length=20)))
set.seed(2025)
rf    <- train(cwl~., data=train, method="rf", metric="ROC",
               trControl=ctrl,
               tuneGrid=expand.grid(mtry=floor(sqrt(length(feats)))),
               ntree=500)
set.seed(2025)
rngr  <- train(cwl~., data=train, method="ranger", metric="ROC",
               trControl=ctrl,
               tuneGrid=expand.grid(mtry=10, splitrule="gini", min.node.size=1),
               num.trees=500, importance="impurity")
# 新增模型：k近邻 (kNN)
set.seed(2025)
knn   <- train(cwl~., data=train, method="knn", metric="ROC",
               trControl=ctrl,
               tuneGrid=expand.grid(k=c(3,5,7,9)))

# 性能评估：测试集ROC/AUC比较
roc_l  <- roc(test$cwl, predict(lasso, test, type="prob")[,"crit_loss"])

## Setting levels: control = no_loss, case = crit_loss

## Setting direction: controls < cases

roc_rf <- roc(test$cwl, predict(rf,    test, type="prob")[,"crit_loss"])

## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases

roc_rg <- roc(test$cwl, predict(rngr,  test, type="prob")[,"crit_loss"])

## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases

roc_knn<- roc(test$cwl, predict(knn,   test, type="prob")[,"crit_loss"])

## Setting levels: control = no_loss, case = crit_loss
## Setting direction: controls < cases

plot(roc_l, col="blue", main="测试集ROC曲线比较", legacy.axes=TRUE)
lines(roc_rf, col="red")
lines(roc_rg, col="green")
lines(roc_knn, col="purple")
legend("bottomright", legend=c("LASSO","RF","Ranger","kNN"), col=c("blue","red","green","purple"), lty=1)

cat(sprintf("LASSO   Test AUC: %.3f
", auc(roc_l)))

## LASSO   Test AUC: 0.610

cat(sprintf("RF      Test AUC: %.3f
", auc(roc_rf)))

## RF      Test AUC: 0.659

cat(sprintf("Ranger  Test AUC: %.3f
", auc(roc_rg)))

## Ranger  Test AUC: 0.657

cat(sprintf("kNN     Test AUC: %.3f
", auc(roc_knn)))

## kNN     Test AUC: 0.566

# 混淆矩阵展示
models <- list(LASSO=lasso, RF=rf, Ranger=rngr)
for(nm in names(models)){
  pred <- predict(models[[nm]], test)
  cm   <- confusionMatrix(pred, test$cwl, positive="crit_loss")
  cat(sprintf("\n=== 混淆矩阵: %s ===\n", nm)); print(cm)
}

## 
## === 混淆矩阵: LASSO ===
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  no_loss crit_loss
##   no_loss        12         8
##   crit_loss      18        12
##                                           
##                Accuracy : 0.48            
##                  95% CI : (0.3366, 0.6258)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.96859         
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.07756         
##                                           
##             Sensitivity : 0.60            
##             Specificity : 0.40            
##          Pos Pred Value : 0.40            
##          Neg Pred Value : 0.60            
##              Prevalence : 0.40            
##          Detection Rate : 0.24            
##    Detection Prevalence : 0.60            
##       Balanced Accuracy : 0.50            
##                                           
##        'Positive' Class : crit_loss       
##                                           
## 
## === 混淆矩阵: RF ===
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  no_loss crit_loss
##   no_loss        19         8
##   crit_loss      11        12
##                                           
##                Accuracy : 0.62            
##                  95% CI : (0.4717, 0.7535)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.4465          
##                                           
##                   Kappa : 0.2276          
##                                           
##  Mcnemar's Test P-Value : 0.6464          
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.6333          
##          Pos Pred Value : 0.5217          
##          Neg Pred Value : 0.7037          
##              Prevalence : 0.4000          
##          Detection Rate : 0.2400          
##    Detection Prevalence : 0.4600          
##       Balanced Accuracy : 0.6167          
##                                           
##        'Positive' Class : crit_loss       
##                                           
## 
## === 混淆矩阵: Ranger ===
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  no_loss crit_loss
##   no_loss        20         8
##   crit_loss      10        12
##                                           
##                Accuracy : 0.64            
##                  95% CI : (0.4919, 0.7708)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.3356          
##                                           
##                   Kappa : 0.2623          
##                                           
##  Mcnemar's Test P-Value : 0.8137          
##                                           
##             Sensitivity : 0.6000          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.5455          
##          Neg Pred Value : 0.7143          
##              Prevalence : 0.4000          
##          Detection Rate : 0.2400          
##    Detection Prevalence : 0.4400          
##       Balanced Accuracy : 0.6333          
##                                           
##        'Positive' Class : crit_loss       
##

# 特征重要性可视化：前三模型各取前10，kNN无Importance列则取第二列
get_imp <- function(fit){
  imp <- varImp(fit)$importance
  df  <- rownames_to_column(imp, "Feature")
  second <- colnames(df)[2]
  df %>% rename(Importance=all_of(second)) %>% arrange(desc(Importance)) %>% head(10)
}
imp_las <- get_imp(lasso)
imp_rf  <- get_imp(rf)
imp_rg  <- get_imp(rngr)
imp_knn <- get_imp(knn)
ggplot(imp_las, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#B7A99A") + coord_flip() + labs(title="LASSO前10特征", x="特征", y="重要性") + theme_minimal()

ggplot(imp_rf,  aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#8D918F") + coord_flip() + labs(title="RF前10特征", x="特征", y="重要性") + theme_minimal()

ggplot(imp_rg,  aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#A3A497") + coord_flip() + labs(title="Ranger前10特征", x="特征", y="重要性") + theme_minimal()

ggplot(imp_knn, aes(x=reorder(Feature,Importance), y=Importance)) + geom_col(fill="#857E7B") + coord_flip() + labs(title="kNN前10特征", x="特征", y="重要性") + theme_minimal()

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.