数据加载与预处理

# Restore the objects saved in BreastCancer.RData into the workspace,
# then list the names that are now defined.
load(file = "BreastCancer.RData")
ls()

## [1] "breast_cancer_x" "breast_cancer_y"

# Preview the first six entries of the feature table and of the label vector.
head(x = breast_cancer_x, n = 6L)

head(x = breast_cancer_y, n = 6L)

## [1] benign    benign    benign    benign    benign    malignant
## Levels: benign malignant

# 加载所需的包
library(dplyr)
library(caret)
library(klaR)

# Combine the features and the labels into one data frame; the labels are
# appended as the last column, named Class.
data <- data.frame(breast_cancer_x, Class = breast_cancer_y)

# Inspect the combined table: first six rows, then per-column summaries.
head(x = data, n = 6L)

summary(object = data)

##   Cl.thickness      Cell.size        Cell.shape     Marg.adhesion   
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000   1st Qu.: 1.000  
##  Median : 4.000   Median : 1.000   Median : 2.000   Median : 1.000  
##  Mean   : 4.439   Mean   : 3.207   Mean   : 3.229   Mean   : 2.822  
##  3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  NA's   :114      NA's   :123      NA's   :106      NA's   :108     
##   Epith.c.size     Bare.nuclei      Bl.cromatin     Normal.nucleoli 
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 1.000  
##  Median : 2.000   Median : 1.000   Median : 3.000   Median : 1.000  
##  Mean   : 3.241   Mean   : 3.542   Mean   : 3.481   Mean   : 2.891  
##  3rd Qu.: 4.000   3rd Qu.: 7.000   3rd Qu.: 5.000   3rd Qu.: 4.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  NA's   :110      NA's   :110      NA's   :109      NA's   :113     
##     Mitoses             Class    
##  Min.   : 1.000   benign   :458  
##  1st Qu.: 1.000   malignant:241  
##  Median : 1.000                  
##  Mean   : 1.557                  
##  3rd Qu.: 1.000                  
##  Max.   :10.000                  
##  NA's   :121

# Count the missing cells across the whole table.
# VIM is attached here, although no VIM function is called in this step.
library(VIM)
data %>% is.na() %>% sum()

## [1] 1014

# Keep only the rows without missing values, then drop exact duplicate rows,
# and report the dimensions of what is left.
data <- unique(na.omit(data))
dim(data)

## [1] 112  10

# Re-count the missing cells in the cleaned table.
data %>% is.na() %>% sum()

## [1] 0

# Resampling settings passed to the caret::train() calls below:
# 5-fold cross-validation.
train_control <- trainControl(method = "cv", number = 5)

# Train/test split with p = 0.7 on Class; the seed fixes the partition.
set.seed(123)
index <- createDataPartition(y = data$Class, p = 0.7, list = FALSE)
traindata <- data[index, ]
testdata <- data[-index, ]

# Centre and scale the feature columns (every column except the last one,
# Class). The transformation is estimated from the training rows only and
# then applied to both subsets; Class is re-attached afterwards.
standard <- preProcess(
  traindata[, seq_len(ncol(traindata) - 1)],
  method = c("center", "scale")
)
traindata_std <- predict(
  standard,
  newdata = traindata[, seq_len(ncol(traindata) - 1)]
)
traindata_std[["Class"]] <- traindata[["Class"]]
testdata_std <- predict(
  standard,
  newdata = testdata[, seq_len(ncol(testdata) - 1)]
)
testdata_std[["Class"]] <- testdata[["Class"]]

机器学习模型构建与评估

# Decision tree: fit an rpart model through caret, using the shared
# cross-validation settings.
library(rpart)
rpart_model <- caret::train(
  Class ~ .,
  data = traindata_std,
  method = "rpart",
  trControl = train_control
)

# Tuning parameter values selected by caret.
rpart_model[["bestTune"]]

# Predict the test rows (last column, Class, removed).
rpart_pred <- predict(
  rpart_model,
  newdata = testdata_std[, -ncol(testdata_std)]
)

# Confusion matrix and performance statistics on the test set.
rpart_result <- confusionMatrix(
  data = rpart_pred,
  reference = testdata_std$Class
)
rpart_result

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        17         3
##   malignant      1        12
##                                         
##                Accuracy : 0.8788        
##                  95% CI : (0.718, 0.966)
##     No Information Rate : 0.5455        
##     P-Value [Acc > NIR] : 4.788e-05     
##                                         
##                   Kappa : 0.7528        
##                                         
##  Mcnemar's Test P-Value : 0.6171        
##                                         
##             Sensitivity : 0.9444        
##             Specificity : 0.8000        
##          Pos Pred Value : 0.8500        
##          Neg Pred Value : 0.9231        
##              Prevalence : 0.5455        
##          Detection Rate : 0.5152        
##    Detection Prevalence : 0.6061        
##       Balanced Accuracy : 0.8722        
##                                         
##        'Positive' Class : benign        
##

# Plot the variable importance of the decision tree model.
plot(x = varImp(object = rpart_model))

# Random forest: fit on the standardized training data and keep the
# importance measures (importance = TRUE) for the importance plot.
library(randomForest)
rf_model <- randomForest(
  Class ~ .,
  data = traindata_std,
  importance = TRUE
)

# Predict class labels for the test rows (last column, Class, removed).
rf_pred <- predict(
  rf_model,
  newdata = testdata_std[, -ncol(testdata_std)],
  type = "class"
)

# Confusion matrix and performance statistics on the test set.
rf_result <- confusionMatrix(data = rf_pred, reference = testdata_std$Class)
rf_result

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        16         0
##   malignant      2        15
##                                           
##                Accuracy : 0.9394          
##                  95% CI : (0.7977, 0.9926)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 8.125e-07       
##                                           
##                   Kappa : 0.8791          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8824          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4848          
##    Detection Prevalence : 0.4848          
##       Balanced Accuracy : 0.9444          
##                                           
##        'Positive' Class : benign          
##

# Plot the variable importance of the random forest model.
varImpPlot(x = rf_model)

# Naive Bayes: fit through caret (method "nb") using the shared
# cross-validation settings.
nb_model <- caret::train(
  Class ~ .,
  data = traindata_std,
  method = "nb",
  trControl = train_control
)

# Predict the test rows (last column, Class, removed).
nb_pred <- predict(nb_model, newdata = testdata_std[, -ncol(testdata_std)])

# Confusion matrix and performance statistics on the test set.
nb_result <- confusionMatrix(data = nb_pred, reference = testdata_std$Class)
nb_result

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        15         0
##   malignant      3        15
##                                           
##                Accuracy : 0.9091          
##                  95% CI : (0.7567, 0.9808)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 7.304e-06       
##                                           
##                   Kappa : 0.8197          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.8333          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4545          
##    Detection Prevalence : 0.4545          
##       Balanced Accuracy : 0.9167          
##                                           
##        'Positive' Class : benign          
##

# Support vector machine (SVM) with a radial kernel, fitted with e1071.
library(e1071)
svm_model <- svm(
  Class ~ .,
  data = traindata_std,
  kernel = "radial"
)

# Predict the test rows (last column, Class, removed).
svm_pred <- predict(svm_model, newdata = testdata_std[, -ncol(testdata_std)])

# Confusion matrix and performance statistics on the test set.
svm_result <- confusionMatrix(data = svm_pred, reference = testdata_std$Class)
svm_result

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        16         0
##   malignant      2        15
##                                           
##                Accuracy : 0.9394          
##                  95% CI : (0.7977, 0.9926)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 8.125e-07       
##                                           
##                   Kappa : 0.8791          
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.8824          
##              Prevalence : 0.5455          
##          Detection Rate : 0.4848          
##    Detection Prevalence : 0.4848          
##       Balanced Accuracy : 0.9444          
##                                           
##        'Positive' Class : benign          
##

# K-nearest neighbours (KNN): scan candidate k values, then refit with the
# best one.
library(class)

# Candidate neighbourhood sizes: the odd values 3, 5, ..., 15.
k_values <- seq(3, 15, by = 2)

# Test-set accuracy for every candidate k. The seed is reset before each fit
# so that knn()'s random tie-breaking is the same for every candidate.
results <- vapply(k_values, function(k) {
  set.seed(1234)
  candidate_pred <- knn(
    train = traindata_std[, -ncol(traindata_std)],
    test = testdata_std[, -ncol(testdata_std)],
    cl = traindata_std$Class,
    k = k
  )
  confusion <- table(candidate_pred, testdata_std$Class)
  sum(diag(confusion)) / sum(confusion)
}, numeric(1))

# Plot accuracy against k.
plot(x = k_values, y = results, type = "b", col = "blue",
     xlab = "k", ylab = "accuracy")

# Refit with the best k. which.max() returns a position in `results`, not a
# k value, so the winning k is looked up in k_values (position 2 is k = 5,
# not 4). The same seed as in the scan is used so that this fit reproduces
# the accuracy that selected it.
best_k <- k_values[which.max(results)]
set.seed(1234)
knn_pred <- knn(train = traindata_std[, -ncol(traindata_std)],
                test = testdata_std[, -ncol(testdata_std)],
                cl = traindata_std$Class, k = best_k)

# Confusion matrix and performance statistics on the test set.
# NOTE(review): k is selected on the same test rows that are evaluated here,
# so the reported accuracy is likely optimistic; consider choosing k by
# cross-validation on the training rows instead.
knn_result <- confusionMatrix(knn_pred, testdata_std$Class)
knn_result

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  benign malignant
##   benign        17         0
##   malignant      1        15
##                                           
##                Accuracy : 0.9697          
##                  95% CI : (0.8424, 0.9992)
##     No Information Rate : 0.5455          
##     P-Value [Acc > NIR] : 5.86e-08        
##                                           
##                   Kappa : 0.9392          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9444          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9375          
##              Prevalence : 0.5455          
##          Detection Rate : 0.5152          
##    Detection Prevalence : 0.5152          
##       Balanced Accuracy : 0.9722          
##                                           
##        'Positive' Class : benign          
##

# ROC curves and AUC values
library(pROC)

# Build a ROC object from predicted class labels. as.numeric() turns the
# predicted factor into its level codes (1/2); subtracting 1 gives 0/1.
# `levels` and `direction` are stated explicitly (first level = control,
# second level = case, controls score lower) so that pROC does not
# auto-detect them per model.
# NOTE(review): the predictor is a hard class label, not a score, so each
# curve has a single operating point and the AUC printed below equals the
# balanced accuracy of the corresponding confusion matrix.
roc_from_labels <- function(pred) {
  roc(
    response = testdata_std$Class,
    predictor = as.numeric(pred) - 1,
    levels = levels(testdata_std$Class),
    direction = "<"
  )
}

# ROC object for each model
rpart_roc <- roc_from_labels(rpart_pred)
rf_roc <- roc_from_labels(rf_pred)
nb_roc <- roc_from_labels(nb_pred)
svm_roc <- roc_from_labels(svm_pred)
knn_roc <- roc_from_labels(knn_pred)

# Print the AUC of each model
cat("Decision Tree AUC:", auc(rpart_roc), "\n")

## Decision Tree AUC: 0.8722222

cat("Random Forest AUC:", auc(rf_roc), "\n")

## Random Forest AUC: 0.9444444

cat("Naive Bayes AUC:", auc(nb_roc), "\n")

## Naive Bayes AUC: 0.9166667

cat("SVM AUC:", auc(svm_roc), "\n")

## SVM AUC: 0.9444444

cat("KNN AUC:", auc(knn_roc), "\n")

## KNN AUC: 0.9722222

# Overlay the ROC curves of all five models on one plot, one colour each.
plot(rpart_roc, main = "ROC Curves Comparison", col = "red")
lines(rf_roc, col = "blue")
lines(nb_roc, col = "green")
lines(svm_roc, col = "purple")
lines(knn_roc, col = "orange")
legend(
  x = "bottomright",
  legend = c("Decision Tree", "Random Forest", "Naive Bayes", "SVM", "KNN"),
  col = c("red", "blue", "green", "purple", "orange"),
  lty = 1
)

基因芯片数据分析 — 机器学习

Wang ZG

2025-11-04

数据加载与预处理

机器学习模型构建与评估