数据加载与预处理
# Load the saved workspace; the listing below shows it provides the feature
# table `breast_cancer_x` and the label vector `breast_cancer_y`
load("BreastCancer.RData")
objects()
## [1] "breast_cancer_x" "breast_cancer_y"
# Preview the first rows of the features and the first labels
head(breast_cancer_x)
head(breast_cancer_y)
## [1] benign benign benign benign benign malignant
## Levels: benign malignant
# Attach the packages used by the rest of the script
library(dplyr)
library(caret)
library(klaR)
# Combine features and labels into one data frame; the label is appended as
# the last column, `Class` (later code drops it with `-ncol(...)`)
data <- data.frame(breast_cancer_x, Class = breast_cancer_y)
# Inspect the combined data: first rows and per-column summary
head(data)
summary(data)
## Cl.thickness Cell.size Cell.shape Marg.adhesion
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
## Median : 4.000 Median : 1.000 Median : 2.000 Median : 1.000
## Mean : 4.439 Mean : 3.207 Mean : 3.229 Mean : 2.822
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :114 NA's :123 NA's :106 NA's :108
## Epith.c.size Bare.nuclei Bl.cromatin Normal.nucleoli
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000
## Median : 2.000 Median : 1.000 Median : 3.000 Median : 1.000
## Mean : 3.241 Mean : 3.542 Mean : 3.481 Mean : 2.891
## 3rd Qu.: 4.000 3rd Qu.: 7.000 3rd Qu.: 5.000 3rd Qu.: 4.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :110 NA's :110 NA's :109 NA's :113
## Mitoses Class
## Min. : 1.000 benign :458
## 1st Qu.: 1.000 malignant:241
## Median : 1.000
## Mean : 1.557
## 3rd Qu.: 1.000
## Max. :10.000
## NA's :121
# Count missing cells in the whole data frame
# (VIM is attached here; no VIM function is called in the visible code)
library(VIM)
sum(is.na(data))
## [1] 1014
# Keep complete rows only, then drop exact duplicate rows
data <- unique(na.omit(data))
dim(data)
## [1] 112 10
sum(is.na(data))
## [1] 0
# Resampling scheme passed to caret::train(): 5-fold cross-validation
train_control <- trainControl(method = "cv", number = 5)
# Seeded 70% / 30% train/test split on the class label
set.seed(123)
index <- createDataPartition(y = data$Class, p = 0.7, list = FALSE)
traindata <- data[index, ]
testdata <- data[-index, ]
# Standardise the predictors (centre and scale). The transformation is
# estimated on the training rows only and then applied to both sets.
predictor_names <- setdiff(names(data), "Class")
standard <- preProcess(traindata[, predictor_names],
                       method = c("center", "scale"))
traindata_std <- cbind(predict(standard, traindata[, predictor_names]),
                       Class = traindata$Class)
testdata_std <- cbind(predict(standard, testdata[, predictor_names]),
                      Class = testdata$Class)
机器学习模型构建与评估
# Decision tree: fitted through caret (method "rpart") with the
# cross-validation settings in train_control
library(rpart)
rpart_model <- caret::train(
  Class ~ .,
  data = traindata_std,
  method = "rpart",
  trControl = train_control
)
# Tuning parameters selected by caret
rpart_model[["bestTune"]]
# Predict the test set from its predictor columns only (label column dropped)
rpart_test_x <- testdata_std[, -ncol(testdata_std)]
rpart_pred <- predict(rpart_model, newdata = rpart_test_x)
# Confusion matrix and derived metrics against the true test labels
rpart_result <- confusionMatrix(data = rpart_pred,
                                reference = testdata_std$Class)
rpart_result
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 17 3
## malignant 1 12
##
## Accuracy : 0.8788
## 95% CI : (0.718, 0.966)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 4.788e-05
##
## Kappa : 0.7528
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.9444
## Specificity : 0.8000
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.9231
## Prevalence : 0.5455
## Detection Rate : 0.5152
## Detection Prevalence : 0.6061
## Balanced Accuracy : 0.8722
##
## 'Positive' Class : benign
##
# Plot the variable importance of the caret-trained decision tree
plot(varImp(rpart_model))

# Random forest: fitted directly with randomForest(); importance = TRUE keeps
# the variable-importance measures on the fitted object
library(randomForest)
rf_model <- randomForest(
  Class ~ .,
  data = traindata_std,
  importance = TRUE
)
# Predict class labels for the test set from its predictor columns only
rf_test_x <- testdata_std[, -ncol(testdata_std)]
rf_pred <- predict(rf_model, newdata = rf_test_x, type = "class")
# Confusion matrix and derived metrics against the true test labels
rf_result <- confusionMatrix(data = rf_pred, reference = testdata_std$Class)
rf_result
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 16 0
## malignant 2 15
##
## Accuracy : 0.9394
## 95% CI : (0.7977, 0.9926)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 8.125e-07
##
## Kappa : 0.8791
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.8889
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8824
## Prevalence : 0.5455
## Detection Rate : 0.4848
## Detection Prevalence : 0.4848
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : benign
##
# Plot the variable importance of the random forest
varImpPlot(rf_model)

# Naive Bayes: fitted through caret (method "nb") with the cross-validation
# settings in train_control
nb_model <- caret::train(
  Class ~ .,
  data = traindata_std,
  method = "nb",
  trControl = train_control
)
# Predict the test set from its predictor columns only (label column dropped)
nb_test_x <- testdata_std[, -ncol(testdata_std)]
nb_pred <- predict(nb_model, newdata = nb_test_x)
# Confusion matrix and derived metrics against the true test labels
nb_result <- confusionMatrix(data = nb_pred, reference = testdata_std$Class)
nb_result
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 15 0
## malignant 3 15
##
## Accuracy : 0.9091
## 95% CI : (0.7567, 0.9808)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 7.304e-06
##
## Kappa : 0.8197
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.8333
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8333
## Prevalence : 0.5455
## Detection Rate : 0.4545
## Detection Prevalence : 0.4545
## Balanced Accuracy : 0.9167
##
## 'Positive' Class : benign
##
# Support vector machine (SVM) with a radial kernel, fitted with e1071::svm()
library(e1071)
svm_model <- svm(
  Class ~ .,
  data = traindata_std,
  kernel = "radial"
)
# Predict the test set from its predictor columns only (label column dropped)
svm_test_x <- testdata_std[, -ncol(testdata_std)]
svm_pred <- predict(svm_model, newdata = svm_test_x)
# Confusion matrix and derived metrics against the true test labels
svm_result <- confusionMatrix(data = svm_pred, reference = testdata_std$Class)
svm_result
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 16 0
## malignant 2 15
##
## Accuracy : 0.9394
## 95% CI : (0.7977, 0.9926)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 8.125e-07
##
## Kappa : 0.8791
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.8889
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8824
## Prevalence : 0.5455
## Detection Rate : 0.4848
## Detection Prevalence : 0.4848
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : benign
##
# K-nearest neighbours (KNN): scan candidate k values, then evaluate the best
library(class)
# Candidate neighbourhood sizes: the odd values 3, 5, ..., 15
k_values <- seq(3, 15, by = 2)
# Test-set accuracy for each candidate k (preallocated, filled in the loop)
results <- numeric(length(k_values))
# NOTE(review): k is chosen by accuracy on testdata_std, the same rows used
# for the final evaluation below, so the reported KNN metrics are optimistic.
# Consider selecting k on the training data only (e.g. cross-validation).
for (i in seq_along(k_values)) {
  # Reset the RNG so knn()'s random tie-breaking is the same for every k
  set.seed(1234)
  knn_pred <- knn(traindata_std[, -ncol(traindata_std)],
                  testdata_std[, -ncol(testdata_std)],
                  traindata_std$Class, k = k_values[i])
  Table <- table(knn_pred, testdata_std$Class)
  accuracy <- sum(diag(Table)) / sum(Table)
  results[i] <- accuracy
}
# Plot accuracy against k
plot(x = k_values, y = results, type = 'b', col = 'blue', xlab = 'k', ylab = 'accuracy')

# Predict with the best k. which.max() returns a position in `results`, so the
# k value must be looked up in k_values (position p means k = 2 * p + 1; the
# earlier `which.max(results) + 2` was only right for p = 1 and could yield an
# even k that was never evaluated). Ties go to the smallest k.
best_k <- k_values[which.max(results)]
# Same seed as in the scan so this fit reproduces the accuracy plotted above.
# Recorded output that follows came from the old mapping; re-run to refresh it.
set.seed(1234)
knn_pred <- knn(train = traindata_std[, -ncol(traindata_std)],
                test = testdata_std[, -ncol(testdata_std)],
                cl = traindata_std$Class, k = best_k)
# Confusion matrix and derived metrics against the true test labels
knn_result <- confusionMatrix(knn_pred, testdata_std$Class)
knn_result
## Confusion Matrix and Statistics
##
## Reference
## Prediction benign malignant
## benign 17 0
## malignant 1 15
##
## Accuracy : 0.9697
## 95% CI : (0.8424, 0.9992)
## No Information Rate : 0.5455
## P-Value [Acc > NIR] : 5.86e-08
##
## Kappa : 0.9392
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9444
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9375
## Prevalence : 0.5455
## Detection Rate : 0.5152
## Detection Prevalence : 0.5152
## Balanced Accuracy : 0.9722
##
## 'Positive' Class : benign
##
# ROC curves and AUC values for all five models
library(pROC)
# NOTE(review): the predictor given to roc() is each model's hard class label
# recoded to 0/1, not a class probability or score. Each curve therefore has a
# single operating point, and every AUC printed below equals that model's
# recorded balanced accuracy. Use predicted probabilities / decision values
# for a real ROC analysis.
roc_truth <- testdata_std$Class
rpart_roc <- roc(response = roc_truth, predictor = as.numeric(rpart_pred) - 1)
rf_roc <- roc(response = roc_truth, predictor = as.numeric(rf_pred) - 1)
nb_roc <- roc(response = roc_truth, predictor = as.numeric(nb_pred) - 1)
svm_roc <- roc(response = roc_truth, predictor = as.numeric(svm_pred) - 1)
knn_roc <- roc(response = roc_truth, predictor = as.numeric(knn_pred) - 1)
# Print the AUC of each model
cat("Decision Tree AUC:", auc(rpart_roc), "\n")
## Decision Tree AUC: 0.8722222
cat("Random Forest AUC:", auc(rf_roc), "\n")
## Random Forest AUC: 0.9444444
cat("Naive Bayes AUC:", auc(nb_roc), "\n")
## Naive Bayes AUC: 0.9166667
cat("SVM AUC:", auc(svm_roc), "\n")
## SVM AUC: 0.9444444
cat("KNN AUC:", auc(knn_roc), "\n")
## KNN AUC: 0.9722222
# Overlay all ROC curves in one plot; one colour per model, reused in the legend
roc_palette <- c(
  "Decision Tree" = "red",
  "Random Forest" = "blue",
  "Naive Bayes" = "green",
  "SVM" = "purple",
  "KNN" = "orange"
)
plot(rpart_roc, col = roc_palette[["Decision Tree"]],
     main = "ROC Curves Comparison")
lines(rf_roc, col = roc_palette[["Random Forest"]])
lines(nb_roc, col = roc_palette[["Naive Bayes"]])
lines(svm_roc, col = roc_palette[["SVM"]])
lines(knn_roc, col = roc_palette[["KNN"]])
legend("bottomright", legend = names(roc_palette),
       col = unname(roc_palette), lty = 1)
