library(ggplot2)
library(dplyr)
##
## 载入程序包:'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: 程序包'caret'是用R版本4.4.2 来建造的
## 载入需要的程序包:lattice
library(gridExtra)
## Warning: 程序包'gridExtra'是用R版本4.4.2 来建造的
##
## 载入程序包:'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(naivebayes)
## Warning: 程序包'naivebayes'是用R版本4.4.2 来建造的
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
heart_data <- read.csv("C:/Users/书生飘啊飘/Desktop/heart.csv")
missing_values <- colSums(is.na(heart_data))
print("缺失值统计:")
## [1] "缺失值统计:"
print(missing_values)
## age sex cp trtbps chol fbs restecg thalachh
## 0 0 0 0 0 0 0 0
## exng oldpeak slp caa thall output
## 0 0 0 0 0 0
heart_data$sex <- as.factor(heart_data$sex)
heart_data$cp <- as.factor(heart_data$cp)
heart_data$fbs <- as.factor(heart_data$fbs)
heart_data$restecg <- as.factor(heart_data$restecg)
heart_data$exng <- as.factor(heart_data$exng)
heart_data$slp <- as.factor(heart_data$slp)
heart_data$caa <- as.factor(heart_data$caa)
heart_data$thall <- as.factor(heart_data$thall)
heart_data$output <- as.factor(heart_data$output) # 目标变量
str(heart_data)
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 2 2 2 ...
## $ cp : Factor w/ 4 levels "0","1","2","3": 4 3 2 2 1 1 2 2 3 3 ...
## $ trtbps : int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 1 2 1 2 2 2 1 2 2 2 ...
## $ thalachh: int 150 187 172 178 163 148 153 173 162 174 ...
## $ exng : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slp : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
## $ caa : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ thall : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
## $ output : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
continuous_features <- c("age", "trtbps", "chol", "thalachh", "oldpeak")
plot_list <- list()
for (feature in continuous_features) {
p <- ggplot(heart_data, aes_string(x = feature)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
geom_density(color = "red", size = 1) +
labs(title = paste("Distribution of", feature),
x = feature,
y = "Density") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
plot_list[[feature]] <- p
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 拼接图
grid.arrange(
grobs = plot_list,
ncol = 2
)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

continuous_features <- c("age", "trtbps", "chol", "thalachh", "oldpeak")
for (feature in continuous_features) {
p <- ggplot(heart_data, aes_string(x = feature)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
geom_density(color = "red", size = 1) +
labs(title = paste("Distribution of", feature),
x = feature,
y = "Density") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
# 在 for 循环中,必须显式调用 print()
print(p)
}





heart_data <- subset(heart_data, chol <= 500)
cat("清理后数据集行数:", nrow(heart_data), "\n")
## 清理后数据集行数: 302
set.seed(777)
train_index <- createDataPartition(heart_data$output, p = 0.8, list = FALSE)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]
# Step 4: 训练混合贝叶斯分类器
model <- naive_bayes(output ~ ., data = train_data, laplace = 1)
print(model)
##
## ================================= Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = output ~ ., data = train_data,
## laplace = 1)
##
## --------------------------------------------------------------------------------
##
## Laplace smoothing: 1
##
## --------------------------------------------------------------------------------
##
## A priori probabilities:
##
## 0 1
## 0.4567901 0.5432099
##
## --------------------------------------------------------------------------------
##
## Tables:
##
## --------------------------------------------------------------------------------
## :: age (Gaussian)
## --------------------------------------------------------------------------------
##
## age 0 1
## mean 56.720721 52.363636
## sd 8.162187 9.876545
##
## --------------------------------------------------------------------------------
## :: sex (Bernoulli)
## --------------------------------------------------------------------------------
##
## sex 0 1
## 0 0.1681416 0.4402985
## 1 0.8318584 0.5597015
##
## --------------------------------------------------------------------------------
## :: cp (Categorical)
## --------------------------------------------------------------------------------
##
## cp 0 1
## 0 0.72173913 0.25000000
## 1 0.06956522 0.22794118
## 2 0.14782609 0.41176471
## 3 0.06086957 0.11029412
##
## --------------------------------------------------------------------------------
## :: trtbps (Gaussian)
## --------------------------------------------------------------------------------
##
## trtbps 0 1
## mean 135.10811 130.79545
## sd 18.99203 16.31594
##
## --------------------------------------------------------------------------------
## :: chol (Gaussian)
## --------------------------------------------------------------------------------
##
## chol 0 1
## mean 246.50450 236.31061
## sd 47.56352 45.62277
##
## --------------------------------------------------------------------------------
##
## # ... and 8 more tables
##
## --------------------------------------------------------------------------------
predictions <- predict(model, test_data)
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
conf_matrix <- confusionMatrix(predictions, test_data$output)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 22 3
## 1 5 29
##
## Accuracy : 0.8644
## 95% CI : (0.7502, 0.9396)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : 1.458e-07
##
## Kappa : 0.7253
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.8148
## Specificity : 0.9062
## Pos Pred Value : 0.8800
## Neg Pred Value : 0.8529
## Prevalence : 0.4576
## Detection Rate : 0.3729
## Detection Prevalence : 0.4237
## Balanced Accuracy : 0.8605
##
## 'Positive' Class : 0
##
set.seed(123) # 设置随机种子以保证结果可复现
train_index <- createDataPartition(heart_data$output, p = 0.8, list = FALSE)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]
library(randomForest)
## Warning: 程序包'randomForest'是用R版本4.4.2 来建造的
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## 载入程序包:'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# 训练随机森林模型
rf_model <- randomForest(output ~ ., data = heart_data, ntree = 500, mtry = 3, importance = TRUE)
# 查看模型类型
rf_model$type # 应输出 "classification"
## [1] "classification"
# 划分训练集和测试集
set.seed(123) # 设置随机种子
train_index <- createDataPartition(heart_data$output, p = 0.8, list = FALSE)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]
# 确保训练集和测试集目标变量仍为因子类型
str(train_data$output) # 应显示 Factor w/ 2 levels
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
str(test_data$output) # 应显示 Factor w/ 2 levels
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# 调整分类阈值
probs <- predict(rf_model, test_data, type = "prob")
adjusted_predictions <- ifelse(probs[, 2] > 0.4, 1, 0) # 使用 0.4 作为阈值
adjusted_predictions <- as.factor(adjusted_predictions)
confusionMatrix(adjusted_predictions, test_data$output)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27 0
## 1 0 32
##
## Accuracy : 1
## 95% CI : (0.9394, 1)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.4576
## Detection Rate : 0.4576
## Detection Prevalence : 0.4576
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
# 训练随机森林模型
rf_model <- randomForest(output ~ ., data = train_data, ntree = 500, mtry = 3, importance = TRUE)
# 确认模型类型
rf_model$type # 应输出 "classification"
## [1] "classification"
# 模型预测
predictions <- predict(rf_model, test_data)
# 评估模型性能
conf_matrix <- confusionMatrix(predictions, test_data$output)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21 5
## 1 6 27
##
## Accuracy : 0.8136
## 95% CI : (0.6909, 0.9031)
## No Information Rate : 0.5424
## P-Value [Acc > NIR] : 1.224e-05
##
## Kappa : 0.6233
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7778
## Specificity : 0.8438
## Pos Pred Value : 0.8077
## Neg Pred Value : 0.8182
## Prevalence : 0.4576
## Detection Rate : 0.3559
## Detection Prevalence : 0.4407
## Balanced Accuracy : 0.8108
##
## 'Positive' Class : 0
##