Untitled

library(ggplot2)
library(dplyr)

## 
## 载入程序包：'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(caret)

## Warning: 程序包'caret'是用R版本4.4.2 来建造的

## 载入需要的程序包：lattice

library(gridExtra)

## Warning: 程序包'gridExtra'是用R版本4.4.2 来建造的

## 
## 载入程序包：'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(naivebayes)

## Warning: 程序包'naivebayes'是用R版本4.4.2 来建造的

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

# Read the heart-disease table from a CSV file on the author's desktop.
# NOTE(review): this absolute Windows path only resolves on the original
# machine; point it at your own copy of heart.csv before re-running.
heart_data <- read.csv(file = "C:/Users/书生飘啊飘/Desktop/heart.csv")

# Count the NA entries of every column so gaps are visible before modelling.
missing_values <- vapply(
  heart_data,
  function(column) as.numeric(sum(is.na(column))),
  numeric(1)
)
print("缺失值统计：")

## [1] "缺失值统计："

print(missing_values)

##      age      sex       cp   trtbps     chol      fbs  restecg thalachh 
##        0        0        0        0        0        0        0        0 
##     exng  oldpeak      slp      caa    thall   output 
##        0        0        0        0        0        0

# Recode the integer-coded categorical columns (and the response) as factors
# so that they are handled as categorical variables rather than numbers.
heart_data <- mutate(
  heart_data,
  across(
    all_of(c(
      "sex", "cp", "fbs", "restecg", "exng", "slp", "caa", "thall",
      "output"  # target variable
    )),
    as.factor
  )
)
# Inspect the resulting column types.
str(heart_data)

## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 4 3 2 2 1 1 2 2 3 3 ...
##  $ trtbps  : int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 2 1 ...
##  $ restecg : Factor w/ 3 levels "0","1","2": 1 2 1 2 2 2 1 2 2 2 ...
##  $ thalachh: int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exng    : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slp     : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
##  $ caa     : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ thall   : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
##  $ output  : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

# Build one density-scaled histogram with a kernel-density overlay for each
# continuous predictor. The plots are stored by feature name and drawn later.
continuous_features <- c("age", "trtbps", "chol", "thalachh", "oldpeak")
plot_list <- list()
for (feature in continuous_features) {
  # `!!feature` injects the current column name into the mapping immediately.
  # The stored plots are rendered only after the loop has finished, so a
  # lazily looked-up `feature` would make every panel show the last column.
  # `.data[[...]]`, `after_stat()` and `linewidth` replace the deprecated
  # `aes_string()`, `..density..` and `size` (for lines) respectively.
  p <- ggplot(heart_data, aes(x = .data[[!!feature]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30, fill = "skyblue", color = "black", alpha = 0.7
    ) +
    geom_density(color = "red", linewidth = 1) +
    labs(
      title = paste("Distribution of", feature),
      x = feature,
      y = "Density"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  plot_list[[feature]] <- p
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Combine the stored plots into a single figure laid out in two columns.
grid.arrange(grobs = plot_list, ncol = 2)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Draw each continuous predictor's density-scaled histogram with a
# kernel-density overlay, one figure per feature.
continuous_features <- c("age", "trtbps", "chol", "thalachh", "oldpeak")

for (feature in continuous_features) {
  # `.data[[feature]]`, `after_stat()` and `linewidth` replace the deprecated
  # `aes_string()`, `..density..` and `size` (for lines) respectively.
  p <- ggplot(heart_data, aes(x = .data[[feature]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30, fill = "skyblue", color = "black", alpha = 0.7
    ) +
    geom_density(color = "red", linewidth = 1) +
    labs(
      title = paste("Distribution of", feature),
      x = feature,
      y = "Density"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))

  # Auto-printing is off inside a for loop, so print() must be called
  # explicitly. The plot is rendered here, while `feature` is still current.
  print(p)
}

# Keep only the rows whose cholesterol value is at most 500 (rows with a
# missing value would be dropped too), then report how many rows remain.
heart_data <- heart_data[which(heart_data$chol <= 500), ]
cat("清理后数据集行数:", nrow(heart_data), "\n")

## 清理后数据集行数: 302

# Fix the RNG seed, then split the rows 80/20 into training and test sets.
set.seed(777)
train_index <- createDataPartition(
  y = heart_data$output,
  p = 0.8,
  list = FALSE
)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]

# Step 4: train the mixed naive Bayes classifier with Laplace smoothing of 1.
model <- naive_bayes(output ~ ., data = train_data, laplace = 1)

# Show the fitted priors and the per-feature probability tables.
print(model)

## 
## ================================= Naive Bayes ==================================
## 
## Call:
## naive_bayes.formula(formula = output ~ ., data = train_data, 
##     laplace = 1)
## 
## -------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 1
## 
## -------------------------------------------------------------------------------- 
##  
## A priori probabilities: 
## 
##         0         1 
## 0.4567901 0.5432099 
## 
## -------------------------------------------------------------------------------- 
##  
## Tables: 
## 
## -------------------------------------------------------------------------------- 
## :: age (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## age            0         1
##   mean 56.720721 52.363636
##   sd    8.162187  9.876545
## 
## -------------------------------------------------------------------------------- 
## :: sex (Bernoulli) 
## -------------------------------------------------------------------------------- 
##    
## sex         0         1
##   0 0.1681416 0.4402985
##   1 0.8318584 0.5597015
## 
## -------------------------------------------------------------------------------- 
## :: cp (Categorical) 
## -------------------------------------------------------------------------------- 
##    
## cp           0          1
##   0 0.72173913 0.25000000
##   1 0.06956522 0.22794118
##   2 0.14782609 0.41176471
##   3 0.06086957 0.11029412
## 
## -------------------------------------------------------------------------------- 
## :: trtbps (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## trtbps         0         1
##   mean 135.10811 130.79545
##   sd    18.99203  16.31594
## 
## -------------------------------------------------------------------------------- 
## :: chol (Gaussian) 
## -------------------------------------------------------------------------------- 
##       
## chol           0         1
##   mean 246.50450 236.31061
##   sd    47.56352  45.62277
## 
## --------------------------------------------------------------------------------
## 
## # ... and 8 more tables
## 
## --------------------------------------------------------------------------------

# Predict classes for the held-out rows. The response column is removed from
# `newdata` first: passing it along makes predict.naive_bayes() warn that
# newdata holds more features than the model has probability tables for.
predictions <- predict(
  model,
  newdata = test_data[, names(test_data) != "output", drop = FALSE]
)

## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.

# Cross-tabulate predicted against actual classes and print the summary
# statistics. The printed report treats level "0" as the 'Positive' class.
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data$output
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 22  3
##          1  5 29
##                                           
##                Accuracy : 0.8644          
##                  95% CI : (0.7502, 0.9396)
##     No Information Rate : 0.5424          
##     P-Value [Acc > NIR] : 1.458e-07       
##                                           
##                   Kappa : 0.7253          
##                                           
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.8148          
##             Specificity : 0.9062          
##          Pos Pred Value : 0.8800          
##          Neg Pred Value : 0.8529          
##              Prevalence : 0.4576          
##          Detection Rate : 0.3729          
##    Detection Prevalence : 0.4237          
##       Balanced Accuracy : 0.8605          
##                                           
##        'Positive' Class : 0               
##

# Re-split the data 80/20 under a new seed.
set.seed(123)  # fixed seed so the results are reproducible
train_index <- createDataPartition(
  y = heart_data$output,
  p = 0.8,
  list = FALSE
)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]

library(randomForest)

## Warning: 程序包'randomForest'是用R版本4.4.2 来建造的

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## 载入程序包：'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Train the random forest on the training partition only. Fitting on the
# whole of heart_data would let the model see the test rows, so any score
# computed on test_data afterwards would be in-sample and overly optimistic.
rf_model <- randomForest(
  output ~ .,
  data = train_data,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# Check the model type
rf_model$type  # should print "classification"

## [1] "classification"

# Split into training and test sets (80/20) with a fixed seed.
set.seed(123)  # set the random seed
train_index <- createDataPartition(
  y = heart_data$output,
  p = 0.8,
  list = FALSE
)
train_data <- heart_data[train_index, ]
test_data <- heart_data[-train_index, ]

# Make sure the target variable is still a factor in both partitions.
str(train_data[["output"]])  # should show Factor w/ 2 levels

##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

str(test_data[["output"]])   # should show Factor w/ 2 levels

##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

# Adjust the classification threshold: label an observation "1" when the
# forest's predicted probability for class "1" exceeds 0.4.
probs <- predict(rf_model, test_data, type = "prob")
# The probability column is selected by class name rather than by position,
# and the factor is built with the reference levels so the confusion matrix
# still works when every prediction falls into a single class.
adjusted_predictions <- factor(
  ifelse(probs[, "1"] > 0.4, "1", "0"),  # 0.4 is the decision threshold
  levels = levels(test_data$output)
)
confusionMatrix(adjusted_predictions, test_data$output)

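## NOTE(review): the output below was produced with a random forest fit on
## the full heart_data, test rows included, so these perfect scores are
## in-sample results rather than a held-out performance estimate.
## Confusion Matrix and Statistics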
## 
##           Reference
## Prediction  0  1
##          0 27  0
##          1  0 32
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9394, 1)
##     No Information Rate : 0.5424     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.4576     
##          Detection Rate : 0.4576     
##    Detection Prevalence : 0.4576     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
##

# Train the random forest model on the training partition.
rf_model <- randomForest(
  output ~ .,
  data = train_data,
  ntree = 500,
  mtry = 3,
  importance = TRUE
)

# Confirm the model type
rf_model[["type"]]  # should print "classification"

## [1] "classification"

# Predict classes for the held-out rows
predictions <- predict(rf_model, newdata = test_data)

# Evaluate model performance against the true labels
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data$output
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 21  5
##          1  6 27
##                                           
##                Accuracy : 0.8136          
##                  95% CI : (0.6909, 0.9031)
##     No Information Rate : 0.5424          
##     P-Value [Acc > NIR] : 1.224e-05       
##                                           
##                   Kappa : 0.6233          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.7778          
##             Specificity : 0.8438          
##          Pos Pred Value : 0.8077          
##          Neg Pred Value : 0.8182          
##              Prevalence : 0.4576          
##          Detection Rate : 0.3559          
##    Detection Prevalence : 0.4407          
##       Balanced Accuracy : 0.8108          
##                                           
##        'Positive' Class : 0               
##

Untitled

许浩川

2024-12-20