BKT_03

library(MASS)

## Warning: package 'MASS' was built under R version 4.4.3

# Preview the first six rows of the Boston data set
head(Boston, n = 6L)

##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

# Fit a linear regression that predicts medv from crim and indus
model <- lm(formula = medv ~ crim + indus, data = Boston)

# Show the coefficient table and fit statistics
summary(object = model)

## 
## Call:
## lm(formula = medv ~ crim + indus, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.011  -4.876  -1.683   3.024  32.491 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.24829    0.67046  43.624  < 2e-16 ***
## crim        -0.24548    0.04434  -5.536 4.99e-08 ***
## indus       -0.52335    0.05559  -9.414  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.83 on 503 degrees of freedom
## Multiple R-squared:  0.278,  Adjusted R-squared:  0.2751 
## F-statistic: 96.83 on 2 and 503 DF,  p-value: < 2.2e-16

# Tải thư viện
library(ggplot2)

# Download the Pima Indians Diabetes data; the file is read without a
# header row, so column names are assigned afterwards.
pima_url <- "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
data <- read.csv(pima_url, header = FALSE)
names(data) <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin",
  "BMI", "DiabetesPedigreeFunction", "Age", "Outcome"
)

# Split the rows at random into a training set (80% of the rows, rounded
# down) and a test set (the remaining rows). The seed is fixed so the same
# partition is drawn on every run.
set.seed(123)
n_rows <- nrow(data)
# seq_len() avoids the 1:0 pitfall of 1:nrow(); floor() makes explicit the
# truncation that sample() would otherwise apply to a fractional size.
index <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
train_data <- data[index, ]
test_data <- data[-index, ]

# Fit a logistic regression of Outcome on all other training columns
model <- glm(formula = Outcome ~ ., family = binomial, data = train_data)

# Predicted probabilities (response scale) for the test rows
pred_prob <- predict(model, newdata = test_data, type = "response")

# Sweep classification thresholds from 0 to 1 in steps of 0.01
thresholds <- seq(0, 1, by = 0.01)

# Compute precision and recall at every threshold in one vectorised pass
# rather than growing result vectors inside a loop.
actual_pos <- test_data$Outcome == 1
actual_neg <- test_data$Outcome == 0

# One row per test observation, one column per threshold: TRUE where the
# predicted probability reaches the threshold (i.e. predicted label 1).
flagged <- outer(pred_prob, thresholds, ">=")

# The truth vectors recycle down each column, so colSums() yields the
# true-positive, false-positive and false-negative counts per threshold.
tp <- colSums(flagged & actual_pos)
fp <- colSums(flagged & actual_neg)
fn <- colSums(!flagged & actual_pos)

# A zero denominator (no predicted positives, or no actual positives)
# maps the metric to 1 instead of producing NaN.
precision <- ifelse(tp + fp == 0, 1, tp / (tp + fp))
recall <- ifelse(tp + fn == 0, 1, tp / (tp + fn))

# Plot the Precision-Recall curve
pr_data <- data.frame(Recall = recall, Precision = precision)

# geom_path() joins the points in row (threshold) order. geom_line() would
# re-sort them by Recall, and because many thresholds share the same Recall
# value the tied points would be connected out of sequence.
ggplot(pr_data, aes(x = Recall, y = Precision)) +
  geom_path(color = "blue") +
  labs(title = "Precision-Recall Curve", x = "Recall", y = "Precision") +
  theme_minimal()

BKT_03

Trần Nguyễn Anh Đức

2025-03-13