thuc_hanh

# 1. Fit the model
# Multiple linear regression on the iris data: Petal.Length is explained by
# Sepal.Length and Sepal.Width.
model1 <- lm(
  Petal.Length ~ Sepal.Length + Sepal.Width,
  data = iris
)

# Show the detailed results of the fit
summary(model1)

## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Sepal.Width, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.25582 -0.46922 -0.05741  0.45530  1.75599 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.52476    0.56344  -4.481 1.48e-05 ***
## Sepal.Length  1.77559    0.06441  27.569  < 2e-16 ***
## Sepal.Width  -1.33862    0.12236 -10.940  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6465 on 147 degrees of freedom
## Multiple R-squared:  0.8677, Adjusted R-squared:  0.8659 
## F-statistic:   482 on 2 and 147 DF,  p-value: < 2.2e-16

# 2. Check the model assumptions with a residual plot
# The 'Residuals vs Fitted' plot helps check linearity and constant error variance
plot(model1, which = 1)

library(reticulate)

# ==============================
# 1. Load the package and the data
# ==============================
# Install mlbench only when it is missing. requireNamespace() checks
# availability without attaching the package, so the library() call below is
# the single place where it is attached, and that call stops with an error if
# the installation did not succeed.
if (!requireNamespace("mlbench", quietly = TRUE)) {
  install.packages("mlbench")
}
library(mlbench)
data(PimaIndiansDiabetes)

# ==============================
# 2. Fit the logistic model
# ==============================
# diabetes is the variable to predict;
# "." means every remaining column is used as an independent variable.
model_logistic <- glm(diabetes ~ .,
                      data = PimaIndiansDiabetes,
                      family = binomial)

# ==============================
# 3. Predict probabilities
# ==============================
# type = "response" returns probabilities (values between 0 and 1).
# No newdata is supplied, so these are the fitted values for the same rows
# the model was trained on.
prob <- predict(model_logistic, type = "response")

# ==============================
# 4. Turn probabilities into labels
# ==============================
# Probability > 0.5 -> predicted to have the disease ("pos"),
# otherwise -> predicted not to have it ("neg").
# The labels are stored as a factor with both levels fixed so that the
# confusion matrix always has a "neg" and a "pos" row, even when one of the
# classes is never predicted (a plain character vector would drop that row
# and break later lookups such as conf_matrix["pos", "pos"]).
pred_labels <- factor(ifelse(prob > 0.5, "pos", "neg"),
                      levels = c("neg", "pos"))

# ==============================
# 5. Build the confusion matrix
# ==============================
actual_labels <- PimaIndiansDiabetes$diabetes

# Rows are the predicted class, columns are the observed class.
conf_matrix <- table(Predicted = pred_labels,
                     Actual = actual_labels)

print(conf_matrix)

##          Actual
## Predicted neg pos
##       neg 445 112
##       pos  55 156

# ==============================
# 6. Compute the evaluation metrics
# ==============================
# Cells are looked up as conf_matrix[[<predicted>, <actual>]].

# True positives: predicted "pos" and actually "pos"
TP <- conf_matrix[["pos", "pos"]]

# False positives: predicted "pos" but actually "neg"
FP <- conf_matrix[["pos", "neg"]]

# False negatives: predicted "neg" but actually "pos"
FN <- conf_matrix[["neg", "pos"]]

# Precision: share of the "pos" predictions that are correct
precision <- TP / (FP + TP)

# Recall: share of the truly "pos" cases that the model detects
recall <- TP / (FN + TP)

# F1-score: harmonic mean of precision and recall
f1_score <- (2 * precision * recall) / (precision + recall)

# ==============================
# 7. Print the results
# ==============================
cat("\n--- KẾT QUẢ ĐÁNH GIÁ MÔ HÌNH ---\n")

## 
## --- KẾT QUẢ ĐÁNH GIÁ MÔ HÌNH ---

cat("Precision:", round(precision, 4), "\n")

## Precision: 0.7393

cat("Recall:   ", round(recall, 4), "\n")

## Recall:    0.5821

cat("F1-score: ", round(f1_score, 4), "\n")

## F1-score:  0.6514

## F1-score:  0.6514

thuc_hanh_3

Vũ Ngọc Quý

2026-03-05