Câu 1

Load thư viện và dữ liệu

# MASS chứa dataset Boston
library(MASS)

# caret hỗ trợ chia train/test
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Metrics để tính MAE, MSE
library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

# Load the Boston data set (provided by the MASS package) into the workspace
data(Boston)

Kiểm tra ngoại lai (Outliers)

# Draw one boxplot per variable of interest so that outliers can be
# spotted visually; each plot is titled "Outliers in <variable>"
invisible(lapply(
  c("lstat", "rm", "medv"),
  function(var_name) {
    boxplot(Boston[[var_name]], main = paste("Outliers in", var_name))
  }
))

Phát hiện ngoại lai bằng phương pháp IQR

# First and third quartiles of lstat
Q1 <- quantile(Boston$lstat, probs = 0.25)
Q3 <- quantile(Boston$lstat, probs = 0.75)

# Interquartile range (Q3 - Q1)
IQR_val <- IQR(Boston$lstat)

# Flag every observation that is NOT inside the fences
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
outliers <- with(
  Boston,
  !(lstat >= Q1 - 1.5 * IQR_val & lstat <= Q3 + 1.5 * IQR_val)
)

# Show the rows flagged as outliers
Boston[which(outliers), ]

##         crim zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 142  1.62864  0 21.89    0 0.624 5.019 100.0 1.4394   4 437    21.2 396.90
## 374 11.10810  0 18.10    0 0.668 4.906 100.0 1.1742  24 666    20.2 396.90
## 375 18.49820  0 18.10    0 0.668 4.138 100.0 1.1370  24 666    20.2 396.90
## 388 22.59710  0 18.10    0 0.700 5.000  89.5 1.5184  24 666    20.2 396.90
## 413 18.81100  0 18.10    0 0.597 4.628 100.0 1.5539  24 666    20.2  28.79
## 415 45.74610  0 18.10    0 0.693 4.519 100.0 1.6582  24 666    20.2  88.27
## 439 13.67810  0 18.10    0 0.740 5.935  87.9 1.8206  24 666    20.2  68.95
##     lstat medv
## 142 34.41 14.4
## 374 34.77 13.8
## 375 37.97 13.8
## 388 31.99  7.4
## 413 34.37 17.9
## 415 36.98  7.0
## 439 34.02  8.4

Xử lý ngoại lai

# Drop the rows flagged as outliers.
# `outliers` was computed against the data frame as it stood before this
# step, so first make sure the mask still lines up with the rows: if this
# statement is re-run after Boston has already been filtered, the stale mask
# would otherwise be applied to shifted rows and silently remove the wrong
# observations.
stopifnot(is.logical(outliers), length(outliers) == nrow(Boston))
Boston <- Boston[!outliers, ]

Xây dựng mô hình hồi quy tuyến tính

Biến sử dụng

medv: giá trị trung vị của nhà (biến mục tiêu)
lstat: tỷ lệ % dân số có thu nhập thấp
rm: số phòng trung bình mỗi căn nhà

# Fix the random seed so the partition can be reproduced
set.seed(123)

# Pick the training rows: 80% of the observations, partitioned on medv
train_index <- createDataPartition(y = Boston$medv, p = 0.8, list = FALSE)

# Training rows are the selected indices; test rows are all remaining ones
train_data <- Boston[train_index, ]
test_data <- Boston[setdiff(seq_len(nrow(Boston)), train_index), ]

# Linear regression of medv on lstat and rm, fitted on the training set
model_lm <- lm(formula = medv ~ lstat + rm, data = train_data)

# Inspect coefficients and fit statistics
summary(model_lm)

## 
## Call:
## lm(formula = medv ~ lstat + rm, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.8416  -3.2806  -0.7253   2.1532  27.1937 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.16604    3.30555  -0.655    0.513    
## lstat       -0.70008    0.04858 -14.411   <2e-16 ***
## rm           5.30878    0.46057  11.527   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.24 on 398 degrees of freedom
## Multiple R-squared:  0.673,  Adjusted R-squared:  0.6714 
## F-statistic: 409.6 on 2 and 398 DF,  p-value: < 2.2e-16

Dự đoán và đánh giá mô hình

# Predicted house values for the held-out test set
pred_lm <- predict(object = model_lm, newdata = test_data)

# Mean Absolute Error and Mean Squared Error of the test predictions
mae_lm <- Metrics::mae(actual = test_data$medv, predicted = pred_lm)
mse_lm <- Metrics::mse(actual = test_data$medv, predicted = pred_lm)

# Print the MAE
mae_lm

## [1] 4.111603

# Print the MSE
mse_lm

## [1] 33.66115

Câu 2

Load dữ liệu và xây dựng mô hình Logistic Regression

# ISLR chứa dataset Default
library(ISLR)

# pROC dùng để vẽ ROC curve và tính AUC
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:Metrics':
## 
##     auc

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Make the Default data set available in the workspace
data(Default)

# Reproducible 80/20 train/test split, partitioned on the response
set.seed(123)
train_index2 <- createDataPartition(y = Default$default, p = 0.8, list = FALSE)
train_data2 <- Default[train_index2, ]
test_data2 <- Default[setdiff(seq_len(nrow(Default)), train_index2), ]

# Logistic regression: default explained by balance and income
model_logit <- glm(
  formula = default ~ balance + income,
  family = binomial,
  data = train_data2
)

# Inspect the fitted model
summary(model_logit)

## 
## Call:
## glm(formula = default ~ balance + income, family = binomial, 
##     data = train_data2)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.186e+01  5.007e-01 -23.691  < 2e-16 ***
## balance      5.715e-03  2.574e-04  22.201  < 2e-16 ***
## income       2.708e-05  5.670e-06   4.777 1.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1252.2  on 7998  degrees of freedom
## AIC: 1258.2
## 
## Number of Fisher Scoring iterations: 8

Dự đoán xác suất

# Predicted probability of default for every row of the test set
# (type = "response" asks predict() for probabilities)
prob_logit <- predict(
  object = model_logit,
  newdata = test_data2,
  type = "response"
)

Vẽ ROC curve và tính AUC

# Build the ROC object.
# - levels = c(control, case): "No" is the negative class, "Yes" the
#   positive class.
# - direction = "<": controls are expected to have lower predicted
#   probabilities than cases.
# Both are stated explicitly (they match what pROC previously auto-detected)
# so the result does not depend on pROC's guesses. roc() and auc() are
# namespace-qualified because `auc` is exported by both Metrics and pROC, and
# the unqualified name resolves to whichever package was attached last.
roc_obj <- pROC::roc(
  response = test_data2$default,
  predictor = prob_logit,
  levels = c("No", "Yes"),
  direction = "<"
)

# Plot the ROC curve
plot(roc_obj, col = "blue", main = "ROC Curve - Logistic Regression")

# Area under the ROC curve
auc_value <- pROC::auc(roc_obj)

# Print the AUC
auc_value

## Area under the curve: 0.952

Ma trận nhầm lẫn (Confusion Matrix)

# Classify as "Yes" when the predicted probability exceeds 0.5, otherwise
# "No", using the same factor levels as the observed response
pred_class <- factor(
  ifelse(prob_logit > 0.5, "Yes", "No"),
  levels = levels(test_data2$default)
)

# Cross-tabulate predictions (rows) against observed classes (columns)
confusion_matrix <- table(Predicted = pred_class, Actual = test_data2$default)

# Print the confusion matrix
confusion_matrix

##          Actual
## Predicted   No  Yes
##       No  1922   49
##       Yes   11   17

Bài kiểm tra số 3

Lê Xuân Nhất

2026-03-05

Câu 1

Load thư viện và dữ liệu

Kiểm tra ngoại lai (Outliers)

Phát hiện ngoại lai bằng phương pháp IQR

Xử lý ngoại lai

Xây dựng mô hình hồi quy tuyến tính

Dự đoán và đánh giá mô hình

Câu 2

Load dữ liệu và xây dựng mô hình Logistic Regression

Dự đoán xác suất

Vẽ ROC curve và tính AUC

Ma trận nhầm lẫn (Confusion Matrix)