# Load the built-in mtcars data set into the workspace
data("mtcars")

# Preview the first six rows
head(mtcars, n = 6L)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

# Compact overview of the data: column names, types and leading values
str(object = mtcars)

## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

1.2. Xây dựng mô hình hồi quy tuyến tính

Mô hình:

\[ \mathrm{mpg} = \beta_0 + \beta_1\,\mathrm{hp} + \beta_2\,\mathrm{wt} + \varepsilon \]

# Fit a linear regression of mpg on hp and wt
model <- lm(
  formula = mpg ~ hp + wt,
  data = mtcars
)

# Coefficient table, residual summary and goodness-of-fit statistics
summary(object = model)

## 
## Call:
## lm(formula = mpg ~ hp + wt, data = mtcars)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.941 -1.600 -0.182  1.050  5.854 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.22727    1.59879  23.285  < 2e-16 ***
## hp          -0.03177    0.00903  -3.519  0.00145 ** 
## wt          -3.87783    0.63273  -6.129 1.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared:  0.8268, Adjusted R-squared:  0.8148 
## F-statistic: 69.21 on 2 and 29 DF,  p-value: 9.109e-12

1.3. Tính giá trị dự đoán

# Predicted values from the fitted model; no new data is passed to predict()
pred <- predict(object = model)

# Preview the first six predicted values
head(pred, n = 6L)

##         Mazda RX4     Mazda RX4 Wag        Datsun 710    Hornet 4 Drive 
##          23.57233          22.58348          25.27582          21.26502 
## Hornet Sportabout           Valiant 
##          18.32727          20.47382

1.4. Tính R²

# Pull the R-squared value out of the model summary
r2 <- summary(model)[["r.squared"]]

r2

## [1] 0.8267855

1.5. Tính RMSE

Công thức:

\[ \mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2} \]

# Observed response values
actual <- mtcars[["mpg"]]

# RMSE: square root of the mean squared difference between the predicted
# and the observed values
rmse <- sqrt(mean((pred - actual)^2))

rmse

## [1] 2.468854

Câu 2 – Logistic Regression với `iris`

2.1. Tải dữ liệu và lọc 2 lớp

# Load the built-in iris data set
data(iris)

# Keep only two classes (everything except virginica). subset() filters the
# rows but leaves the unused "virginica" factor level in place, so
# droplevels() removes it; otherwise table() keeps reporting a third class
# with zero observations.
iris2 <- droplevels(subset(iris, Species != "virginica"))

# Check the class counts
table(iris2$Species)

## 
##     setosa versicolor 
##         50         50

2.2. Chuyển Species thành nhị phân

Logistic regression cần biến phụ thuộc nhị phân (0/1), nên `Species` được mã hóa: setosa = 1, các loài còn lại = 0.

# Recode the response as 0/1: setosa becomes 1, everything else becomes 0
iris2$Species <- as.numeric(iris2$Species == "setosa")

# Preview the recoded data
head(iris2, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1

2.3. Xây dựng mô hình Logistic Regression

# Fit a logistic regression of the binary Species indicator on the four
# sepal/petal measurements.
# NOTE(review): the recorded run below reports non-convergence and fitted
# probabilities of 0 or 1 -- this looks like perfect separation of the two
# classes, in which case the coefficient estimates are not meaningful; confirm.
log_model <- glm(
  formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  family = binomial,
  data = iris2
)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Coefficient table and deviance statistics of the fitted model
summary(object = log_model)

## 
## Call:
## glm(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, family = binomial, data = iris2)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)
## (Intercept)      -6.556 601950.494       0        1
## Sepal.Length      9.879 194223.316       0        1
## Sepal.Width       7.418  92924.482       0        1
## Petal.Length    -19.054 144516.044       0        1
## Petal.Width     -25.033 216059.004       0        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.3863e+02  on 99  degrees of freedom
## Residual deviance: 1.3166e-09  on 95  degrees of freedom
## AIC: 10
## 
## Number of Fisher Scoring iterations: 25

2.4. Dự đoán xác suất

# Predicted probabilities on the response scale; no new data is passed
prob <- predict(object = log_model, type = "response")

# Preview the first six probabilities
head(prob, n = 6L)

## 1 2 3 4 5 6 
## 1 1 1 1 1 1

2.5. Chuyển sang nhãn dự đoán

Ngưỡng phân loại thường dùng là 0.5: xác suất lớn hơn 0.5 được gán nhãn 1, ngược lại gán nhãn 0.

# Assign label 1 when the predicted probability exceeds 0.5, otherwise 0
pred_class <- ifelse(test = prob > 0.5, yes = 1, no = 0)

# Preview the first six predicted labels
head(pred_class, n = 6L)

## 1 2 3 4 5 6 
## 1 1 1 1 1 1

2.6. Ma trận nhầm lẫn (Confusion Matrix)

# Confusion matrix: rows are predicted labels, columns are actual labels.
# Both sides are converted to factors with the full label set (0, 1) so the
# table is always 2 x 2 with the same ordering on both axes, even when one
# label never occurs. Without this, a missing label yields a non-square
# table whose diagonal no longer counts the correct predictions.
conf_matrix <- table(
  Predicted = factor(pred_class, levels = c(0, 1)),
  Actual = factor(iris2$Species, levels = c(0, 1))
)

conf_matrix

##          Actual
## Predicted  0  1
##         0 50  0
##         1  0 50

2.7. Tính độ chính xác (Accuracy)

Công thức:

\[ \mathrm{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN} \]

# Accuracy: sum of the diagonal of the confusion matrix divided by the sum of
# all its cells. This assumes conf_matrix is square with the same label order
# on rows and columns, so that the diagonal holds the correct predictions.
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)

accuracy

## [1] 1

3. Biểu đồ phân tán

# Scatter plot of mpg (y axis) against wt (x axis)
plot(
  x = mtcars[["wt"]],
  y = mtcars[["mpg"]],
  main = "MPG vs Weight",
  xlab = "Weight",
  ylab = "MPG"
)

KiemTra3

HoanBuCon

2026-03-05

1.2. Xây dựng mô hình hồi quy tuyến tính

1.3. Tính giá trị dự đoán

1.4. Tính R²

1.5. Tính RMSE

Câu 2 – Logistic Regression với `iris`

2.1. Tải dữ liệu và lọc 2 lớp

2.2. Chuyển Species thành nhị phân

2.3. Xây dựng mô hình Logistic Regression

2.4. Dự đoán xác suất

2.5. Chuyển sang nhãn dự đoán

2.6. Ma trận nhầm lẫn (Confusion Matrix)

2.7. Tính độ chính xác (Accuracy)

3. Biểu đồ phân tán

KiemTra3

HoanBuCon

2026-03-05

1.2. Xây dựng mô hình hồi quy tuyến tính

1.3. Tính giá trị dự đoán

1.4. Tính R²

1.5. Tính RMSE

Câu 2 – Logistic Regression với iris

2.1. Tải dữ liệu và lọc 2 lớp

2.2. Chuyển Species thành nhị phân

2.3. Xây dựng mô hình Logistic Regression

2.4. Dự đoán xác suất

2.5. Chuyển sang nhãn dự đoán

2.6. Ma trận nhầm lẫn (Confusion Matrix)

2.7. Tính độ chính xác (Accuracy)

3. Biểu đồ phân tán

Câu 2 – Logistic Regression với `iris`