1. Dataset: Housing price

# Load the housing data set into a data frame.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
df <- read.csv(file = "C:/Users/Momo/Desktop/Machine Learning/Housing.csv")

2. Tách dataset

# Separate the target variable y from the predictor columns X.
# drop = FALSE keeps X a data frame even if only one predictor remains.
y <- df$price
X <- df[, !names(df) %in% "price", drop = FALSE]

# Fix the seed so the random split is reproducible.
set.seed(123)

# Draw 80% of the row numbers for training.
# seq_len() is safe for an empty data frame (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train_indices <- sample(seq_len(nrow(df)), size = floor(0.8 * nrow(df)))

# Build the train and test sets.
# The test rows are selected positively: df[-integer(0), ] would return zero
# rows instead of all rows if train_indices were ever empty.
train_set <- df[train_indices, ]
test_set <- df[setdiff(seq_len(nrow(df)), train_indices), ]

3. DT model: Regression

library(rpart)

## Warning: package 'rpart' was built under R version 4.2.3

library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.2.3

# Candidate tree depths to evaluate. The tuneGrid column is named after the
# tuning parameter of method "rpart2" (`maxdepth`); the legacy dot-prefixed
# spelling `.maxdepth` is avoided because the fitted object reports the
# parameter as `maxdepth` (see the results table printed below).
grid <- expand.grid(maxdepth = 1:20)

# 5-fold cross-validation as the resampling scheme.
control <- trainControl(method = "cv", number = 5)

# Grid search: fit one tree per candidate depth and score it by
# cross-validation on the training set.
train_model <- train(
  price ~ .,
  data = train_set,
  method = "rpart2",
  trControl = control,
  tuneGrid = grid
)

# Print the resampling results and the selected depth.
train_model

## CART 
## 
## 436 samples
##  12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 348, 349, 349, 349, 349 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE     Rsquared   MAE      
##    1        1644366  0.2275180  1254469.5
##    2        1563819  0.3103600  1170876.8
##    3        1468249  0.3883325  1112458.1
##    4        1442528  0.4148353  1060303.1
##    5        1398364  0.4489784  1046841.8
##    6        1389617  0.4526032  1027071.0
##    7        1365464  0.4706554  1000152.6
##    8        1360458  0.4761426   997696.3
##    9        1338287  0.4934489   972601.6
##   10        1367388  0.4769118  1001026.4
##   11        1352189  0.4871915   986719.4
##   12        1349728  0.4904161   985056.8
##   13        1361160  0.4873290   990948.4
##   14        1361160  0.4873290   990948.4
##   15        1361160  0.4873290   990948.4
##   16        1361160  0.4873290   990948.4
##   17        1361160  0.4873290   990948.4
##   18        1361160  0.4873290   990948.4
##   19        1361160  0.4873290   990948.4
##   20        1361160  0.4873290   990948.4
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 9.

# Extract the depth selected by cross-validation.
# The bestTune column is named `maxdepth` (no leading dot). `$.maxdepth`
# matches no column and returns NULL, which made paste() print an empty value.
best_max_depth <- train_model$bestTune$maxdepth
print(paste("Best max-depth:", best_max_depth))

## [1] "Best max-depth: 9"

# Predict prices for the held-out test set and report the mean squared error.
predictions <- predict(train_model, newdata = test_set)
mse <- mean((predictions - test_set$price)^2)
print(mse)

## [1] 2.001963e+12

4. Visualization

library(ggplot2)
# Scatter plot of actual vs. predicted house prices on the test set.
ggplot(
  data = data.frame(actual = test_set$price, predicted = predictions),
  mapping = aes(x = actual, y = predicted)
) +
  geom_point(color = "blue") +
  ggtitle("Biểu đồ phân tán giá nhà thực tế vs. dự đoán") +
  xlab("Giá nhà thực tế") +
  ylab("Giá nhà dự đoán") +
  theme_minimal()

library(scales)

## Warning: package 'scales' was built under R version 4.2.3

# Same actual vs. predicted scatter plot, with both axes labelled using the
# `comma` formatter from scales instead of the default axis labels.
ggplot(
  data = data.frame(actual = test_set$price, predicted = predictions),
  mapping = aes(x = actual, y = predicted)
) +
  geom_point(color = "blue") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Biểu đồ phân tán giá nhà thực tế vs. dự đoán",
    x = "Giá nhà thực tế",
    y = "Giá nhà dự đoán"
  ) +
  theme_minimal()

5. Performance metrics

# Compute regression metrics for a vector of actual values and predictions.
# Returns a list with mse, rmse, mae, r2 (1 - SSE / SST) and mape (in percent).
regression_metrics <- function(actual, predicted) {
  mse_value <- mean((actual - predicted)^2)
  list(
    mse = mse_value,
    rmse = sqrt(mse_value),
    mae = mean(abs(actual - predicted)),
    r2 = 1 - (sum((actual - predicted)^2) / sum((actual - mean(actual))^2)),
    mape = mean(abs((actual - predicted) / actual)) * 100
  )
}

# Predictions on the training set and on the test set.
train_predictions <- predict(train_model, train_set)
test_predictions <- predict(train_model, test_set)

# Training-set metrics, unpacked into the variables printed further down.
train_metrics <- regression_metrics(train_set$price, train_predictions)
train_mse <- train_metrics$mse
train_rmse <- train_metrics$rmse
train_mae <- train_metrics$mae
train_r2 <- train_metrics$r2
train_mape <- train_metrics$mape

# Test-set metrics, unpacked the same way.
test_metrics <- regression_metrics(test_set$price, test_predictions)
test_mse <- test_metrics$mse
test_rmse <- test_metrics$rmse
test_mae <- test_metrics$mae
test_r2 <- test_metrics$r2
test_mape <- test_metrics$mape

# Print the results: one labelled line per metric, training set first and
# then the test set. cat() separates its arguments with a space, which is why
# the captured output shows e.g. "18.11942 %". Each `##` line below is the
# captured console output of the cat() call directly above it.
cat("Training Set Performance:\n")

## Training Set Performance:

cat("MSE:", train_mse, "\n")

## MSE: 1.251941e+12

cat("RMSE:", train_rmse, "\n")

## RMSE: 1118902

cat("MAE:", train_mae, "\n")

## MAE: 805041.6

cat("R-squared:", train_r2, "\n")

## R-squared: 0.6378368

cat("MAPE:", train_mape, "%\n\n")

## MAPE: 18.11942 %

cat("Test Set Performance:\n")

## Test Set Performance:

cat("MSE:", test_mse, "\n")

## MSE: 2.001963e+12

cat("RMSE:", test_rmse, "\n")

## RMSE: 1414907

cat("MAE:", test_mae, "\n")

## MAE: 1058280

cat("R-squared:", test_r2, "\n")

## R-squared: 0.4445534

cat("MAPE:", test_mape, "%\n")

## MAPE: 24.61423 %

6. Decision tree plot

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

# Draw the final fitted tree with the default rpart.plot settings.
rpart.plot(x = train_model$finalModel, main = "Decision Tree Plot")

################################################################################
# Draw the same tree again with explicit display options.
rpart.plot(
  x = train_model$finalModel,
  main = "Decision Tree Plot",  # plot title
  type = 4,                     # node-labelling layout
  extra = 101,                  # extra per-node information (observation counts)
  under = TRUE,                 # put the extra text under the node box
  cex = 0.8                     # text size
)

7. Variable Importance

# Compute unscaled variable importance for the tuned model and print it.
importance <- varImp(object = train_model, scale = FALSE)
print(importance)

## rpart2 variable importance
## 
##                                  Overall
## airconditioningyes               1.58481
## stories                          1.46013
## area                             1.13645
## bathrooms                        0.92632
## parking                          0.89208
## bedrooms                         0.74806
## furnishingstatusunfurnished      0.58454
## prefareayes                      0.54833
## mainroadyes                      0.39517
## basementyes                      0.26479
## furnishingstatussemi-furnished   0.09635
## `furnishingstatussemi-furnished` 0.00000
## hotwaterheatingyes               0.00000
## guestroomyes                     0.00000

# Build a two-column data frame (Importance, Variable) from the importance
# table. The predictor names are kept both as row names and as a column.
# Assumes the table has a single score column, as in the "Overall" output.
importance_df <- data.frame(
  Importance = importance$importance[[1]],
  Variable = rownames(importance$importance),
  row.names = rownames(importance$importance),
  stringsAsFactors = FALSE
)

# Horizontal bar chart of importance, predictors ordered by their score.
ggplot(
  data = importance_df,
  mapping = aes(x = reorder(Variable, Importance), y = Importance)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Độ Quan Trọng của Các Predictors") +
  xlab("Predictor") +
  ylab("Độ Quan Trọng") +
  theme_minimal()

################################################################################
library(ggplot2)
library(scales)

# Styled scatter plot of actual vs. predicted prices on the test set, with a
# fitted linear trend line and comma-formatted axes.
ggplot(
  data = data.frame(price = test_set$price, predictions = predictions),
  mapping = aes(x = price, y = predictions)
) +
  # Semi-transparent blue points
  geom_point(size = 3, alpha = 0.7, color = "blue") +
  # Red linear-regression line without a confidence band
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Biểu Đồ Phân Tán Giá Nhà Thực Tế vs. Dự Đoán",
    x = "Giá Nhà Thực Tế",
    y = "Giá Nhà Dự Đoán"
  ) +
  # Larger base font size
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18),  # centred title
    axis.title = element_text(size = 14),               # axis title size
    axis.text = element_text(size = 12)                 # axis tick label size
  )

## `geom_smooth()` using formula = 'y ~ x'

# Styled horizontal bar chart of predictor importance.
ggplot(
  data = importance_df,
  mapping = aes(x = reorder(Variable, Importance), y = Importance)
) +
  # Bars drawn at the importance values, filled in steel blue
  geom_col(fill = "steelblue") +
  # Flip the axes so predictor names run along the vertical axis
  coord_flip() +
  labs(
    title = "Độ Quan Trọng của Các Predictors",
    x = "Predictor",
    y = "Độ Quan Trọng"
  ) +
  # Larger base font size
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18),  # centred title
    axis.title = element_text(size = 14),               # axis title size
    axis.text = element_text(size = 12)                 # axis tick label size
  )

8. Comparisons

# Resampling results of the tuned model; supplies the depths to compare.
results <- train_model$results

# For every candidate depth, fit ONE rpart tree on the training set and record
# its MAE on both the training and the test set. The previous version fitted
# each tree twice, once per metric. vapply() guarantees a numeric 2 x n matrix
# (rows "train" and "test") regardless of how many depths there are.
depth_mae <- vapply(
  results$maxdepth,
  function(depth) {
    model <- rpart(
      price ~ .,
      data = train_set,
      control = rpart.control(maxdepth = depth)
    )
    c(
      train = mean(abs(train_set$price - predict(model, train_set))),
      test = mean(abs(test_set$price - predict(model, test_set)))
    )
  },
  FUN.VALUE = c(train = 0, test = 0)
)

# NOTE: train_mae / test_mae were scalar metrics in the performance-metrics
# section; from here on they hold one MAE value per depth.
train_mae <- unname(depth_mae["train", ])
test_mae <- unname(depth_mae["test", ])

# One row per depth with the training and test MAE side by side.
depth_mae_comparison <- data.frame(
  maxdepth = results$maxdepth,
  Train_MAE = train_mae,
  Test_MAE = test_mae
)


# Plot training and test MAE against tree depth as two coloured series.
# Line thickness uses `linewidth`: the `size` aesthetic for lines is
# deprecated since ggplot2 3.4.0 and raised a warning. Points keep `size`.
ggplot(data = depth_mae_comparison, mapping = aes(x = maxdepth)) +
  # Training-set MAE: solid line with points
  geom_line(aes(y = Train_MAE, color = "Training Set"), linewidth = 1) +
  geom_point(aes(y = Train_MAE, color = "Training Set"), size = 3) +
  # Test-set MAE: dashed line with points
  geom_line(
    aes(y = Test_MAE, color = "Test Set"),
    linewidth = 1,
    linetype = "dashed"
  ) +
  geom_point(aes(y = Test_MAE, color = "Test Set"), size = 3) +
  labs(
    title = "So Sánh MAE giữa Tập Huấn Luyện và Tập Kiểm Tra",
    x = "Maxdepth",
    y = "Mean Absolute Error (MAE)"
  ) +
  scale_color_manual(
    values = c("Training Set" = "blue", "Test Set" = "red")
  ) +
  # Larger base font size
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 18),  # centred title
    axis.title = element_text(size = 14),               # axis title size
    axis.text = element_text(size = 12)                 # axis tick label size
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Decision Tree

Vu Thien

2024-08-04

1. Dataset: Housing price

2. Tách dataset

3. DT model: Regression

4. Visualization

5. Performance metrics

6. Decision tree plot

7. Variable Importance

8. Comparisons