030239230036-Nguyễn-Khánh-Dương.knit

# Cài đặt và load các thư viện cần thiết
#install.packages("readxl")
#install.packages("forecast")
#install.packages("ggplot2")
#install.packages("dplyr")

library(readxl)   # Để đọc file Excel
library(forecast) # Để xây dựng các mô hình dự báo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggplot2)  # Để vẽ đồ thị
library(dplyr)    # Để xử lý dữ liệu

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Đọc dữ liệu từ tệp Excel
file_path <- "D:\\Năm 2 - HK2 - Đợt 2\\R\\GiaVang.xlsx"  # Đường dẫn đến file Excel
data <- read_excel(file_path)

# Chuyển đổi cột Date sang kiểu Date
data$Date <- as.Date(data$Date)

# Chuyển đổi giá vàng thành số và loại bỏ dấu phẩy
data$Gold_Price <- as.numeric(gsub(",", "", data$Gold_Price))

# Kiểm tra xem có giá trị NA nào không
if (any(is.na(data$Gold_Price))) {
  data <- data %>% filter(!is.na(Gold_Price) & !is.na(Date))  # Loại bỏ hàng có NA
}

# Hiển thị dữ liệu để kiểm tra
head(data)

## # A tibble: 6 × 5
##   Date       Gold_Price   PC1   PC2   PC3
##   <date>          <dbl> <dbl> <dbl> <dbl>
## 1 2022-01-04      1815.  5.07 -5.21 0.305
## 2 2022-01-05      1825.  4.85 -5.09 0.559
## 3 2022-01-06      1789.  4.26 -5.06 0.324
## 4 2022-01-07      1797.  4.41 -5.18 0.500
## 5 2022-01-08      1799.  4.07 -5.07 0.220
## 6 2022-01-09      1818.  4.48 -5.20 0.303

# Xác định ngày bắt đầu và kết thúc cho tập huấn luyện
train_start <- as.Date("2021-01-04")
train_end <- as.Date("2023-12-31")  # Cập nhật ngày kết thúc cho tập huấn luyện

# Xác định ngày bắt đầu và kết thúc cho tập kiểm tra
test_start <- as.Date("2024-01-01")
test_end <- as.Date("2024-01-10")

# Chia tập dữ liệu thành tập huấn luyện và tập kiểm tra
train_data <- subset(data, Date >= train_start & Date <= train_end)
test_data <- subset(data, Date >= test_start & Date <= test_end)

# Kiểm tra số lượng quan sát sau khi chia
cat("Số quan sát trong tập huấn luyện:", nrow(train_data), "\n")

## Số quan sát trong tập huấn luyện: 514

cat("Số quan sát trong tập kiểm tra:", nrow(test_data), "\n")

## Số quan sát trong tập kiểm tra: 7

# Tìm độ trễ tối ưu cho mô hình AR với việc tính AIC
aic_values <- sapply(1:10, function(p) {
  model <- arima(train_data$Gold_Price, order = c(p, 0, 0))
  return(AIC(model))
})

# Lựa chọn độ trễ có giá trị AIC thấp nhất
optimal_p_ar <- which.min(aic_values)
cat("Độ trễ tối ưu cho mô hình AR:", optimal_p_ar, "\n")

## Độ trễ tối ưu cho mô hình AR: 8

# Xây dựng mô hình AR với độ trễ tối ưu
ar_model <- arima(train_data$Gold_Price, order = c(optimal_p_ar, 0, 0))

# Dự đoán cho tập kiểm tra
ar_forecast <- forecast(ar_model, h = nrow(test_data))

# Tính MAE và RMSE cho mô hình AR
ar_mae <- mean(abs(ar_forecast$mean - test_data$Gold_Price))
ar_rmse <- sqrt(mean((ar_forecast$mean - test_data$Gold_Price)^2))

cat("MAE cho mô hình AR:", ar_mae, "\n")

## MAE cho mô hình AR: 22.67913

cat("RMSE cho mô hình AR:", ar_rmse, "\n")

## RMSE cho mô hình AR: 24.72662

# Tạo bảng kết quả cho mô hình AR
ar_results <- data.frame(
  Date = test_data$Date,
  Real_Value = test_data$Gold_Price,
  Forecast_Value = ar_forecast$mean,
  MAE = rep(ar_mae, nrow(test_data)),  # Lặp lại MAE cho mỗi hàng
  RMSE = rep(ar_rmse, nrow(test_data))  # Lặp lại RMSE cho mỗi hàng
)

# Hiển thị bảng kết quả cho mô hình AR
cat("Bảng kết quả cho mô hình AR:\n")

## Bảng kết quả cho mô hình AR:

print(ar_results)

##         Date Real_Value Forecast_Value      MAE     RMSE
## 1 2024-01-02     2073.4       2075.092 22.67913 24.72662
## 2 2024-01-03     2042.8       2073.266 22.67913 24.72662
## 3 2024-01-04     2050.0       2071.066 22.67913 24.72662
## 4 2024-01-05     2049.8       2066.999 22.67913 24.72662
## 5 2024-01-08     2033.5       2061.814 22.67913 24.72662
## 6 2024-01-09     2033.0       2061.206 22.67913 24.72662
## 7 2024-01-10     2027.8       2059.611 22.67913 24.72662

# Xây dựng mô hình ARIMA
auto_arima_model <- auto.arima(train_data$Gold_Price)

# Dự đoán cho tập kiểm tra
arima_forecast <- forecast(auto_arima_model, h = nrow(test_data))

# Tính MAE và RMSE cho mô hình ARIMA
arima_mae <- mean(abs(arima_forecast$mean - test_data$Gold_Price))
arima_rmse <- sqrt(mean((arima_forecast$mean - test_data$Gold_Price)^2))

cat("MAE cho mô hình ARIMA:", arima_mae, "\n")

## MAE cho mô hình ARIMA: 27.92857

cat("RMSE cho mô hình ARIMA:", arima_rmse, "\n")

## RMSE cho mô hình ARIMA: 30.96341

# Tạo bảng kết quả cho mô hình ARIMA
arima_results <- data.frame(
  Date = test_data$Date,
  Real_Value = test_data$Gold_Price,
  Forecast_Value = arima_forecast$mean,
  MAE = rep(arima_mae, nrow(test_data)),  # Lặp lại MAE cho mỗi hàng
  RMSE = rep(arima_rmse, nrow(test_data))  # Lặp lại RMSE cho mỗi hàng
)

# Hiển thị bảng kết quả cho mô hình ARIMA
cat("Bảng kết quả cho mô hình ARIMA:\n")

## Bảng kết quả cho mô hình ARIMA:

print(arima_results)

##         Date Real_Value Forecast_Value      MAE     RMSE
## 1 2024-01-02     2073.4         2071.8 27.92857 30.96341
## 2 2024-01-03     2042.8         2071.8 27.92857 30.96341
## 3 2024-01-04     2050.0         2071.8 27.92857 30.96341
## 4 2024-01-05     2049.8         2071.8 27.92857 30.96341
## 5 2024-01-08     2033.5         2071.8 27.92857 30.96341
## 6 2024-01-09     2033.0         2071.8 27.92857 30.96341
## 7 2024-01-10     2027.8         2071.8 27.92857 30.96341

# Vẽ đồ thị dự đoán của hai mô hình so với giá thực tế
ggplot() +
  geom_line(data = test_data, aes(x = Date, y = Gold_Price, color = "Giá thực tế")) +
  geom_line(data = data.frame(Date = test_data$Date, Value = ar_forecast$mean), 
            aes(x = Date, y = Value, color = "Dự đoán AR")) +
  geom_line(data = data.frame(Date = test_data$Date, Value = arima_forecast$mean), 
            aes(x = Date, y = Value, color = "Dự đoán ARIMA")) +
  geom_point(data = test_data, aes(x = Date, y = Gold_Price, color = "Giá thực tế"), shape = 16, size = 3) +
  geom_point(data = data.frame(Date = test_data$Date, Value = ar_forecast$mean), 
             aes(x = Date, y = Value, color = "Dự đoán AR"), shape = 17, size = 3) +
  geom_point(data = data.frame(Date = test_data$Date, Value = arima_forecast$mean), 
             aes(x = Date, y = Value, color = "Dự đoán ARIMA"), shape = 18, size = 3) +
  geom_text(data = test_data, aes(x = Date, y = Gold_Price, label = Gold_Price), 
            vjust = -1, color = "black", size = 3) +
  geom_text(data = data.frame(Date = test_data$Date, Value = ar_forecast$mean), 
            aes(x = Date, y = Value, label = round(Value, 2)), 
            vjust = -1, color = "blue", size = 3) +
  geom_text(data = data.frame(Date = test_data$Date, Value = arima_forecast$mean), 
            aes(x = Date, y = Value, label = round(Value, 2)), 
            vjust = -1, color = "red", size = 3) +
  labs(title = "So sánh giá thực tế và dự đoán giá vàng",
       x = "Ngày", 
       y = "Giá vàng") +
  scale_color_manual(values = c("Giá thực tế" = "black", 
                                "Dự đoán AR" = "blue", 
                                "Dự đoán ARIMA" = "red")) +
  theme_minimal()

## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

library(readxl)
library(dplyr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Chuyển biến Date về dạng Date
data$Date <- ymd(data$Date)

# Chuyển các biến về số (nếu cần)
data <- data %>%
  mutate(
    Gold_Price = as.numeric(Gold_Price),
    PC1 = as.numeric(PC1),
    PC2 = as.numeric(PC2),
    PC3 = as.numeric(PC3)
  )

# Sắp xếp dữ liệu theo Date
data <- data %>% arrange(Date)

# Chia dữ liệu train/test
train <- filter(data, Date >= as.Date("2021-01-04") & Date <= as.Date("2023-12-29"))
test  <- filter(data, Date >= as.Date("2024-01-01") & Date <= as.Date("2024-01-10"))

# Kiểm tra
dim(train); dim(test)

## [1] 514   5

## [1] 7 5

#Mô hình đa biến VAR (Vector Autoregression)

library(vars)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: urca

## Loading required package: lmtest

# Chọn biến phụ thuộc và độc lập (VAR model thường các biến cùng vai trò)
# Giả sử cùng dùng 4 biến trong VAR
var_data <- dplyr::select(train, Gold_Price, PC1, PC2, PC3)

# Xây dựng mô hình VAR, chọn bậc trễ tối ưu bằng criteria (AIC, BIC)
lag_select <- VARselect(var_data, lag.max = 10, type = "const")
lag_p <- lag_select$selection["AIC(n)"]

var_model <- VAR(var_data, p = lag_p, type = "const")

summary(var_model)

## 
## VAR Estimation Results:
## ========================= 
## Endogenous variables: Gold_Price, PC1, PC2, PC3 
## Deterministic variables: const 
## Sample size: 510 
## Log Likelihood: -2197.739 
## Roots of the characteristic polynomial:
## 0.9986 0.9919 0.9919 0.9461 0.7435 0.5083 0.5083 0.5047 0.5047 0.461 0.4469 0.4469 0.4074 0.4074 0.3417 0.03486
## Call:
## VAR(y = var_data, p = lag_p, type = "const")
## 
## 
## Estimation results for equation Gold_Price: 
## =========================================== 
## Gold_Price = Gold_Price.l1 + PC1.l1 + PC2.l1 + PC3.l1 + Gold_Price.l2 + PC1.l2 + PC2.l2 + PC3.l2 + Gold_Price.l3 + PC1.l3 + PC2.l3 + PC3.l3 + Gold_Price.l4 + PC1.l4 + PC2.l4 + PC3.l4 + const 
## 
##               Estimate Std. Error t value Pr(>|t|)    
## Gold_Price.l1  0.85986    0.05400  15.924  < 2e-16 ***
## PC1.l1         6.01512    1.94571   3.091  0.00210 ** 
## PC2.l1        -4.28908    4.37298  -0.981  0.32717    
## PC3.l1        -1.86186    3.24337  -0.574  0.56620    
## Gold_Price.l2  0.03831    0.07117   0.538  0.59058    
## PC1.l2        -6.80501    2.59031  -2.627  0.00888 ** 
## PC2.l2        -0.26428    6.20985  -0.043  0.96607    
## PC3.l2         6.59629    3.78311   1.744  0.08185 .  
## Gold_Price.l3  0.17678    0.07069   2.501  0.01272 *  
## PC1.l3        -0.42755    2.61230  -0.164  0.87006    
## PC2.l3         9.86068    6.15132   1.603  0.10957    
## PC3.l3         0.35669    3.80579   0.094  0.92537    
## Gold_Price.l4 -0.11875    0.05392  -2.202  0.02811 *  
## PC1.l4         2.36582    1.97177   1.200  0.23078    
## PC2.l4        -4.97611    4.33105  -1.149  0.25114    
## PC3.l4        -4.24200    3.27473  -1.295  0.19580    
## const         82.86769   25.32632   3.272  0.00114 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 16.58 on 493 degrees of freedom
## Multiple R-Squared: 0.9765,  Adjusted R-squared: 0.9758 
## F-statistic:  1282 on 16 and 493 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation PC1: 
## ==================================== 
## PC1 = Gold_Price.l1 + PC1.l1 + PC2.l1 + PC3.l1 + Gold_Price.l2 + PC1.l2 + PC2.l2 + PC3.l2 + Gold_Price.l3 + PC1.l3 + PC2.l3 + PC3.l3 + Gold_Price.l4 + PC1.l4 + PC2.l4 + PC3.l4 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## Gold_Price.l1  0.0011103  0.0014326   0.775 0.438693    
## PC1.l1         0.9859093  0.0516201  19.099  < 2e-16 ***
## PC2.l1        -0.1241418  0.1160161  -1.070 0.285125    
## PC3.l1        -0.0328065  0.0860475  -0.381 0.703175    
## Gold_Price.l2 -0.0007877  0.0018880  -0.417 0.676727    
## PC1.l2         0.0812785  0.0687216   1.183 0.237490    
## PC2.l2         0.2359222  0.1647486   1.432 0.152774    
## PC3.l2        -0.1819346  0.1003668  -1.813 0.070487 .  
## Gold_Price.l3  0.0010814  0.0018755   0.577 0.564473    
## PC1.l3        -0.1794930  0.0693048  -2.590 0.009884 ** 
## PC2.l3        -0.0227723  0.1631960  -0.140 0.889081    
## PC3.l3         0.3410608  0.1009686   3.378 0.000788 ***
## Gold_Price.l4 -0.0009064  0.0014306  -0.634 0.526654    
## PC1.l4         0.0942021  0.0523116   1.801 0.072347 .  
## PC2.l4        -0.0711751  0.1149038  -0.619 0.535918    
## PC3.l4        -0.1251278  0.0868793  -1.440 0.150432    
## const         -0.9320537  0.6719130  -1.387 0.166018    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 0.4399 on 493 degrees of freedom
## Multiple R-Squared: 0.9854,  Adjusted R-squared: 0.9849 
## F-statistic:  2081 on 16 and 493 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation PC2: 
## ==================================== 
## PC2 = Gold_Price.l1 + PC1.l1 + PC2.l1 + PC3.l1 + Gold_Price.l2 + PC1.l2 + PC2.l2 + PC3.l2 + Gold_Price.l3 + PC1.l3 + PC2.l3 + PC3.l3 + Gold_Price.l4 + PC1.l4 + PC2.l4 + PC3.l4 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## Gold_Price.l1  0.0006993  0.0006484   1.079 0.281337    
## PC1.l1        -0.0972697  0.0233619  -4.164  3.7e-05 ***
## PC2.l1         1.0397370  0.0525060  19.802  < 2e-16 ***
## PC3.l1        -0.0714409  0.0389429  -1.835 0.067182 .  
## Gold_Price.l2 -0.0000407  0.0008545  -0.048 0.962028    
## PC1.l2         0.1033141  0.0311017   3.322 0.000961 ***
## PC2.l2        -0.1539001  0.0745611  -2.064 0.039533 *  
## PC3.l2         0.0007501  0.0454235   0.017 0.986832    
## Gold_Price.l3 -0.0007343  0.0008488  -0.865 0.387427    
## PC1.l3        -0.0265140  0.0313656  -0.845 0.398343    
## PC2.l3         0.1039261  0.0738584   1.407 0.160028    
## PC3.l3         0.0246169  0.0456958   0.539 0.590328    
## Gold_Price.l4  0.0004485  0.0006474   0.693 0.488827    
## PC1.l4         0.0080985  0.0236749   0.342 0.732442    
## PC2.l4         0.0048691  0.0520026   0.094 0.925440    
## PC3.l4         0.0350887  0.0393194   0.892 0.372612    
## const         -0.6849045  0.3040909  -2.252 0.024742 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 0.1991 on 493 degrees of freedom
## Multiple R-Squared: 0.9943,  Adjusted R-squared: 0.9941 
## F-statistic:  5335 on 16 and 493 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation PC3: 
## ==================================== 
## PC3 = Gold_Price.l1 + PC1.l1 + PC2.l1 + PC3.l1 + Gold_Price.l2 + PC1.l2 + PC2.l2 + PC3.l2 + Gold_Price.l3 + PC1.l3 + PC2.l3 + PC3.l3 + Gold_Price.l4 + PC1.l4 + PC2.l4 + PC3.l4 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## Gold_Price.l1  0.0008008  0.0008470   0.946 0.344859    
## PC1.l1         0.0860051  0.0305198   2.818 0.005026 ** 
## PC2.l1        -0.1041578  0.0685933  -1.518 0.129534    
## PC3.l1         0.6007681  0.0508747  11.809  < 2e-16 ***
## Gold_Price.l2  0.0010748  0.0011163   0.963 0.336094    
## PC1.l2        -0.1420349  0.0406309  -3.496 0.000515 ***
## PC2.l2         0.1807992  0.0974059   1.856 0.064030 .  
## PC3.l2         0.2876091  0.0593408   4.847 1.68e-06 ***
## Gold_Price.l3 -0.0004441  0.0011089  -0.400 0.688994    
## PC1.l3         0.0771503  0.0409757   1.883 0.060312 .  
## PC2.l3         0.0306470  0.0964879   0.318 0.750904    
## PC3.l3        -0.0460270  0.0596966  -0.771 0.441067    
## Gold_Price.l4 -0.0018653  0.0008458  -2.205 0.027892 *  
## PC1.l4        -0.0209627  0.0309287  -0.678 0.498231    
## PC2.l4        -0.0999585  0.0679357  -1.471 0.141829    
## PC3.l4         0.1520054  0.0513664   2.959 0.003232 ** 
## const          0.8044982  0.3972615   2.025 0.043394 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 0.2601 on 493 degrees of freedom
## Multiple R-Squared: 0.9798,  Adjusted R-squared: 0.9792 
## F-statistic:  1496 on 16 and 493 DF,  p-value: < 2.2e-16 
## 
## 
## 
## Covariance matrix of residuals:
##            Gold_Price       PC1       PC2       PC3
## Gold_Price   274.9417  1.824838 -1.681822  0.892851
## PC1            1.8248  0.193518 -0.008957  0.055094
## PC2           -1.6818 -0.008957  0.039637 -0.001036
## PC3            0.8929  0.055094 -0.001036  0.067647
## 
## Correlation matrix of residuals:
##            Gold_Price     PC1      PC2      PC3
## Gold_Price     1.0000  0.2502 -0.50946  0.20703
## PC1            0.2502  1.0000 -0.10227  0.48153
## PC2           -0.5095 -0.1023  1.00000 -0.02001
## PC3            0.2070  0.4815 -0.02001  1.00000

# Forecast ra 10 bước (kỳ là số hàng test)
var_forecast <- predict(var_model, n.ahead = nrow(test))

# Lấy dự báo biến Gold_Price
gold_forecast <- var_forecast$fcst$Gold_Price[,1]

# So sánh với test
results <- data.frame(
  Date = test$Date,
  Actual = test$Gold_Price,
  Forecast = gold_forecast
)

print(results)

##         Date Actual Forecast
## 1 2024-01-02 2073.4 2069.988
## 2 2024-01-03 2042.8 2069.219
## 3 2024-01-04 2050.0 2068.963
## 4 2024-01-05 2049.8 2068.928
## 5 2024-01-08 2033.5 2068.269
## 6 2024-01-09 2033.0 2067.625
## 7 2024-01-10 2027.8 2067.353

#3. Mô hình ARDL với lựa chọn biến trễ và loại bỏ biến không ý nghĩa

library(dynlm)
library(broom)

# Tạo chuỗi thời gian với Date làm index
train_ts <- train %>% dplyr::select(Date, Gold_Price, PC1, PC2, PC3) %>% arrange(Date)

# Để làm ARDL, bạn cần tạo biến trễ thủ công
# Ở đây tạo trễ 1 cho tất cả biến ví dụ (có thể mở rộng tùy ý)

train_ts <- train_ts %>%
  mutate(
    Gold_Price_Lag1 = lag(Gold_Price, 1),
    PC1_Lag1 = lag(PC1, 1),
    PC2_Lag1 = lag(PC2, 1),
    PC3_Lag1 = lag(PC3, 1)
  ) %>%
  filter(!is.na(Gold_Price_Lag1))  # loại bỏ NA do trễ

# Mô hình đầy đủ với biến trễ
ardl_formula <- as.formula("Gold_Price ~ Gold_Price_Lag1 + PC1 + PC2 + PC3 + PC1_Lag1 + PC2_Lag1 + PC3_Lag1")

ardl_model <- dynlm(ardl_formula, data = train_ts)

summary(ardl_model)

## 
## Time series regression with "numeric" data:
## Start = 1, End = 513
## 
## Call:
## dynlm(formula = ardl_formula, data = train_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.930  -8.003  -0.039   7.811  52.710 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      53.12780   20.31960   2.615 0.009200 ** 
## Gold_Price_Lag1   0.97226    0.01083  89.798  < 2e-16 ***
## PC1               5.01299    1.54680   3.241 0.001270 ** 
## PC2             -40.50217    3.07149 -13.186  < 2e-16 ***
## PC3               8.58541    2.43380   3.528 0.000458 ***
## PC1_Lag1         -4.32325    1.54635  -2.796 0.005375 ** 
## PC2_Lag1         40.48417    3.05922  13.233  < 2e-16 ***
## PC3_Lag1         -8.14817    2.43686  -3.344 0.000888 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.98 on 505 degrees of freedom
## Multiple R-squared:  0.983,  Adjusted R-squared:  0.9827 
## F-statistic:  4162 on 7 and 505 DF,  p-value: < 2.2e-16

# Loại bỏ biến có p-value > 0.1

tidy_ardl <- broom::tidy(ardl_model)

## Warning: The `tidy()` method for objects of class `dynlm` is not maintained by the broom team, and is only supported through the `lm` tidier method. Please be cautious in interpreting and reporting broom output.
## 
## This warning is displayed once per session.

vars_signif <- tidy_ardl %>% filter(term != "(Intercept)" & p.value <= 0.1) %>% pull(term)

# Tạo lại công thức với biến có ý nghĩa
ardl_formula2 <- as.formula(paste("Gold_Price ~", paste(vars_signif, collapse = " + ")))

ardl_model2 <- dynlm(ardl_formula2, data = train_ts)
summary(ardl_model2)

## 
## Time series regression with "numeric" data:
## Start = 1, End = 513
## 
## Call:
## dynlm(formula = ardl_formula2, data = train_ts)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.930  -8.003  -0.039   7.811  52.710 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      53.12780   20.31960   2.615 0.009200 ** 
## Gold_Price_Lag1   0.97226    0.01083  89.798  < 2e-16 ***
## PC1               5.01299    1.54680   3.241 0.001270 ** 
## PC2             -40.50217    3.07149 -13.186  < 2e-16 ***
## PC3               8.58541    2.43380   3.528 0.000458 ***
## PC1_Lag1         -4.32325    1.54635  -2.796 0.005375 ** 
## PC2_Lag1         40.48417    3.05922  13.233  < 2e-16 ***
## PC3_Lag1         -8.14817    2.43686  -3.344 0.000888 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.98 on 505 degrees of freedom
## Multiple R-squared:  0.983,  Adjusted R-squared:  0.9827 
## F-statistic:  4162 on 7 and 505 DF,  p-value: < 2.2e-16

#4. Mô hình Random Forest cho chuỗi thời gian (giải pháp supervised learning)

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Chuẩn bị dữ liệu train (bỏ biến Date)
train_rf <- train_ts %>% dplyr::select(-Date)

# Tạo biến mục tiêu là Gold_Price, input là các biến khác
rf_model <- randomForest(Gold_Price ~ ., data = train_rf, ntree = 500)

print(rf_model)

## 
## Call:
##  randomForest(formula = Gold_Price ~ ., data = train_rf, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 275.1772
##                     % Var explained: 97.56

# Chuẩn bị test tương tự (tạo biến trễ như train)
test_ts <- test %>%
  mutate(
    Gold_Price_Lag1 = lag(train_ts$Gold_Price, 1)[(nrow(train_ts) - nrow(test) + 1):nrow(train_ts)],
    PC1_Lag1 = lag(train_ts$PC1, 1)[(nrow(train_ts) - nrow(test) + 1):nrow(train_ts)],
    PC2_Lag1 = lag(train_ts$PC2, 1)[(nrow(train_ts) - nrow(test) + 1):nrow(train_ts)],
    PC3_Lag1 = lag(train_ts$PC3, 1)[(nrow(train_ts) - nrow(test) + 1):nrow(train_ts)]
  ) %>%
  filter(!is.na(Gold_Price_Lag1))

test_rf <- dplyr::select(test_ts, -Date)

# Dự báo trên test set
pred_rf <- predict(rf_model, newdata = test_rf)

results_rf <- data.frame(Date = test_ts$Date, Actual = test_rf$Gold_Price, Forecast_RF=pred_rf)
print(results_rf)

##         Date Actual Forecast_RF
## 1 2024-01-02 2073.4    2045.892
## 2 2024-01-03 2042.8    2033.943
## 3 2024-01-04 2050.0    2041.663
## 4 2024-01-05 2049.8    2045.494
## 5 2024-01-08 2033.5    2041.540
## 6 2024-01-09 2033.0    2043.762
## 7 2024-01-10 2027.8    2042.507

#5. Một số mô hình học máy khác phù hợp với chuỗi thời gian

#install.packages("xgboost")

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

# Chuyển dữ liệu sang matrix
train_matrix <- train_rf %>%
  dplyr::select(-Gold_Price) %>%
  as.matrix()
train_label <- train_rf$Gold_Price

dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)

params <- list(objective = "reg:squarederror")
xgb_model <- xgb.train(params = params, data = dtrain, nrounds = 100)

# Dự báo test
test_matrix <- if ("Gold_Price" %in% colnames(test_rf)) {
  test_rf %>% dplyr::select(-Gold_Price) %>% as.matrix()
} else {
  as.matrix(test_rf)
}
dtest <- xgb.DMatrix(data = test_matrix)
pred_xgb <- predict(xgb_model, dtest)

results_xgb <- data.frame(Date = test_ts$Date, Actual = test_rf$Gold_Price, Forecast_XGB = pred_xgb)
print(results_xgb)

##         Date Actual Forecast_XGB
## 1 2024-01-02 2073.4     2047.954
## 2 2024-01-03 2042.8     2037.110
## 3 2024-01-04 2050.0     2051.525
## 4 2024-01-05 2049.8     2059.361
## 5 2024-01-08 2033.5     2070.469
## 6 2024-01-09 2033.0     2059.811
## 7 2024-01-10 2027.8     2054.358