0. Chuẩn bị
setwd("/Users/thuphan/Desktop/dataR")
library(rio)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.0 ✔ recipes 1.0.1
## ✔ dials 1.0.0 ✔ rsample 1.0.0
## ✔ dplyr 1.0.9 ✔ tibble 3.1.7
## ✔ ggplot2 3.3.6 ✔ tidyr 1.2.0
## ✔ infer 1.0.2 ✔ tune 1.0.0
## ✔ modeldata 1.0.0 ✔ workflows 1.0.0
## ✔ parsnip 1.0.0 ✔ workflowsets 1.0.0
## ✔ purrr 0.3.4 ✔ yardstick 1.0.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(modeltime)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ stringr::fixed() masks recipes::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ readr::spec() masks yardstick::spec()
library(timetk)
1. Tạo dữ liệu
dulieu <-import("day.csv")
dulieu <-dulieu %>% select(dteday, cnt) %>% set_names(c("date","value")) %>% tibble()
dulieu$date = as.Date(dulieu$date) # format cho biến thời gian
dulieu
## # A tibble: 731 × 2
## date value
## <date> <int>
## 1 2011-01-01 985
## 2 2011-01-02 801
## 3 2011-01-03 1349
## 4 2011-01-04 1562
## 5 2011-01-05 1600
## 6 2011-01-06 1606
## 7 2011-01-07 1510
## 8 2011-01-08 959
## 9 2011-01-09 822
## 10 2011-01-10 1321
## # … with 721 more rows
#Vẽ đồ thị
dulieu %>% plot_time_series(date, value, .interactive = T)
# Chia data cho train và test
chiaDulieu <- dulieu %>% time_series_split(assess = "3 months", cumulative = TRUE)
## Using date_var: date
# Vẽ đồ thị dự phân chi
chiaDulieu %>%
tk_time_series_cv_plan() %>% # chuyển đổi split sang dataframe
plot_time_series_cv_plan(date, value, .interactive = T )
2. Xây dựng mô hình
# Đăng ký time serial cho dữ liệu
congthuc <- value~date
ts_congthuc <- recipe(congthuc, data = training(chiaDulieu)) %>% step_timeseries_signature(date)
bake(prep(ts_congthuc), new_data = training(chiaDulieu))
## Warning: `terms_select()` was deprecated in recipes 0.1.17.
## Please use `recipes_eval_select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 641 × 29
## date value date_index.num date_year date_year.iso date_half
## <date> <int> <dbl> <int> <int> <int>
## 1 2011-01-01 985 1293840000 2011 2010 1
## 2 2011-01-02 801 1293926400 2011 2010 1
## 3 2011-01-03 1349 1294012800 2011 2011 1
## 4 2011-01-04 1562 1294099200 2011 2011 1
## 5 2011-01-05 1600 1294185600 2011 2011 1
## 6 2011-01-06 1606 1294272000 2011 2011 1
## 7 2011-01-07 1510 1294358400 2011 2011 1
## 8 2011-01-08 959 1294444800 2011 2011 1
## 9 2011-01-09 822 1294531200 2011 2011 1
## 10 2011-01-10 1321 1294617600 2011 2011 1
## # … with 631 more rows, and 23 more variables: date_quarter <int>,
## # date_month <int>, date_month.xts <int>, date_month.lbl <ord>,
## # date_day <int>, date_hour <int>, date_minute <int>, date_second <int>,
## # date_hour12 <int>, date_am.pm <int>, date_wday <int>, date_wday.xts <int>,
## # date_wday.lbl <ord>, date_mday <int>, date_qday <int>, date_yday <int>,
## # date_mweek <int>, date_week <int>, date_week.iso <int>, date_week2 <int>,
## # date_week3 <int>, date_week4 <int>, date_mday7 <int>
# Phân giải công thúc
ts_congthuc_cuoi <- ts_congthuc %>%
step_fourier(date, period = 365, K = 5) %>%
step_rm(date) %>%
step_rm(contains("iso"), contains("minute"), contains("hour"),
contains("am.pm"), contains("xts")) %>%
step_normalize(contains("index.num"), date_year) %>%
step_dummy(contains("lbl"), one_hot = TRUE)
juice(prep(ts_congthuc_cuoi))
## # A tibble: 641 × 47
## value date_index.num date_year date_half date_quarter date_month date_day
## <int> <dbl> <dbl> <int> <int> <int> <int>
## 1 985 -1.73 -0.869 1 1 1 1
## 2 801 -1.72 -0.869 1 1 1 2
## 3 1349 -1.72 -0.869 1 1 1 3
## 4 1562 -1.71 -0.869 1 1 1 4
## 5 1600 -1.71 -0.869 1 1 1 5
## 6 1606 -1.70 -0.869 1 1 1 6
## 7 1510 -1.70 -0.869 1 1 1 7
## 8 959 -1.69 -0.869 1 1 1 8
## 9 822 -1.68 -0.869 1 1 1 9
## 10 1321 -1.68 -0.869 1 1 1 10
## # … with 631 more rows, and 40 more variables: date_second <int>,
## # date_wday <int>, date_mday <int>, date_qday <int>, date_yday <int>,
## # date_mweek <int>, date_week <int>, date_week2 <int>, date_week3 <int>,
## # date_week4 <int>, date_mday7 <int>, date_sin365_K1 <dbl>,
## # date_cos365_K1 <dbl>, date_sin365_K2 <dbl>, date_cos365_K2 <dbl>,
## # date_sin365_K3 <dbl>, date_cos365_K3 <dbl>, date_sin365_K4 <dbl>,
## # date_cos365_K4 <dbl>, date_sin365_K5 <dbl>, date_cos365_K5 <dbl>, …
3. Thiết lập mô hình
# thiết lập hồi quy
hoiquy<- linear_reg(mode = "regression") %>% set_engine("lm")
# thiết lập quy trình
quytrinh_hoiquy <- workflow() %>%
add_recipe(ts_congthuc_cuoi) %>%
add_model(hoiquy)
quytrinh_hoiquy
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_timeseries_signature()
## • step_fourier()
## • step_rm()
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
# Chạy training
quytrinh_training <- quytrinh_hoiquy %>% fit(data = training(chiaDulieu))
quytrinh_training
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
##
## • step_timeseries_signature()
## • step_fourier()
## • step_rm()
## • step_rm()
## • step_normalize()
## • step_dummy()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) date_index.num date_year date_half
## 42903.070 87792.552 -84823.218 -3654.870
## date_quarter date_month date_day date_second
## 76850.908 -38967.341 -1326.627 NA
## date_wday date_mday date_qday date_yday
## -40.184 NA 872.533 NA
## date_mweek date_week date_week2 date_week3
## 28.154 -75.400 -28.011 -5.791
## date_week4 date_mday7 date_sin365_K1 date_cos365_K1
## 56.448 -86.030 -933.970 -1262.472
## date_sin365_K2 date_cos365_K2 date_sin365_K3 date_cos365_K3
## -653.597 -474.458 40.164 -293.893
## date_sin365_K4 date_cos365_K4 date_sin365_K5 date_cos365_K5
## 104.315 8.885 32.961 19.975
## date_month.lbl_01 date_month.lbl_02 date_month.lbl_03 date_month.lbl_04
## 5152.522 2688.407 3586.588 3130.740
## date_month.lbl_05 date_month.lbl_06 date_month.lbl_07 date_month.lbl_08
## 2279.616 NA 5093.154 2882.643
## date_month.lbl_09 date_month.lbl_10 date_month.lbl_11 date_month.lbl_12
## NA 1470.130 NA NA
## date_wday.lbl_1 date_wday.lbl_2 date_wday.lbl_3 date_wday.lbl_4
## -478.402 -364.533 -198.376 -206.391
## date_wday.lbl_5 date_wday.lbl_6 date_wday.lbl_7
## -33.502 NA NA
# kiểm tra testing
quytrinh_testing <- quytrinh_training %>%
predict(testing(chiaDulieu)) %>%
bind_cols(testing(chiaDulieu))
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
quytrinh_testing
## # A tibble: 90 × 3
## .pred date value
## <dbl> <date> <int>
## 1 6336. 2012-10-03 7572
## 2 6495. 2012-10-04 7328
## 3 6513. 2012-10-05 8156
## 4 6496. 2012-10-06 7965
## 5 6168. 2012-10-07 3510
## 6 6260. 2012-10-08 5478
## 7 6402. 2012-10-09 6392
## 8 6367. 2012-10-10 7691
## 9 6511. 2012-10-11 7570
## 10 6513. 2012-10-12 7282
## # … with 80 more rows
4. Dự báo với modeltime
# Thiết lập modeltime với quy trình training
mohinh_bang <- modeltime_table( quytrinh_training )
mohinh_bang
## # Modeltime Table
## # A tibble: 1 × 3
## .model_id .model .model_desc
## <int> <list> <chr>
## 1 1 <workflow> LM
# Hiệu chỉnh quy trình với quy trình testing
hieuchinh_bang <- mohinh_bang %>% modeltime_calibrate(testing(chiaDulieu))
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
hieuchinh_bang
## # Modeltime Table
## # A tibble: 1 × 5
## .model_id .model .model_desc .type .calibration_data
## <int> <list> <chr> <chr> <list>
## 1 1 <workflow> LM Test <tibble [90 × 4]>
# Dự báo phần hiệu chỉnh
hieuchinh_bang %>%
modeltime_forecast(actual_data = dulieu) %>%
plot_modeltime_forecast(.interactive = T)
## Using '.calibration_data' to forecast.
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
# Tính toán các chỉ số chất lượng
hieuchinh_bang %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(.interactive = T)
# Dự báo tiếp theo 12 tháng
hieuchinh_bang %>%
modeltime_refit(dulieu) %>%
modeltime_forecast(h = "12 months", actual_data = dulieu) %>%
plot_modeltime_forecast(.interactive = T)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading