0. Chuẩn bị

setwd("/Users/thuphan/Desktop/dataR")
library(rio)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.0     ✔ recipes      1.0.1
## ✔ dials        1.0.0     ✔ rsample      1.0.0
## ✔ dplyr        1.0.9     ✔ tibble       3.1.7
## ✔ ggplot2      3.3.6     ✔ tidyr        1.2.0
## ✔ infer        1.0.2     ✔ tune         1.0.0
## ✔ modeldata    1.0.0     ✔ workflows    1.0.0
## ✔ parsnip      1.0.0     ✔ workflowsets 1.0.0
## ✔ purrr        0.3.4     ✔ yardstick    1.0.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(modeltime)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ stringr 1.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
library(timetk)

1. Tạo dữ liệu

dulieu <-import("day.csv")
dulieu <-dulieu %>% select(dteday, cnt) %>% set_names(c("date","value")) %>% tibble()
dulieu$date = as.Date(dulieu$date) # format cho biến thời gian
dulieu
## # A tibble: 731 × 2
##    date       value
##    <date>     <int>
##  1 2011-01-01   985
##  2 2011-01-02   801
##  3 2011-01-03  1349
##  4 2011-01-04  1562
##  5 2011-01-05  1600
##  6 2011-01-06  1606
##  7 2011-01-07  1510
##  8 2011-01-08   959
##  9 2011-01-09   822
## 10 2011-01-10  1321
## # … with 721 more rows
#Vẽ đồ thị
dulieu %>% plot_time_series(date, value, .interactive = T)
# Chia data cho train và test
chiaDulieu <- dulieu %>% time_series_split(assess = "3 months", cumulative = TRUE)
## Using date_var: date
# Vẽ đồ thị dự phân chi
chiaDulieu %>%
  tk_time_series_cv_plan() %>% # chuyển đổi split sang dataframe
  plot_time_series_cv_plan(date, value, .interactive = T )

2. Xây dựng mô hình

# Đăng ký time serial cho dữ liệu
congthuc <- value~date
ts_congthuc <- recipe(congthuc, data = training(chiaDulieu)) %>% step_timeseries_signature(date) 

bake(prep(ts_congthuc), new_data = training(chiaDulieu))
## Warning: `terms_select()` was deprecated in recipes 0.1.17.
## Please use `recipes_eval_select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 641 × 29
##    date       value date_index.num date_year date_year.iso date_half
##    <date>     <int>          <dbl>     <int>         <int>     <int>
##  1 2011-01-01   985     1293840000      2011          2010         1
##  2 2011-01-02   801     1293926400      2011          2010         1
##  3 2011-01-03  1349     1294012800      2011          2011         1
##  4 2011-01-04  1562     1294099200      2011          2011         1
##  5 2011-01-05  1600     1294185600      2011          2011         1
##  6 2011-01-06  1606     1294272000      2011          2011         1
##  7 2011-01-07  1510     1294358400      2011          2011         1
##  8 2011-01-08   959     1294444800      2011          2011         1
##  9 2011-01-09   822     1294531200      2011          2011         1
## 10 2011-01-10  1321     1294617600      2011          2011         1
## # … with 631 more rows, and 23 more variables: date_quarter <int>,
## #   date_month <int>, date_month.xts <int>, date_month.lbl <ord>,
## #   date_day <int>, date_hour <int>, date_minute <int>, date_second <int>,
## #   date_hour12 <int>, date_am.pm <int>, date_wday <int>, date_wday.xts <int>,
## #   date_wday.lbl <ord>, date_mday <int>, date_qday <int>, date_yday <int>,
## #   date_mweek <int>, date_week <int>, date_week.iso <int>, date_week2 <int>,
## #   date_week3 <int>, date_week4 <int>, date_mday7 <int>
# Phân giải công thúc
ts_congthuc_cuoi <- ts_congthuc %>%
    step_fourier(date, period = 365, K = 5) %>%
    step_rm(date) %>%
    step_rm(contains("iso"), contains("minute"), contains("hour"),
            contains("am.pm"), contains("xts")) %>%
    step_normalize(contains("index.num"), date_year) %>%
    step_dummy(contains("lbl"), one_hot = TRUE) 

juice(prep(ts_congthuc_cuoi))
## # A tibble: 641 × 47
##    value date_index.num date_year date_half date_quarter date_month date_day
##    <int>          <dbl>     <dbl>     <int>        <int>      <int>    <int>
##  1   985          -1.73    -0.869         1            1          1        1
##  2   801          -1.72    -0.869         1            1          1        2
##  3  1349          -1.72    -0.869         1            1          1        3
##  4  1562          -1.71    -0.869         1            1          1        4
##  5  1600          -1.71    -0.869         1            1          1        5
##  6  1606          -1.70    -0.869         1            1          1        6
##  7  1510          -1.70    -0.869         1            1          1        7
##  8   959          -1.69    -0.869         1            1          1        8
##  9   822          -1.68    -0.869         1            1          1        9
## 10  1321          -1.68    -0.869         1            1          1       10
## # … with 631 more rows, and 40 more variables: date_second <int>,
## #   date_wday <int>, date_mday <int>, date_qday <int>, date_yday <int>,
## #   date_mweek <int>, date_week <int>, date_week2 <int>, date_week3 <int>,
## #   date_week4 <int>, date_mday7 <int>, date_sin365_K1 <dbl>,
## #   date_cos365_K1 <dbl>, date_sin365_K2 <dbl>, date_cos365_K2 <dbl>,
## #   date_sin365_K3 <dbl>, date_cos365_K3 <dbl>, date_sin365_K4 <dbl>,
## #   date_cos365_K4 <dbl>, date_sin365_K5 <dbl>, date_cos365_K5 <dbl>, …

3. Thiết lập mô hình

# thiết lập hồi quy
hoiquy<- linear_reg(mode = "regression") %>% set_engine("lm")

# thiết lập quy trình
quytrinh_hoiquy <- workflow() %>%
    add_recipe(ts_congthuc_cuoi) %>%
    add_model(hoiquy)

quytrinh_hoiquy
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
## 
## • step_timeseries_signature()
## • step_fourier()
## • step_rm()
## • step_rm()
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Linear Regression Model Specification (regression)
## 
## Computational engine: lm
# Chạy training
quytrinh_training <- quytrinh_hoiquy %>% fit(data = training(chiaDulieu))
quytrinh_training
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 6 Recipe Steps
## 
## • step_timeseries_signature()
## • step_fourier()
## • step_rm()
## • step_rm()
## • step_normalize()
## • step_dummy()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
##       (Intercept)     date_index.num          date_year          date_half  
##         42903.070          87792.552         -84823.218          -3654.870  
##      date_quarter         date_month           date_day        date_second  
##         76850.908         -38967.341          -1326.627                 NA  
##         date_wday          date_mday          date_qday          date_yday  
##           -40.184                 NA            872.533                 NA  
##        date_mweek          date_week         date_week2         date_week3  
##            28.154            -75.400            -28.011             -5.791  
##        date_week4         date_mday7     date_sin365_K1     date_cos365_K1  
##            56.448            -86.030           -933.970          -1262.472  
##    date_sin365_K2     date_cos365_K2     date_sin365_K3     date_cos365_K3  
##          -653.597           -474.458             40.164           -293.893  
##    date_sin365_K4     date_cos365_K4     date_sin365_K5     date_cos365_K5  
##           104.315              8.885             32.961             19.975  
## date_month.lbl_01  date_month.lbl_02  date_month.lbl_03  date_month.lbl_04  
##          5152.522           2688.407           3586.588           3130.740  
## date_month.lbl_05  date_month.lbl_06  date_month.lbl_07  date_month.lbl_08  
##          2279.616                 NA           5093.154           2882.643  
## date_month.lbl_09  date_month.lbl_10  date_month.lbl_11  date_month.lbl_12  
##                NA           1470.130                 NA                 NA  
##   date_wday.lbl_1    date_wday.lbl_2    date_wday.lbl_3    date_wday.lbl_4  
##          -478.402           -364.533           -198.376           -206.391  
##   date_wday.lbl_5    date_wday.lbl_6    date_wday.lbl_7  
##           -33.502                 NA                 NA
# kiểm tra testing
quytrinh_testing <- quytrinh_training %>% 
    predict(testing(chiaDulieu)) %>%
    bind_cols(testing(chiaDulieu)) 
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
quytrinh_testing
## # A tibble: 90 × 3
##    .pred date       value
##    <dbl> <date>     <int>
##  1 6336. 2012-10-03  7572
##  2 6495. 2012-10-04  7328
##  3 6513. 2012-10-05  8156
##  4 6496. 2012-10-06  7965
##  5 6168. 2012-10-07  3510
##  6 6260. 2012-10-08  5478
##  7 6402. 2012-10-09  6392
##  8 6367. 2012-10-10  7691
##  9 6511. 2012-10-11  7570
## 10 6513. 2012-10-12  7282
## # … with 80 more rows

4. Dự báo với modeltime

# Thiết lập modeltime với quy trình training
mohinh_bang <- modeltime_table( quytrinh_training ) 

mohinh_bang
## # Modeltime Table
## # A tibble: 1 × 3
##   .model_id .model     .model_desc
##       <int> <list>     <chr>      
## 1         1 <workflow> LM
# Hiệu chỉnh quy trình với quy trình testing
hieuchinh_bang <- mohinh_bang %>%  modeltime_calibrate(testing(chiaDulieu))
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
hieuchinh_bang
## # Modeltime Table
## # A tibble: 1 × 5
##   .model_id .model     .model_desc .type .calibration_data
##       <int> <list>     <chr>       <chr> <list>           
## 1         1 <workflow> LM          Test  <tibble [90 × 4]>
# Dự báo phần hiệu chỉnh
hieuchinh_bang %>%
  modeltime_forecast(actual_data = dulieu) %>%
  plot_modeltime_forecast(.interactive = T)
## Using '.calibration_data' to forecast.
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading
# Tính toán các chỉ số chất lượng
hieuchinh_bang %>%
  modeltime_accuracy() %>%
  table_modeltime_accuracy(.interactive = T)
# Dự báo tiếp theo 12 tháng
hieuchinh_bang %>%
  modeltime_refit(dulieu) %>%
  modeltime_forecast(h = "12 months", actual_data = dulieu) %>%
  plot_modeltime_forecast(.interactive = T)
## Warning in predict.lm(object = object$fit, newdata = new_data, type =
## "response"): prediction from a rank-deficient fit may be misleading