機械学習

# CSVデータをウェブサイトから取得する。
d0 <- read.csv('https://stats.dip.jp/01_ds/data/load/load_amedas_y2022_tokyo.csv')

# データの期間を絞る（2022年5月1日～2022年6月30日）
is.term <- '2022-05-01' <= d0$date & d0$date <= '2022-06-30'
d1 <- d0[is.term, ]

n <- nrow(d1)                 # データサイズ
t <- 1:n                      # 経過時間（hour）
px <- as.POSIXct(d1$datetime) # 時間オブジェクトに変換 
y <- d1$mw / 1000             # 測定値 (単位変換：MW -> GW)

d1 <- data.frame(px, t, y) # データフレームにまとめる

head(d1) # データの先頭の5行を表示

library('forecast')

## Warning: パッケージ 'forecast' はバージョン 4.3.3 の R の下で造られました

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

# LOESS（局所回帰）のフィッティング（fitting）
# 平滑化窓幅（span）：30日（トレンドを捉えるのに十分な窓幅を設定する）
fit1 <- loess(y ~ t, # モデル式（データフレームd1のカラム名で表記する）
              data = d1, 
              span = 24 * 30 / n)

# フィット値（fitted values）がトレンドとなる。
# データフレームd1にtrendを追加
d1$trend <- fit1$fitted

# 折れ線プロット（原系列）
matplot(x = d1$px, y = d1$y, type = 'l', pch = 1, col = 2,
        main = '電力需要',
        xlab = '月 日',
        ylab = 'GW')

abline(h = 0, col = 'gray') # x軸線

# 折れ線プロット（トレンド成分）
matlines(x = d1$px, y = d1$trend, pch = 2, col = 4, lwd = 2)

# 凡例（はんれい）
legend('topleft', lty = 1, col = c(2, 4), 
       legend = c('原系列', 'トレンド'))

# 残余成分（トレンド除去成分）計算
# データフレームd1にdetrendedを追加
d1$detrended <- d1$y - d1$trend

# 折れ線プロット（残余成分）
matplot(d1$px, d1$detrended, type = 'l', pch = 1, col = 2,
        main = '残余成分（原系列－トレンド）',
        xlab = '月 日',
        ylab = 'GW')

abline(h = 0, col = 'gray') # x軸線

fit2 <- loess(detrended ~ t, 
              data = d1, # モデル式（データフレームd1のカラム名で表記する） 
              span = 24 * 4 / n)

# フィット値（fitted values）が周期成分となる。
# データフレームd1にseasonalを追加
d1$seasonal <- fit2$fitted

# 折れ線プロット（トレンド除去成分・周期成分）
matplot(x = d1$px, y = d1$detrended, type = 'l', pch = 1, col = 2, ylim = c(-15, 15),
        main = '残余成分（トレンド除去後の成分）・周期成分',
        xlab = '月 日',
        ylab = 'GW')

# 折れ線プロット（周期成分）
matlines(x = d1$px, y = d1$seasonal, pch = 2, col = 4, lwd = 2)

abline(h = 0, col = 'gray') # x軸線

# 残余成分を計算（トレンド除去成分－周期成分）
# データフレームd1にremainderを追加
d1$remainder <- d1$detrended - d1$seasonal

# 折れ線プロット（残余成分）
matplot(d1$px, d1$remainder, type = 'l', pch = 1, col = 2, ylim = c(-15, 15),
        main = '残余成分（トレンド，周期成分除去後の成分）',
        xlab = '月 日',
        ylab = 'GW')

abline(h = 0, col = 'gray') # x軸線

s <- mstl(x = msts(y, seasonal.period = c(24, 24*7)),
          s.window = 'periodic')

autoplot(s, main = '周期成分：時間変動なし')

s <- mstl(x = msts(y, seasonal.period = c(24, 24*7)))

autoplot(s, main = '周期成分：時間変動あり')

PERIOD <- c(24, 24*7) # 周期変動成分の周期（単位：hour）
n.p <- length(PERIOD) # 時間数

#
# データ取得
#
# 【注意】訓練データとテストデータの期間は連続していること。
d0 <- read.csv('https://stats.dip.jp/01_ds/data/load/load_amedas_y2022_tokyo.csv')


# 訓練データ
d.tr <- d0['2022-05-01' <= d0$date & d0$date <= '2022-06-30', ]

# テストデータ（予測区間の正解データとして精度評価に利用）
d.te <- d0['2022-07-01' <= d0$date & d0$date <= '2022-07-03', ]

# データサイズ
n.tr <- nrow(d.tr) # 訓練期間データサイズ
n.te <- nrow(d.te) # 試験期間データサイズ
n.al <- n.tr + n.te # 全（訓練＋試験）期間データサイズ

# 範囲インデックス
ii.tr <- 1:n.tr
ii.te <- (n.tr + 1):(n.tr + n.te)
ii.al <- 1:(n.tr + n.te)

# 測定値 (単位変換：MW -> GW)
y.tr <- d.tr$mw / 1000
y.te <- d.te$mw / 1000
y.al <- c(y.tr, y.te)

# タイムスタンプ
px.tr <- as.POSIXct(d.tr$datetime)
px.te <- as.POSIXct(d.te$datetime)
px.al <- c(px.tr, px.te)

fit.stl <- mstl(x = msts(y.tr, seasonal.period = PERIOD))

# 周期成分に時間変化を許さないときは次を使う。上との違いを確認すること。
#fit.stl <- mstl(x = msts(y.tr, seasonal.period = PERIOD), s.window = 'periodic')

autoplot(fit.stl)

#
# トレンド成分，周期変動成分の格納
#
stl.al <- matrix(NA, nrow = n.al, ncol = n.p + 3,
                 dimnames = list(ii.al, colnames(fit.stl)))

# トレンド成分（適合値，予測値）の格納
fit.trend <- loess(y ~ x, span = 0.1,
                   data = data.frame(x = ii.tr, y = fit.stl[, 'Trend']),
                   control = loess.control(surface = 'direct'))
# 予測（LOESS）
pred.trend <- predict(fit.trend, newdata = data.frame(x = ii.te))

stl.al[, 'Trend'] <- c(fit.stl[, 'Trend'], pred.trend)

# 周期変動成分（適合値，予測値）の格納
for (j in seq_along(PERIOD))
{
  stl.al[ii.tr, 2 + j] <- fit.stl[, 2 + j]

  for (i in 1:n.te)
  {
    # 予測値は前周期の繰り返し
    stl.al[n.tr + i, 2 + j] <- stl.al[n.tr + i - PERIOD[j], 2 + j]
  }
}

# 残余成分（予測期間は値なし）
stl.al[, 'Remainder'] <- c(fit.stl[, 'Remainder'], rep(NA, n.te))

# 残余成分以外（トレンド成分＋周期成分）で原系列の適合値，予測値を計算する。
yhat.al <- stl.al[, 'Data'] <- rowSums(stl.al[, 2:(n.p+2)])
yhat.te <- yhat.al[ii.te]

head(stl.al)

##       Data    Trend Seasonal24 Seasonal168 Remainder
## 1 20.20017 24.11917  -3.190286  -0.7287139  1.419826
## 2 19.36617 24.12002  -4.019563  -0.7342915  1.283830
## 3 19.39058 24.12088  -3.960578  -0.7697210  1.219423
## 4 19.69813 24.12173  -3.580835  -0.8427612  1.111870
## 5 19.74336 24.12258  -3.420685  -0.9585347  1.116641
## 6 19.19200 24.12343  -3.639697  -1.2917366  1.528005

# 誤差分散の推定
#【注意】
# データへの適合誤差（fitting error）≒ 予測誤差という考え方で推定。
# 正確には１期先予測誤差系列から推定する必要がある。
v <- var(fit.stl[, 'Remainder']) # 不偏分散（誤差分散の推定値）
vs <- cumsum(rep(v, n.te))       # 累積和（時間経過ともに誤差拡大）
se <- sqrt(vs)                   # 不偏分散の平方根（標準誤差）
yhat.te.low <- yhat.te - 1.96 * se
yhat.te.upp <- yhat.te + 1.96 * se

#
# グラフ
#

# 画面表示範囲
range.px <- c(px.tr[n.tr - 24*7*3], px.te[n.te])

# トップパネル
par(mfrow = c(n.p + 2, 1), mar = c(2, 3, 1, 0))

matplot (x = px.al[ii.al],
         y = rep(mean(y.tr), n.al),
         type = 'n', xlim = range.px, ylim = c(20, 80))

abline(v = px.te[1], col = 3, lty = 2) # 予測開始線

grid()

matlines(x = px.al[ii.tr], y = y.tr,              type = 'o', pch = 16, col = 2, lty = 1) # 測定値
matlines(x = px.al[ii.al], y = stl.al[, 'Trend'], type = 'o', pch = NA, col = 4, lty = 1) # トレンド成分値
matlines(x = px.al[ii.te], y = y.te,              type = 'o', pch = 16, col = 2, lty = 1) # 予測正解値
matlines(x = px.al[ii.te], y = yhat.te,           type = 'o', pch =  1, col = 4, lty = 2) # 予測値
matlines(x = px.al[ii.te], y = yhat.te.low, col = 'lightgray') # 95%予測区間下限
matlines(x = px.al[ii.te], y = yhat.te.upp, col = 'lightgray') # 95%予測区間上限

# 凡例
legend('top', horiz = T, bty = 'n', 
       lty = c(1, 2, 1), pch = c(16, 1, NA, NA), col = c(2, 4, 'lightgray', 4),
       legend = c('原系列', '予測値', '95%予測区間', 'トレンド成分'))
legend('topright', bty = 'n', 
       legend = sprintf('MAPE: %2.1f%%', mean(abs((yhat.te - y.te)/y.te))*100))

# 周期成分パネル
for (j in 3:(n.p + 2))
{
  matplot(x = px.al, y = stl.al[, j],
          type = 'l', col = 4, xlim = range.px, ylim = c(-30, 30),
          main = paste0(PERIOD[j-2], '周期成分'))
  abline(v = px.te[1], col = 3, lty = 2) # 予測開始線
  grid()
}

# 残余成分パネル
matplot (x = px.al, y = stl.al[, 'Remainder'], 
         type = 'l', col = 4, xlim = range.px, ylim = c(-30, 30),
         main = '残余成分')
abline(v = px.te[1], col = 3, lty = 2) # 予測開始線
grid()

px <- seq(as.POSIXct('1949-01-01'),
          as.POSIXct('1960-12-01'), by = 'month')

d0 <- data.frame(px, y = AirPassengers)

head(d0)

library(forecast)

# 時系列データの作成
ts_data <- ts(AirPassengers, start = c(1949, 1), frequency = 12)

# msts関数で複数の季節性成分を設定
y <- ts_data
s <- mstl(x = msts(y, seasonal.period = c(24, 24*7)), s.window = 'periodic')

## Warning in mstl(x = msts(y, seasonal.period = c(24, 24 * 7)), s.window =
## "periodic"): Dropping seasonal components with fewer than two full periods.

# 結果のプロット
autoplot(s, main = '周期成分：時間変動なし')

機械学習

演習課題

22150007

2024-11-26