0.0.1 Datacamp 線上課程

Online course on forecasting using R

0.0.2 Forecasting: principles and practice

Innovations state space models for exponential smoothing


1 來店人數資料

library(forecast)
library(tidyverse)
library(lubridate)
traffic_data <- read.csv('C:\\Users\\User\\Desktop\\time series\\thesis\\TimeSeriesData.csv',h=T)

1.0.1 dmy : 將格式為 日-月-年 的字串轉成日期格式

  • ex : “01-01-2007” or “1-Jan-07” 轉成 “2007-01-01”
traffic_data$date2 <- dmy(traffic_data$date2)

1.0.2 新增 newdate 變數做為新的資料

newdate <- (unique(traffic_data$date2) %>% rep(24))
newdate <- data.frame("date" = ymd(newdate))
  • 由於 dshw 不接受’值等於0’的data,所以最小值設為1
newdate$traffic <- rep(1,8760)
head(newdate)

1.0.3 新增 hour (0 ~ 23)

  • 每個小時重複365次
xx <- c()
for(i in 0:23) { xx <- c(xx,rep(i,365)) }
newdate$hour <- xx
head(newdate)

1.0.4 將原始資料轉進 newdate

newdate$ymdh <- paste(newdate$date,newdate$hour,sep = "-") %>% ymd_h()

traffic_data$ymdh <- paste(traffic_data$date2,traffic_data$hour,sep = "-") %>% ymd_h()

newdate$traffic[match(traffic_data$ymdh,newdate$ymdh)] <- traffic_data$traffic

newdate$traffic[which(newdate$traffic==0)] <- 1

newdate <- newdate[order(newdate$ymdh,newdate$hour),]
head(newdate)

1.0.5 加入 week 週數 (1~52) , 12/31 放到 53 周

newdate$week <- lubridate::week(newdate$date)
head(newdate)

1.0.6 加入星期 dow (day of the week) : 星期日 = 0 , 星期一~六 = 1~6

newdate$dow <- sapply(newdate$date,weekdays)
head(newdate)

1.0.7 三天休假設col=3,休假隔日設col=2

newdate$groupcol <- 1
newdate$groupcol[which(newdate$date==ymd("2007/04/08"))] <- 3
newdate$groupcol[which(newdate$date==ymd("2007/11/22"))] <- 3
newdate$groupcol[which(newdate$date==ymd("2007/12/25"))] <- 3

newdate$groupcol[which(newdate$date==ymd("2007/04/09"))] <- 2
newdate$groupcol[which(newdate$date==ymd("2007/11/23"))] <- 2
newdate$groupcol[which(newdate$date==ymd("2007/12/26"))] <- 2

head(newdate)

1.0.8 插補

w1=NULL;w2=NULL;w3=NULL;q1=NULL;q2=NULL;q3=NULL
for(j in 0:23) {
  z1 = newdate$traffic[which((newdate$dow == "星期日") & newdate$hour == j & newdate$week > 8 & newdate$week < 14)]
  z2 = newdate$traffic[which((newdate$dow == "星期四") & newdate$hour == j & newdate$week > 41 & newdate$week < 47)]
  z3 = newdate$traffic[which((newdate$dow == "星期二") & newdate$hour == j & newdate$week > 46 & newdate$week < 52)]

  x1 = newdate$traffic[which((newdate$dow == "星期一") & newdate$hour == j & newdate$week > 9 & newdate$week < 15)]
  x2 = newdate$traffic[which((newdate$dow == "星期五") & newdate$hour == j & newdate$week > 41 & newdate$week < 47)]
  x3 = newdate$traffic[which((newdate$dow == "星期三") & newdate$hour == j & newdate$week > 46 & newdate$week < 52)]

  w1=cbind(w1,z1)
  w2=cbind(w2,z2)
  w3=cbind(w3,z3)
  
  q1=cbind(q1,x1)
  q2=cbind(q2,x2)
  q3=cbind(q3,x3)
}

newdate$traffic[which(newdate$date==ymd("2007/04/08"))] <- apply(w1,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/11/22"))] <- apply(w2,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/12/25"))] <- apply(w3,2,mean)

newdate$traffic[which(newdate$date==ymd("2007/04/09"))] <- apply(q1,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/11/23"))] <- apply(q2,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/12/26"))] <- apply(q3,2,mean)
par(mfcol = c(2,1))
plot(newdate1[,c(4,2)],type = "h",col = as.numeric(newdate$groupcol),main = "插捕前")
plot(newdate[,c(4,2)],type = "h",col = as.numeric(newdate$groupcol),main = "插補後")

1.0.9 msts 可將資料設定為多季節時間序列

tts <- msts(newdate$traffic,seasonal.periods = c(24,168),start = c(2007,1))

1.0.10 季節圖

ggseasonplot(tts)+
  theme(legend.position = "none")

1.0.11 插補後資料 acf pacf 圖

tts %>% ggAcf()

tts %>% ggPacf()

1.0.12 將整理好的資料丟進 dshw() 函數

ttds <- dshw(tts)
autoplot(ttds,xlim = c(2055,2062))

1.0.13 dshw 模型殘差 acf pacf 圖

ttds$residuals %>% ggAcf()

ttds$residuals %>% ggPacf()


2 kaggle data

train <- read.csv("C:/Users/User/Desktop/Rossmann Store Sales/train/train.csv",sep = ",")
str(train)
## 'data.frame':    1017209 obs. of  9 variables:
##  $ Store        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek    : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Date         : Factor w/ 942 levels "2013-01-01","2013-01-02",..: 942 942 942 942 942 942 942 942 942 942 ...
##  $ Sales        : int  5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
##  $ Customers    : int  555 625 821 1498 559 589 1414 833 687 681 ...
##  $ Open         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Promo        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SchoolHoliday: int  1 1 1 1 1 1 1 1 1 1 ...
###  sales data 
library(reshape2)
train.1 <- dcast(train[,-c(2,5:9)],Store~Date,sum)
train.2 <- train.1[,-1]

train.total <- colSums(train.2)
train.total.ts <- ts(t(train.2[1,]),frequency = 7)

str(train.total.ts)
##  Time-Series [1:942, 1] from 1 to 135: 0 5530 4327 4486 4997 0 7176 5580 5471 4892 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr "1"
plot(x=c(1:942),y=train.2[2,],type = "l")

HoltWinters(train.total.ts,gamma = FALSE) %>% forecast() %>% plot()

seasonplot(train.total.ts)

ggsubseriesplot(train.total.ts)

2.0.1 customers data

train.customer <- dcast(train[,-c(2,4,6:9)],Store~Date,sum)
train.customer.1 <- train.customer[,-1]
train.customer.1.ts <- ts(t(train.customer.1[1,]),frequency = 7)
HoltWinters(train.customer.1.ts) %>% forecast() %>% plot()

seasonplot(train.customer.1.ts)

ggsubseriesplot(train.customer.1.ts)

ggseasonplot(train.customer.1.ts)+theme(legend.position = "none")

gglagplot(train.customer.1.ts)

ggAcf(train.customer.1.ts)

findfrequency(train.customer.1.ts)
## [1] 4

3 Python

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
print(df.head())
##     stid                     store_nm      date  month  day  year  \
## 0  60609  0006 - San Francisco Centre  1-Jan-07      1    1  2007   
## 1  60609  0006 - San Francisco Centre  1-Jan-07      1    1  2007   
## 2  60609  0006 - San Francisco Centre  1-Jan-07      1    1  2007   
## 3  60609  0006 - San Francisco Centre  1-Jan-07      1    1  2007   
## 4  60609  0006 - San Francisco Centre  1-Jan-07      1    1  2007   
## 
##   hours_ending  traffic  hour     date2  dow  dow.1 Unnamed: 12  
## 0        10:00        4    10  1-Jan-07    1    1.0         Mon  
## 1        11:00       17    11  1-Jan-07    1    2.0         Tue  
## 2        12:00       56    12  1-Jan-07    1    3.0         Wed  
## 3        13:00       83    13  1-Jan-07    1    4.0         Thu  
## 4        14:00       77    14  1-Jan-07    1    5.0         Fri
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
print(df.describe())
##           stid        month          day    year      traffic         hour  \
## count   3850.0  3850.000000  3850.000000  3850.0  3850.000000  3850.000000   
## mean   60609.0     6.571169    15.825974  2007.0    93.609091    14.934545   
## std        0.0     3.461663     8.794460     0.0    67.943607     3.221836   
## min    60609.0     1.000000     1.000000  2007.0     0.000000     0.000000   
## 25%    60609.0     4.000000     8.000000  2007.0    50.000000    12.000000   
## 50%    60609.0     7.000000    16.000000  2007.0    81.000000    15.000000   
## 75%    60609.0    10.000000    23.000000  2007.0   123.000000    18.000000   
## max    60609.0    12.000000    31.000000  2007.0   540.000000    23.000000   
## 
##                dow     dow.1  
## count  3850.000000  7.000000  
## mean      3.160000  3.000000  
## std       1.934994  2.160247  
## min       0.000000  0.000000  
## 25%       1.000000  1.500000  
## 50%       3.000000  3.000000  
## 75%       5.000000  4.500000  
## max       6.000000  6.000000
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
plt.plot(df.traffic)
# plt.savefig('data.png',dpi=600)