來店人數資料
library(forecast)
library(tidyverse)
library(lubridate)
traffic_data <- read.csv('C:\\Users\\User\\Desktop\\time series\\thesis\\TimeSeriesData.csv',h=T)
dmy : 將格式為 日-月-年 的字串轉成日期格式
- ex : “01-01-2007” or “1-Jan-07” 轉成 “2007-01-01”
traffic_data$date2 <- dmy(traffic_data$date2)
新增 newdate 變數做為新的資料
newdate <- (unique(traffic_data$date2) %>% rep(24))
newdate <- data.frame("date" = ymd(newdate))
- 由於 dshw 不接受’值等於0’的data,所以最小值設為1
newdate$traffic <- rep(1,8760)
head(newdate)
新增 hour (0 ~ 23)
xx <- c()
for(i in 0:23) { xx <- c(xx,rep(i,365)) }
newdate$hour <- xx
head(newdate)
將原始資料轉進 newdate
newdate$ymdh <- paste(newdate$date,newdate$hour,sep = "-") %>% ymd_h()
traffic_data$ymdh <- paste(traffic_data$date2,traffic_data$hour,sep = "-") %>% ymd_h()
newdate$traffic[match(traffic_data$ymdh,newdate$ymdh)] <- traffic_data$traffic
newdate$traffic[which(newdate$traffic==0)] <- 1
newdate <- newdate[order(newdate$ymdh,newdate$hour),]
head(newdate)
加入 week 週數 (1~52) , 12/31 放到 53 周
newdate$week <- lubridate::week(newdate$date)
head(newdate)
加入星期 dow (day of the week) : 星期日 = 0 , 星期一~六 = 1~6
newdate$dow <- sapply(newdate$date,weekdays)
head(newdate)
三天休假設col=3,休假隔日設col=2
newdate$groupcol <- 1
newdate$groupcol[which(newdate$date==ymd("2007/04/08"))] <- 3
newdate$groupcol[which(newdate$date==ymd("2007/11/22"))] <- 3
newdate$groupcol[which(newdate$date==ymd("2007/12/25"))] <- 3
newdate$groupcol[which(newdate$date==ymd("2007/04/09"))] <- 2
newdate$groupcol[which(newdate$date==ymd("2007/11/23"))] <- 2
newdate$groupcol[which(newdate$date==ymd("2007/12/26"))] <- 2
head(newdate)
插補
w1=NULL;w2=NULL;w3=NULL;q1=NULL;q2=NULL;q3=NULL
for(j in 0:23) {
z1 = newdate$traffic[which((newdate$dow == "星期日") & newdate$hour == j & newdate$week > 8 & newdate$week < 14)]
z2 = newdate$traffic[which((newdate$dow == "星期四") & newdate$hour == j & newdate$week > 41 & newdate$week < 47)]
z3 = newdate$traffic[which((newdate$dow == "星期二") & newdate$hour == j & newdate$week > 46 & newdate$week < 52)]
x1 = newdate$traffic[which((newdate$dow == "星期一") & newdate$hour == j & newdate$week > 9 & newdate$week < 15)]
x2 = newdate$traffic[which((newdate$dow == "星期五") & newdate$hour == j & newdate$week > 41 & newdate$week < 47)]
x3 = newdate$traffic[which((newdate$dow == "星期三") & newdate$hour == j & newdate$week > 46 & newdate$week < 52)]
w1=cbind(w1,z1)
w2=cbind(w2,z2)
w3=cbind(w3,z3)
q1=cbind(q1,x1)
q2=cbind(q2,x2)
q3=cbind(q3,x3)
}
newdate$traffic[which(newdate$date==ymd("2007/04/08"))] <- apply(w1,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/11/22"))] <- apply(w2,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/12/25"))] <- apply(w3,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/04/09"))] <- apply(q1,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/11/23"))] <- apply(q2,2,mean)
newdate$traffic[which(newdate$date==ymd("2007/12/26"))] <- apply(q3,2,mean)
par(mfcol = c(2,1))
plot(newdate1[,c(4,2)],type = "h",col = as.numeric(newdate$groupcol),main = "插捕前")
plot(newdate[,c(4,2)],type = "h",col = as.numeric(newdate$groupcol),main = "插補後")

msts 可將資料設定為多季節時間序列
tts <- msts(newdate$traffic,seasonal.periods = c(24,168),start = c(2007,1))
季節圖
ggseasonplot(tts)+
theme(legend.position = "none")

插補後資料 acf pacf 圖


將整理好的資料丟進 dshw() 函數
ttds <- dshw(tts)
autoplot(ttds,xlim = c(2055,2062))

dshw 模型殘差 acf pacf 圖
ttds$residuals %>% ggAcf()

ttds$residuals %>% ggPacf()

kaggle data
train <- read.csv("C:/Users/User/Desktop/Rossmann Store Sales/train/train.csv",sep = ",")
str(train)
## 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : Factor w/ 942 levels "2013-01-01","2013-01-02",..: 942 942 942 942 942 942 942 942 942 942 ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo : int 1 1 1 1 1 1 1 1 1 1 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday: int 1 1 1 1 1 1 1 1 1 1 ...
### sales data
library(reshape2)
train.1 <- dcast(train[,-c(2,5:9)],Store~Date,sum)
train.2 <- train.1[,-1]
train.total <- colSums(train.2)
train.total.ts <- ts(t(train.2[1,]),frequency = 7)
str(train.total.ts)
## Time-Series [1:942, 1] from 1 to 135: 0 5530 4327 4486 4997 0 7176 5580 5471 4892 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr "1"
plot(x=c(1:942),y=train.2[2,],type = "l")

HoltWinters(train.total.ts,gamma = FALSE) %>% forecast() %>% plot()

seasonplot(train.total.ts)

ggsubseriesplot(train.total.ts)

customers data
train.customer <- dcast(train[,-c(2,4,6:9)],Store~Date,sum)
train.customer.1 <- train.customer[,-1]
train.customer.1.ts <- ts(t(train.customer.1[1,]),frequency = 7)
HoltWinters(train.customer.1.ts) %>% forecast() %>% plot()

seasonplot(train.customer.1.ts)

ggsubseriesplot(train.customer.1.ts)

ggseasonplot(train.customer.1.ts)+theme(legend.position = "none")

gglagplot(train.customer.1.ts)

ggAcf(train.customer.1.ts)

findfrequency(train.customer.1.ts)
## [1] 4
Python
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
print(df.head())
## stid store_nm date month day year \
## 0 60609 0006 - San Francisco Centre 1-Jan-07 1 1 2007
## 1 60609 0006 - San Francisco Centre 1-Jan-07 1 1 2007
## 2 60609 0006 - San Francisco Centre 1-Jan-07 1 1 2007
## 3 60609 0006 - San Francisco Centre 1-Jan-07 1 1 2007
## 4 60609 0006 - San Francisco Centre 1-Jan-07 1 1 2007
##
## hours_ending traffic hour date2 dow dow.1 Unnamed: 12
## 0 10:00 4 10 1-Jan-07 1 1.0 Mon
## 1 11:00 17 11 1-Jan-07 1 2.0 Tue
## 2 12:00 56 12 1-Jan-07 1 3.0 Wed
## 3 13:00 83 13 1-Jan-07 1 4.0 Thu
## 4 14:00 77 14 1-Jan-07 1 5.0 Fri
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
print(df.describe())
## stid month day year traffic hour \
## count 3850.0 3850.000000 3850.000000 3850.0 3850.000000 3850.000000
## mean 60609.0 6.571169 15.825974 2007.0 93.609091 14.934545
## std 0.0 3.461663 8.794460 0.0 67.943607 3.221836
## min 60609.0 1.000000 1.000000 2007.0 0.000000 0.000000
## 25% 60609.0 4.000000 8.000000 2007.0 50.000000 12.000000
## 50% 60609.0 7.000000 16.000000 2007.0 81.000000 15.000000
## 75% 60609.0 10.000000 23.000000 2007.0 123.000000 18.000000
## max 60609.0 12.000000 31.000000 2007.0 540.000000 23.000000
##
## dow dow.1
## count 3850.000000 7.000000
## mean 3.160000 3.000000
## std 1.934994 2.160247
## min 0.000000 0.000000
## 25% 1.000000 1.500000
## 50% 3.000000 3.000000
## 75% 5.000000 4.500000
## max 6.000000 6.000000
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df = pd.read_csv('C:/Users/User/Desktop/time series/thesis/TimeSeriesData.csv')
plt.plot(df.traffic)
# plt.savefig('data.png',dpi=600)
