\[ 在R中调用X13来做季节调整并预测 \]

问题描述与摘要

要处理的数据是10个地区2005年到2013年的快消品消费量,是时间序列问题。但是消费非常容易受到节假日的影响,因此需要做节假日的调整。常见的调整有 trading days、 holidays。但是holidays中最大的问题就是中国春节。春节是按照农历来过的,每年都会不一样,这样就给分析带来了麻烦。
X13提供插入节假日调整因子的参数,用genhol产生调整系数,导入X13进行回归为解决这个问题提供了可能。
本文首先处理数据并导入R中转化为时间序列,然后截取2005到2012年的数据建模,用2013年的数据来检验预测的好坏。先采用默认X13方法处理,再与arima比较,最后加入调整因子。在最后一步中由于不确定节前节后多少天会对模型有影响,所以需要循环调试出最佳模型,这一步会相当复杂。

1.准备数据

setwd("C:/Users/asus/Desktop/我的文件/统计实验作业/2/数据")
getdata<-function(name){
  rm(list=ls())
  gc()
  y<- array(data=NA,dim=c(120,7,11))
  
  for(i in 1:length(name)){
    a<-as.matrix(read.csv(name[i],header=F))
    y[,,i]<-a
  }
  sumdata<-y[,,1]
  for(i in 2:10)
  {
    sumdata<-sumdata+y[,,i]
  }
  y[,,11]<-sumdata
  return(y)
}
name<- scan("areas.txt",what="character")
name
##  [1] "城区营销部.csv" "当阳营销部.csv" "五峰营销部.csv" "兴山营销部.csv"
##  [5] "夷陵营销部.csv" "宜都营销部.csv" "远安营销部.csv" "长阳营销部.csv"
##  [9] "枝江营销部.csv" "秭归营销部.csv"
getdata(name) -> y
# 各地区销量总和的时间序列
tsdata<-ts(y[,,11],start=c(2005,1),end=c(2012,12),frequency=12)
# 去除第六类雪茄类,因为不好分析而且没有多大意义
t7<-tsdata[,7]-tsdata[,6]
originals7<-y[,,11][1:108,7]- y[,,11][1:108,6]
plot(t7,type="o",col="darkblue") 

2.Use X13 method (default)

2.1 set system environment to use X13

Sys.setenv(X13_PATH="C:\\Users\\asus\\Desktop\\我的文件\\统计实验作业\\2\\x13\\x13as")
if (require("seasonal")!=T)
    install.packages("seasonal")  
## Loading required package: seasonal
## 
## seasonal now supports the HTML version of X13, which offers a more
## accessible output via the out() function. For best user experience, 
## download the HTML version from:
## 
##   http://www.census.gov/srd/www/x13as/x13down_pc.html
## 
## and copy x13ashtml.exe to:
## 
##   C:\Users\asus\Desktop\我的文件\统计实验作业\2\x13\x13as
library(seasonal)
checkX13()
## Congratulations! 'seasonal' should work fine!
##               - the X13_PATH is correctly specified
##               - the binary executable file has been found
##               - a test run has been successful
## 
## seasonal now supports the HTML version of X13, which offers a more
## accessible output via the out() function. For best user experience, 
## download the HTML version from:
## 
##   http://www.census.gov/srd/www/x13as/x13down_pc.html
## 
## and copy x13ashtml.exe to:
## 
##   C:\Users\asus\Desktop\我的文件\统计实验作业\2\x13\x13as

2.2 X13 默认

m<-seas(t7)
summary(m)
## 
## Call:
## seas(x = t7)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## Constant           0.01083    0.00642   1.687  0.09165 .  
## AO2011.Feb        -0.23340    0.05343  -4.369 1.25e-05 ***
## AR-Nonseasonal-01 -0.37664    0.10094  -3.731  0.00019 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## SEATS adj.  ARIMA: (1 0 0)(0 1 0)  Obs.: 96  Transform: log
## AICc:  2724, BIC:  2733  QS seas. test (adj. series):    0  
## Box-Ljung (no autocorr.): 18.64   Shapiro (normality): 0.9786
plot(m,type="o",trend=T)

2.3 趋势图

plot(final(m),col="cyan") # final adjusted value

plot(series(m,"s12"),col="hotpink") #trend  [plot(trend(m))]

2.4 画出季节指数图

monthplot(m)

plot(series(m,"s10")) #季节指数

monthplot(m,choice = "irregular")

2.5 模型的评价

因为有随机扰动的干扰,在评价模型的时候应该把各年的加起来算每一年的实际值与预测值的相对误差

#绝对相对误差
fit<-rowSums(t(array(final(m),dim = c(12,8))))
yfit<-c(fit,sum(series(m,"forecast.forecasts")[,1][1:12]))
## specs have been added to the model: forecast
yo<-rowSums(t(array(originals7,dim = c(12,9))))
year<-2005:2013
result<-data.frame(year,yfit,yo)
result$AD<-with(result,(abs(yfit-yo))/yo * 100)
result
##   year      yfit        yo        AD
## 1 2005 371606108 375606752 1.0651152
## 2 2006 374335297 380187500 1.5392939
## 3 2007 385774382 378153666 2.0152432
## 4 2008 384311808 378581577 1.5136054
## 5 2009 385085263 384448097 0.1657352
## 6 2010 391492753 387086557 1.1382973
## 7 2011 392594801 398983123 1.6011510
## 8 2012 402377648 404917304 0.6272036
## 9 2013 410256928 411067767 0.1972518
#平均绝对相对误差(%)
(MAD<- with(result,mean(abs(yfit-yo))/mean(yo) * 100))
## [1] 1.085614

运用X13默认参数,我们得到的平均绝对误差为 1.0856137 .

3. 尝试用 ARIMA model

3.1 The ARIMA model that used in X13

fivebestmdl(m)
##            arima    bic
## 1 (1 0 0)(0 1 0) -2.049
## 2 (1 0 0)(0 1 1) -2.038
## 3 (0 0 1)(0 1 0) -2.026
## 4 (0 0 2)(0 1 0) -1.999
## 5 (2 0 0)(0 1 0) -1.998

3.2 The ARIMA model

library(forecast)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: timeDate
## This is forecast 5.9
(arma7<-auto.arima(t7))
## Series: t7 
## ARIMA(0,0,1)(0,1,1)[12] with drift         
## 
## Coefficients:
##           ma1     sma1      drift
##       -0.5420  -0.3324  30084.826
## s.e.   0.0915   0.1401   8619.648
## 
## sigma^2 estimated as 7.966e+12:  log likelihood=-1166.79
## AIC=2341.58   AICc=2342.08   BIC=2351.3
f7<-forecast(arma7)
plot(f7,type="o")
lines(ts(originals7[97:112],start=c(2013,1),end=c(2014,4),frequency=12),col="red")
points(ts(originals7[97:112],start=c(2013,1),end=c(2014,4),frequency=12),col="red")

#相对绝对误差
{fit.a<-rowSums(t(array(f7$fitted,dim = c(12,8))))
yfit.a<-c(fit.a,sum(f7$mean[1:12]))
yo<-rowSums(t(array(originals7,dim = c(12,9))))
result.a<-data.frame(year,yfit.a,yo)
result.a$AD<-with(result.a,(abs(yfit.a-yo))/yo * 100)
result.a
#平均相对绝对误差(%)
(MAD.a<- with(result.a,mean(abs(yfit.a-yo))/mean(yo) * 100))
}
## [1] 1.432821
图中蓝线是预测的结果,红线是原始数据值.从图中看,arima模型还是不错的。平均绝对误差为 1.4328208. 下面尝试调整X13模型的arima参数,看看有没有效果。

4. 调整X13模型的参数

m1<- seas(t7,
           transform.function = "log",
           arima.model = "( 0 0 1)( 0 1 1)12")
#绝对相对误差
{fit.1<-rowSums(t(array(final(m1),dim = c(12,8))))
yfit.1<-c(fit.1,sum(series(m1,"forecast.forecasts")[,1][1:12]))
yo<-rowSums(t(array(originals7,dim = c(12,9))))
result.1<-data.frame(year,yfit.1,yo)
result.1$AD<-with(result.1,(abs(yfit.1-yo))/yo * 100)
result.1
#平均绝对相对误差(%)
(MAD.1<- with(result.1,mean(abs(yfit.1-yo))/mean(yo) * 100))
}
## specs have been added to the model: forecast
## [1] 1.098513
m2<-seas(t7,
         transform.function = "log",
         arima.model = "( 1 0 0)( 0 1 1)12",)
#绝对相对误差
{fit.2<-rowSums(t(array(final(m2),dim = c(12,8))))
yfit.2<-c(fit.2,sum(series(m2,"forecast.forecasts")[,1][1:12]))
result.2<-data.frame(year,yfit.2,yo)
result.2$AD<-with(result.2,(abs(yfit.2-yo))/yo * 100)
result.2
#平均绝对相对误差(%)
(MAD.2<- with(result.2,mean(abs(yfit.2-yo))/mean(yo) * 100))
}
## specs have been added to the model: forecast
## [1] 1.097507
m3<-seas(t7,
         transform.function = "log",
         arima.model = "( 0 0 1)( 0 1 1)")
#相对绝对误差
{fit.3<-rowSums(t(array(final(m3),dim = c(12,8))))
yfit.3<-c(fit.3,sum(series(m3,"forecast.forecasts")[,1][1:12]))
result.3<-data.frame(year,yfit.3,yo)
result.3$AD<-with(result.3,(abs(yfit.3-yo))/yo * 100)
result.3
#平均相对绝对误差(%)
(MAD.3<- with(result.3,mean(abs(yfit.3-yo))/mean(yo) * 100))
}
## specs have been added to the model: forecast
## [1] 1.098513

5个模型的平均绝对相对误差

cbind(MAD,MAD.a,MAD.1,MAD.2,MAD.3)
##           MAD    MAD.a    MAD.1    MAD.2    MAD.3
## [1,] 1.085614 1.432821 1.098513 1.097507 1.098513

比较上面的5个模型,发现通过改变X13模型中的arima参数,并没有改善。需要来调整其他参数,通过前面的分析我们知道了春节效应的影响,所以下面就用genhol来产生春节效应因子。

5.Genhol

通过组合实验300次,选择了(21,3,7,21).这一组参数,也就是年前21天开始到年前3天,和年后7天到年后21天这两段时间对总销量影响比较厉害,通过调整回归变量,使2005年到2013平均绝对误差达到了0.4488%,预测的2013年相对绝对误差为 0.4045% 。

data(holiday)
cny1<-window(genhol(cny, start = -21, end = -3, center = "calendar"),start=c(2005,1),end=c(2016,12))
cny2 <- window(genhol(cny, start = -2, end = 7, center = "calendar"),start=c(2005,1),end=c(2016,12))
cny3<- window(genhol(cny, start = 8, end = 21, center = "calendar"),start=c(2005,1),end=c(2016,12))
t1 <- cbind(cny1,cny2,cny3)
mt<- seas(t7,xreg = t1)
#相对绝对误差
fit.mt<-rowSums(t(array(final(mt),dim = c(12,8))))
yfit.mt<-c(fit.mt,sum(series(mt,"forecast.forecasts")[,1][1:12]))
## specs have been added to the model: forecast
yo<-rowSums(t(array(originals7,dim = c(12,9))))
result.mt<-data.frame(year,yfit.mt,yo)
result.mt$AD<-with(result.mt,(abs(yfit.mt-yo))/yo * 100)
result.mt
##   year   yfit.mt        yo        AD
## 1 2005 374102685 375606752 0.4004368
## 2 2006 375283183 380187500 1.2899734
## 3 2007 379163258 378153666 0.2669794
## 4 2008 381792705 378581577 0.8481997
## 5 2009 383483043 384448097 0.2510232
## 6 2010 388396441 387086557 0.3383955
## 7 2011 398565509 398983123 0.1046695
## 8 2012 405639172 404917304 0.1782755
## 9 2013 409404886 411067767 0.4045273
#平均相对绝对误差(%)
(MAD<-with(result.mt,mean(abs(yfit.mt-yo))/mean(yo)*100))
## [1] 0.4488786

 ---赵 炜  ~~11140040~~