mix_model.R

## 该文件分析美国工艺成员的百分比的变化趋势
## daitu
## 统计预测与决策的作业
## 将使用混合预测模型
## 数据右1830年到2015年的数据


## 加载包
library(readr)
library(reshape)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:reshape':
## 
##     rename

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(forecast)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: timeDate

## This is forecast 7.3

library(ggfortify)

## 
## Attaching package: 'ggfortify'

## The following object is masked from 'package:forecast':
## 
##     gglagplot

## 读取数据
## 该数据为调查数据
UMdata <- read_csv("unions_states.csv")

## Parsed with column specification:
## cols(
##   Code = col_integer(),
##   State = col_character(),
##   Sector = col_character(),
##   Obs = col_integer(),
##   Employment = col_integer(),
##   Members = col_integer(),
##   Covered = col_integer(),
##   PctMem = col_double(),
##   PctCov = col_double(),
##   Year = col_integer()
## )

## 提取我们所需要的数据
UMdata2 <- UMdata %>%
  group_by(Year) %>%
  summarise(Employment = sum(Employment), # 工人总数
            Members = sum(Members)) %>%  # 工会成员认输
  mutate(PctMem = Members/Employment * 100)
  
summary(UMdata2)

##       Year        Employment           Members             PctMem     
##  Min.   :1983   Min.   :111464625   Min.   :16503689   Min.   :11.00  
##  1st Qu.:1991   1st Qu.:128415686   1st Qu.:18597256   1st Qu.:12.28  
##  Median :1999   Median :143427433   Median :20394437   Median :14.32  
##  Mean   :1999   Mean   :138453423   Mean   :20090089   Mean   :14.75  
##  3rd Qu.:2007   3rd Qu.:146857305   3rd Qu.:21521407   3rd Qu.:16.95  
##  Max.   :2015   Max.   :155414681   Max.   :24150856   Max.   :21.67

UMdata2p = melt(as.data.frame(UMdata2[c("Year","Employment","Members")]),
                id.vars = "Year")
## 时间序列数据可视化
p1 <- ggplot(UMdata2)+
  theme_bw(base_family = "STKaiti") +
  geom_line(aes(x = Year,y = Employment),colour = "blue") +
  geom_point(aes(x = Year,y = Employment),colour = "blue",shape = 7) +
  geom_line(aes(x = Year,y = Members),colour = "red") +
  geom_point(aes(x = Year,y = Members),colour = "red",shape = 8) +
  scale_y_continuous(labels = function(y) paste(y/10^6,"M",sep = "")) +
  labs(x = "年份",y = "人数",title = "美国工作人数")

# p1 <- ggplot(UMdata2p,aes(x = Year,y = value))+
#   theme_bw(base_family = "STKaiti") +
#   geom_line(aes(colour = variable)) + 
#   geom_point(aes(shape = variable))+ 
#   scale_y_continuous(labels = function(y) paste(y/10^6,"M",sep = "")) +
#   theme(legend.position = "top") +
#   labs(x = "年份",y = "人数",title = "美国工作人数")

p1

p2 <-  ggplot(UMdata2)+
  theme_bw(base_family = "STKaiti") +
  geom_line(aes(x = Year,y = PctMem),colour = "blue") +
  geom_point(aes(x = Year,y = PctMem),colour = "blue") +
  labs(x = "年份",y = "百分比",title = "美国工会人数百分比")

grid.arrange(p1,p2,nrow = 1)

## 对百分比的走势进行建模预测####
## 我们使用
traind <- UMdata2[1:30,]
testd <- UMdata2[31:33,]

## 模型一，回归模型
## 使用回归模型预测
traind$Year2 <- traind$Year^2
testd$Year2 <- testd$Year^2
UMdata2$Year2 <- UMdata2$Year^2

# lm1 <- lm(PctMem ~ Year,data = traind)
# summary(lm1)


lm1 <- lm(PctMem ~ Year + Year2,data = traind)
summary(lm1)

## 
## Call:
## lm(formula = PctMem ~ Year + Year2, data = traind)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45312 -0.26377  0.00935  0.18637  1.03714 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.013e+04  3.641e+03   8.275 6.98e-09 ***
## Year        -2.984e+01  3.645e+00  -8.185 8.64e-09 ***
## Year2        7.391e-03  9.125e-04   8.100 1.06e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3343 on 27 degrees of freedom
## Multiple R-squared:  0.9868, Adjusted R-squared:  0.9858 
## F-statistic:  1009 on 2 and 27 DF,  p-value: < 2.2e-16

## 可视化回归分析
par(mfrow = c(2,2))
plot(lm1)

## 对数据预测
UMdata2$lm1.p <- predict(lm1,UMdata2)
## 查看数据你和的效果
ggplot()+
  theme_bw(base_family = "STKaiti") +
  geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
  geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
  geom_line(data = UMdata2,aes(x = Year,y = lm1.p),colour = "red") +
  geom_point(data = UMdata2,aes(x = Year,y = lm1.p),colour = "red",shape = 2) +
  geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
  geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
  theme(legend.text = element_text()) +
  labs(x = "年份",y = "百分比",title = "二次回归模型")

## 真实值与预测值的差
perror <- data.frame(LMerr = UMdata2$PctMem[31:33] - UMdata2$lm1.p[31:33])
UMdata2$PctMem[31:33] - UMdata2$lm1.p[31:33]

## [1] -0.2487769 -0.3436639 -0.3689510

## 模型2:时间序列模型ARIMA
## 根据图像我们可以使用ARIMA模型
## 构建时序数据
PctMers <- ts(traind$PctMem,start = 1983)
PctMers

## Time Series:
## Start = 1983 
## End = 2012 
## Frequency = 1 
##  [1] 21.66683 20.23070 19.30621 18.77613 18.19102 17.80142 17.43160
##  [8] 17.01402 16.94600 16.53290 16.44595 16.02033 15.37369 15.07364
## [15] 14.56882 14.31802 14.31511 13.85194 13.87770 13.63068 13.10867
## [22] 12.67736 12.55138 12.00297 12.10210 12.49727 12.28210 11.80674
## [29] 11.75698 11.17049

par(family = "STKaiti",mfrow= c(1,1))
ts.plot(PctMers,main = "美国工会人数百分比",xlab = "时间",ylab = "百分比")

## 自动构建ARIMA
P.arima <- auto.arima(PctMers)
summary(P.arima)

## Series: PctMers 
## ARIMA(0,2,1)                    
## 
## Coefficients:
##           ma1
##       -0.5468
## s.e.   0.2210
## 
## sigma^2 estimated as 0.106:  log likelihood=-7.97
## AIC=19.94   AICc=20.42   BIC=22.6
## 
## Training set error measures:
##                      ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 0.06958824 0.3088381 0.2703008 0.3560171 1.895758 0.6794747
##                      ACF1
## Training set -0.004578832

## 使用模型进行预测
P.forecast <- forecast(P.arima, level = c(95), h = 3)
# autoplot(P.forecast,predict.colour = 'red', predict.linetype = 'dashed')
## 预测的数据
P.forecast$mean

## Time Series:
## Start = 2013 
## End = 2015 
## Frequency = 1 
## [1] 10.81964 10.46880 10.11796

## 计算与实际的误差
UMdata2$PctMem[31:33] - as.numeric(P.forecast$mean)

## [1] 0.4343044 0.6151373 0.8803514

perror$ARIMAerr <- UMdata2$PctMem[31:33] - as.numeric(P.forecast$mean)
## 可视化ARIMA(0,2,1)模型的效果
UMdata2$ARIMA <- c(as.numeric(P.forecast$fitted),as.numeric(P.forecast$mean))

ggplot()+
  theme_bw(base_family = "STKaiti") +
  geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
  geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
  geom_line(data = UMdata2,aes(x = Year,y = ARIMA),colour = "red") +
  geom_point(data = UMdata2,aes(x = Year,y = ARIMA),colour = "red",shape = 2) +
  geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
  geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
  theme(legend.text = element_text()) +
  labs(x = "年份",y = "百分比",title = "ARIMA模型")

## 将二次函数和ARIMA模型以一定的权重融合
## 找到最合适的权重
w1 <- 0.65
w2 <- 0.35

UMdata2$both <- w1*UMdata2$lm1.p + w2*UMdata2$ARIMA

ggplot()+
  theme_bw(base_family = "STKaiti") +
  geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
  geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
  geom_line(data = UMdata2,aes(x = Year,y = both),colour = "red") +
  geom_point(data = UMdata2,aes(x = Year,y = both),colour = "red",shape = 2) +
  geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
  geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
  theme(legend.text = element_text()) +
  labs(x = "年份",y = "百分比",title = "混合模型")

## 误差 
perror$MIXerr <- UMdata2$PctMem[31:33] - UMdata2$both[31:33]
UMdata2$PctMem[31:33] - UMdata2$both[31:33]

## [1] -0.009698409 -0.008083470  0.068304855

## 找到最合是的权重
w1 <- seq(0,1,by = 0.01)
w2 <- 1-w1
errs <- numeric(length = length(w1))
## 
for(ii in 1:length(w1)){
  both <- w1[ii]*UMdata2$lm1.p + w2[ii]*UMdata2$ARIMA
  ## 绝对值只和最小
  errs[ii] <- mean(abs(UMdata2$PctMem[1:30] - both[1:30]))
}

ggplot() +
  theme_bw(base_family = "STKaiti") +
  geom_line(aes(x = w1,y = errs)) +
  geom_point(aes(x = w1[which(errs == min(errs))],y = min(errs)),
             colour = "red") +
  geom_text(aes(x = w1[which(errs == min(errs))]-0.05,y = min(errs)+0.01,label = "w1=0.59,w2=0.41"))+
  labs(y = "平均绝对值误差",title = "权重的选取")

## 构建误差最小的模型权重
w1 <- w1[which(errs == min(errs))]
w2 <- w2[which(errs == min(errs))]


UMdata2$both <- w1*UMdata2$lm1.p + w2*UMdata2$ARIMA

ggplot()+
  theme_bw(base_family = "STKaiti") +
  geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
  geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
  geom_line(data = UMdata2,aes(x = Year,y = both),colour = "red") +
  geom_point(data = UMdata2,aes(x = Year,y = both),colour = "red",shape = 2) +
  geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
  geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
  theme(legend.text = element_text()) +
  labs(x = "年份",y = "百分比",title = "混合模型")

## 误差 
perror$MIXerr <- UMdata2$PctMem[31:33] - UMdata2$both[31:33]
UMdata2$PctMem[31:33] - UMdata2$both[31:33]

## [1] 0.03128647 0.04944460 0.14326300

## 预测误差的对比图
perror$Year <- UMdata2$Year[31:33]

melt(perror,id.vars = "Year") %>%
  ggplot(aes(x = Year,y = value,fill = variable)) +
  theme_bw(base_family = "STKaiti") +
  geom_bar(stat = "identity",position = position_dodge()) +
  scale_x_continuous(labels = function(x) paste(x,"年",sep = "")) +
  labs(x = "时间",y = "预测误差",title = "三个模型误差的对比")

mix_model.R

daitu

Wed Nov 16 10:54:21 2016