## 该文件分析美国工艺成员的百分比的变化趋势
## daitu
## 统计预测与决策的作业
## 将使用混合预测模型
## 数据右1830年到2015年的数据
## 加载包
library(readr)
library(reshape)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
##
## rename
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(forecast)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: timeDate
## This is forecast 7.3
library(ggfortify)
##
## Attaching package: 'ggfortify'
## The following object is masked from 'package:forecast':
##
## gglagplot
## 读取数据
## 该数据为调查数据
UMdata <- read_csv("unions_states.csv")
## Parsed with column specification:
## cols(
## Code = col_integer(),
## State = col_character(),
## Sector = col_character(),
## Obs = col_integer(),
## Employment = col_integer(),
## Members = col_integer(),
## Covered = col_integer(),
## PctMem = col_double(),
## PctCov = col_double(),
## Year = col_integer()
## )
## 提取我们所需要的数据
UMdata2 <- UMdata %>%
group_by(Year) %>%
summarise(Employment = sum(Employment), # 工人总数
Members = sum(Members)) %>% # 工会成员认输
mutate(PctMem = Members/Employment * 100)
summary(UMdata2)
## Year Employment Members PctMem
## Min. :1983 Min. :111464625 Min. :16503689 Min. :11.00
## 1st Qu.:1991 1st Qu.:128415686 1st Qu.:18597256 1st Qu.:12.28
## Median :1999 Median :143427433 Median :20394437 Median :14.32
## Mean :1999 Mean :138453423 Mean :20090089 Mean :14.75
## 3rd Qu.:2007 3rd Qu.:146857305 3rd Qu.:21521407 3rd Qu.:16.95
## Max. :2015 Max. :155414681 Max. :24150856 Max. :21.67
UMdata2p = melt(as.data.frame(UMdata2[c("Year","Employment","Members")]),
id.vars = "Year")
## 时间序列数据可视化
p1 <- ggplot(UMdata2)+
theme_bw(base_family = "STKaiti") +
geom_line(aes(x = Year,y = Employment),colour = "blue") +
geom_point(aes(x = Year,y = Employment),colour = "blue",shape = 7) +
geom_line(aes(x = Year,y = Members),colour = "red") +
geom_point(aes(x = Year,y = Members),colour = "red",shape = 8) +
scale_y_continuous(labels = function(y) paste(y/10^6,"M",sep = "")) +
labs(x = "年份",y = "人数",title = "美国工作人数")
# p1 <- ggplot(UMdata2p,aes(x = Year,y = value))+
# theme_bw(base_family = "STKaiti") +
# geom_line(aes(colour = variable)) +
# geom_point(aes(shape = variable))+
# scale_y_continuous(labels = function(y) paste(y/10^6,"M",sep = "")) +
# theme(legend.position = "top") +
# labs(x = "年份",y = "人数",title = "美国工作人数")
p1

p2 <- ggplot(UMdata2)+
theme_bw(base_family = "STKaiti") +
geom_line(aes(x = Year,y = PctMem),colour = "blue") +
geom_point(aes(x = Year,y = PctMem),colour = "blue") +
labs(x = "年份",y = "百分比",title = "美国工会人数百分比")
grid.arrange(p1,p2,nrow = 1)

## 对百分比的走势进行建模预测####
## 我们使用
traind <- UMdata2[1:30,]
testd <- UMdata2[31:33,]
## 模型一,回归模型
## 使用回归模型预测
traind$Year2 <- traind$Year^2
testd$Year2 <- testd$Year^2
UMdata2$Year2 <- UMdata2$Year^2
# lm1 <- lm(PctMem ~ Year,data = traind)
# summary(lm1)
lm1 <- lm(PctMem ~ Year + Year2,data = traind)
summary(lm1)
##
## Call:
## lm(formula = PctMem ~ Year + Year2, data = traind)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45312 -0.26377 0.00935 0.18637 1.03714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.013e+04 3.641e+03 8.275 6.98e-09 ***
## Year -2.984e+01 3.645e+00 -8.185 8.64e-09 ***
## Year2 7.391e-03 9.125e-04 8.100 1.06e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3343 on 27 degrees of freedom
## Multiple R-squared: 0.9868, Adjusted R-squared: 0.9858
## F-statistic: 1009 on 2 and 27 DF, p-value: < 2.2e-16
## 可视化回归分析
par(mfrow = c(2,2))
plot(lm1)

## 对数据预测
UMdata2$lm1.p <- predict(lm1,UMdata2)
## 查看数据你和的效果
ggplot()+
theme_bw(base_family = "STKaiti") +
geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
geom_line(data = UMdata2,aes(x = Year,y = lm1.p),colour = "red") +
geom_point(data = UMdata2,aes(x = Year,y = lm1.p),colour = "red",shape = 2) +
geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
theme(legend.text = element_text()) +
labs(x = "年份",y = "百分比",title = "二次回归模型")

## 真实值与预测值的差
perror <- data.frame(LMerr = UMdata2$PctMem[31:33] - UMdata2$lm1.p[31:33])
UMdata2$PctMem[31:33] - UMdata2$lm1.p[31:33]
## [1] -0.2487769 -0.3436639 -0.3689510
## 模型2:时间序列模型ARIMA
## 根据图像我们可以使用ARIMA模型
## 构建时序数据
PctMers <- ts(traind$PctMem,start = 1983)
PctMers
## Time Series:
## Start = 1983
## End = 2012
## Frequency = 1
## [1] 21.66683 20.23070 19.30621 18.77613 18.19102 17.80142 17.43160
## [8] 17.01402 16.94600 16.53290 16.44595 16.02033 15.37369 15.07364
## [15] 14.56882 14.31802 14.31511 13.85194 13.87770 13.63068 13.10867
## [22] 12.67736 12.55138 12.00297 12.10210 12.49727 12.28210 11.80674
## [29] 11.75698 11.17049
par(family = "STKaiti",mfrow= c(1,1))
ts.plot(PctMers,main = "美国工会人数百分比",xlab = "时间",ylab = "百分比")

## 自动构建ARIMA
P.arima <- auto.arima(PctMers)
summary(P.arima)
## Series: PctMers
## ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.5468
## s.e. 0.2210
##
## sigma^2 estimated as 0.106: log likelihood=-7.97
## AIC=19.94 AICc=20.42 BIC=22.6
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06958824 0.3088381 0.2703008 0.3560171 1.895758 0.6794747
## ACF1
## Training set -0.004578832
## 使用模型进行预测
P.forecast <- forecast(P.arima, level = c(95), h = 3)
# autoplot(P.forecast,predict.colour = 'red', predict.linetype = 'dashed')
## 预测的数据
P.forecast$mean
## Time Series:
## Start = 2013
## End = 2015
## Frequency = 1
## [1] 10.81964 10.46880 10.11796
## 计算与实际的误差
UMdata2$PctMem[31:33] - as.numeric(P.forecast$mean)
## [1] 0.4343044 0.6151373 0.8803514
perror$ARIMAerr <- UMdata2$PctMem[31:33] - as.numeric(P.forecast$mean)
## 可视化ARIMA(0,2,1)模型的效果
UMdata2$ARIMA <- c(as.numeric(P.forecast$fitted),as.numeric(P.forecast$mean))
ggplot()+
theme_bw(base_family = "STKaiti") +
geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
geom_line(data = UMdata2,aes(x = Year,y = ARIMA),colour = "red") +
geom_point(data = UMdata2,aes(x = Year,y = ARIMA),colour = "red",shape = 2) +
geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
theme(legend.text = element_text()) +
labs(x = "年份",y = "百分比",title = "ARIMA模型")

## 将二次函数和ARIMA模型以一定的权重融合
## 找到最合适的权重
w1 <- 0.65
w2 <- 0.35
UMdata2$both <- w1*UMdata2$lm1.p + w2*UMdata2$ARIMA
ggplot()+
theme_bw(base_family = "STKaiti") +
geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
geom_line(data = UMdata2,aes(x = Year,y = both),colour = "red") +
geom_point(data = UMdata2,aes(x = Year,y = both),colour = "red",shape = 2) +
geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
theme(legend.text = element_text()) +
labs(x = "年份",y = "百分比",title = "混合模型")

## 误差
perror$MIXerr <- UMdata2$PctMem[31:33] - UMdata2$both[31:33]
UMdata2$PctMem[31:33] - UMdata2$both[31:33]
## [1] -0.009698409 -0.008083470 0.068304855
## 找到最合是的权重
w1 <- seq(0,1,by = 0.01)
w2 <- 1-w1
errs <- numeric(length = length(w1))
##
for(ii in 1:length(w1)){
both <- w1[ii]*UMdata2$lm1.p + w2[ii]*UMdata2$ARIMA
## 绝对值只和最小
errs[ii] <- mean(abs(UMdata2$PctMem[1:30] - both[1:30]))
}
ggplot() +
theme_bw(base_family = "STKaiti") +
geom_line(aes(x = w1,y = errs)) +
geom_point(aes(x = w1[which(errs == min(errs))],y = min(errs)),
colour = "red") +
geom_text(aes(x = w1[which(errs == min(errs))]-0.05,y = min(errs)+0.01,label = "w1=0.59,w2=0.41"))+
labs(y = "平均绝对值误差",title = "权重的选取")

## 构建误差最小的模型权重
w1 <- w1[which(errs == min(errs))]
w2 <- w2[which(errs == min(errs))]
UMdata2$both <- w1*UMdata2$lm1.p + w2*UMdata2$ARIMA
ggplot()+
theme_bw(base_family = "STKaiti") +
geom_line(data = traind,aes(x = Year,y = PctMem),colour = "blue") +
geom_point(data = traind,aes(x = Year,y = PctMem),colour = "blue",shape = 1) +
geom_line(data = UMdata2,aes(x = Year,y = both),colour = "red") +
geom_point(data = UMdata2,aes(x = Year,y = both),colour = "red",shape = 2) +
geom_line(data = testd,aes(x = Year,y = PctMem),colour = "black") +
geom_point(data = testd,aes(x = Year,y = PctMem),colour = "black",shape = 3) +
theme(legend.text = element_text()) +
labs(x = "年份",y = "百分比",title = "混合模型")

## 误差
perror$MIXerr <- UMdata2$PctMem[31:33] - UMdata2$both[31:33]
UMdata2$PctMem[31:33] - UMdata2$both[31:33]
## [1] 0.03128647 0.04944460 0.14326300
## 预测误差的对比图
perror$Year <- UMdata2$Year[31:33]
melt(perror,id.vars = "Year") %>%
ggplot(aes(x = Year,y = value,fill = variable)) +
theme_bw(base_family = "STKaiti") +
geom_bar(stat = "identity",position = position_dodge()) +
scale_x_continuous(labels = function(x) paste(x,"年",sep = "")) +
labs(x = "时间",y = "预测误差",title = "三个模型误差的对比")
