#file location
fL<-"https://gattonweb.uky.edu/sheather/book/docs/datasets/boxoffice.txt"
dta <- read.table(fL, header=T)
str(dta)
## 'data.frame': 32 obs. of 2 variables:
## $ GrossBoxOffice: num 95.3 86.4 119.4 124.4 154.2 ...
## $ year : int 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
了解資料結構
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
library(ggplot2)
載入需要使用的package
dta3v<- dta %>%
mutate(Year_75 = 1:32 ) %>%
rename(Box_Office = GrossBoxOffice)
str(dta3v)
## 'data.frame': 32 obs. of 3 variables:
## $ Box_Office: num 95.3 86.4 119.4 124.4 154.2 ...
## $ year : int 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 ...
## $ Year_75 : int 1 2 3 4 5 6 7 8 9 10 ...
創建Year_75年代序位欄,並把GrossBoxOffice欄名改為Box_Office
Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt ~ N(0, σ).
m1 <- lm(Box_Office ~ Year_75, data=dta3v)
summary(m1)
##
## Call:
## lm(formula = Box_Office ~ Year_75, data = dta3v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -116.382 -79.197 6.083 62.260 121.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -55.931 28.034 -1.995 0.0552 .
## Year_75 29.534 1.483 19.919 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77.44 on 30 degrees of freedom
## Multiple R-squared: 0.9297, Adjusted R-squared: 0.9274
## F-statistic: 396.8 on 1 and 30 DF, p-value: < 2.2e-16
knitr::kable(summary(m1)$coefficients)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -55.93105 | 28.034364 | -1.995089 | 0.055186 |
Year_75 | 29.53446 | 1.482698 | 19.919403 | 0.000000 |
年代對總票房的線性效果
# resid
knitr::kable(as.data.frame(summary(m1g <- gls(Box_Office ~ Year_75, data = dta3v))$tTable))
Value | Std.Error | t-value | p-value | |
---|---|---|---|---|
(Intercept) | -55.93105 | 28.034364 | -1.995089 | 0.055186 |
Year_75 | 29.53446 | 1.482698 | 19.919403 | 0.000000 |
dta3v1 <- dta3v %>%
mutate(res1 = resid(m1g, type = "normalized"))
用gls進行殘差分析,並把殘差項進行標準化,數據儲存到data3v1。在這邊沒有指定gls的估算法,所以以預設RMEL進行估算。
lmtest::dwtest(res1 ~ year, data=dta3v1)
##
## Durbin-Watson test
##
## data: res1 ~ year
## DW = 0.24809, p-value = 2.344e-13
## alternative hypothesis: true autocorrelation is greater than 0
使用Durbin-Watson test檢測回歸分析中的殘差是否有自我相關性(autocorrelation),分析結果顯示DW=0.25大於0,所以總票房與時間序列有自相關存在。
ggplot(dta3v, aes(Year_75, Box_Office)) +
geom_point(aes(Year_75, Box_Office), color = "grey70") +
stat_smooth(method = "lm", se = F, size = rel(1), lty = 2, color = "blue") +
labs(x = "Year since 1975", y = "Gross Box Office (in million $)") +
theme(legend.position = "NONE")+
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
先畫線性圖看看總票房與時間序列的變化圖。看起來隨時間總票房線性FIT不是很好。
acf(resid(m1g, type = "normalized"), main = "LM", ylim = c(-1, 1))
繪製自相關格子圖,y軸設定在-1到1之間,看看殘差隨時間序列延遲效果是否平穩 從自相關圖來看,呈現三角對稱形式,不存在截尾或拖尾,屬於單調序列的典型表現>形式,原始資料屬於不平穩序列。
Box_office_t = β0 + β1 × Year_75t + εt, t = 1, …, 32, where εt = εt-1 + νt, and νt ~ N(0, σ).
因為資料顯示總票房與時間序列自相關屬不平穩的序列型態,因此這次在殘差模型中>改用“ML”的方法。
knitr::kable(as.data.frame(summary(m2g <- gls(Box_Office ~ Year_75, data = dta3v, method="ML", correlation=corAR1(form= ~ Year_75)))$tTable))
Value | Std.Error | t-value | p-value | |
---|---|---|---|---|
(Intercept) | 4.514082 | 72.743933 | 0.0620544 | 0.9509311 |
Year_75 | 27.075395 | 3.447663 | 7.8532593 | 0.0000000 |
dta3v2 <- dta3v %>%
mutate(res2 = resid(m2g, type = "normalized"))
# Durbin-Watson test
lmtest::dwtest(res2 ~ year, data=dta3v2)
##
## Durbin-Watson test
##
## data: res2 ~ year
## DW = 2.2012, p-value = 0.6495
## alternative hypothesis: true autocorrelation is greater than 0
P值>0.05, 不具autocorrelation 效果
# RES plot
ggplot(dta3v2, aes(Year_75, res2)) +
geom_point(aes(Year_75, res2), color = "grey70") +
geom_hline(yintercept = 0, size = rel(1), color = "grey60")+
labs(x = "Year since 1975", y = "Standarized Residuals") +
theme(legend.position = "NONE")+
theme_bw()
標準化殘差散佈圖,大部分都分布在1~-1之間,看起來分布還算均勻
# acf
acf(resid(m2g, type = "normalized"), main = "GLS", ylim = c(-1, 1))
再次繪製自相關圖,結果呈現截尾的狀態。時間序列對總票房的效果隻字相關性趨於平穩。