#install.packages("forecast")
simpler <- lm(Petal.Length ~ Sepal.Length, data =iris)
biggermod <- lm(Petal.Length ~ Sepal.Length + Species, data =iris)
summary(biggermod) #why can't we use this to decide?
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Species, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76390 -0.17875 0.00716 0.17461 0.79954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.70234 0.23013 -7.397 1.01e-11 ***
## Sepal.Length 0.63211 0.04527 13.962 < 2e-16 ***
## Speciesversicolor 2.21014 0.07047 31.362 < 2e-16 ***
## Speciesvirginica 3.09000 0.09123 33.870 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2826 on 146 degrees of freedom
## Multiple R-squared: 0.9749, Adjusted R-squared: 0.9744
## F-statistic: 1890 on 3 and 146 DF, p-value: < 2.2e-16
Doesn’t answer exactly our question. It answers a related, but not identical research question.
anova(biggermod, simpler)
## Analysis of Variance Table
##
## Model 1: Petal.Length ~ Sepal.Length + Species
## Model 2: Petal.Length ~ Sepal.Length
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 146 11.657
## 2 148 111.459 -2 -99.802 624.99 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
logLik(simpler)
## 'log Lik.' -190.5675 (df=3)
logLik(biggermod)
## 'log Lik.' -21.23709 (df=5)
mydf <- length(coef(biggermod)) - length(coef(simpler))
(teststat<- -2*as.numeric(logLik(simpler) - logLik(biggermod)))
## [1] 338.6608
pchisq(teststat, df = mydf, lower.tail = FALSE)
## [1] 2.888934e-74
Returns the log likelihood value. What we want is to find the difference between the log likelihoods and then double it. This is 338 in this case and it is positive… yay!
Either calculate the degrees of freedom or the test stat first. It doesn’t matter.
Now, we compare them by doing the degrees of freedom.
length((coef(biggermod)))
## [1] 4
length((coef(simpler)))
## [1] 2
We see we have 4 betas in the bigger model and 2 in the smaller model. The difference is 2
The p-value is 2.8889 * (10^-74)
Basically the species adds a whole bunch of new information even after accounting for length.
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(quantmod)
## Warning: package 'quantmod' was built under R version 3.4.4
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.4.4
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.4
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 3.4.3
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.4
library(forecast)
library(xts)
births <- scan("http://robjhyndman.com/tsdldata/data/nybirths.dat")
births <- births[-c(1:24)]
births <- ts(births, frequency=12, start=c(1946,1))
Recall from last time, the following:
mod1 <- arima(births, order = c(1,0,1), season = list( order = c( 1,1,1), period=12))
mod2 <- arima(births, order = c(2,0,2), season = list( order = c( 1,1,1), period=12))
mod1
##
## Call:
## arima(x = births, order = c(1, 0, 1), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.9884 -0.1564 -0.2199 -0.8820
## s.e. 0.0287 0.1747 0.1004 0.1498
##
## sigma^2 estimated as 0.3924: log likelihood = -136.81, aic = 283.61
mod2
##
## Call:
## arima(x = births, order = c(2, 0, 2), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## 1.2898 -0.2902 -0.5257 -0.2915 -0.2450 -0.8684
## s.e. 0.1738 0.1735 0.1660 0.1090 0.1029 0.1309
##
## sigma^2 estimated as 0.3295: log likelihood = -125.31, aic = 264.62
mydf <- length(coef(mod2)) - length(coef(mod1))
(teststat<- -2*as.numeric(logLik(mod1) - logLik(mod2)))
## [1] 22.98939
pchisq(teststat, df = mydf, lower.tail = FALSE)
## [1] 1.018396e-05
We are going to use these two models to look at the output.
Mod1 is nested in Mod2 … ALWAYS make sure this is the case or you will be doing something that is NOT allowed.
6-4 = 2, so we have 2 degrees of freedom
We have significant evidence that model 2 is beetter than model 1
We don’t know that it is the BEST model, but we do know it is better than the simpler model. This is another rule to do the model selection stuff.
data(wineind)
plot(wineind)
head(wineind)
## Jan Feb Mar Apr May Jun
## 1980 15136 16733 20016 17708 18019 19227
names(wineind)
## NULL
mod1 <- arima(wineind, order = c(1,0,1), season = list( order = c( 1,1,1), period=12))
mod2 <- arima(wineind, order = c(2,0,2), season = list( order = c( 1,1,1), period=12))
mod1
##
## Call:
## arima(x = wineind, order = c(1, 0, 1), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.9922 -0.8978 0.1994 -0.7746
## s.e. 0.0109 0.0323 0.1299 0.1055
##
## sigma^2 estimated as 5295670: log likelihood = -1505.84, aic = 3021.67
mod2
##
## Call:
## arima(x = wineind, order = c(2, 0, 2), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## 0.0962 0.8891 0.0917 -0.8969 0.2023 -0.7744
## s.e. 0.0472 0.0463 0.0814 0.0747 0.1323 0.1090
##
## sigma^2 estimated as 5062547: log likelihood = -1503.66, aic = 3021.32
mydf <- length(coef(mod2)) - length(coef(mod1))
(teststat<- -2*as.numeric(logLik(mod1) - logLik(mod2)))
## [1] 4.348894
pchisq(teststat, df = mydf, lower.tail = FALSE)
## [1] 0.113671
I see that the test stat is 4.348894 and the p-value is .113671
This means that mod1 is the better fit.
`
mod3 <- arima(wineind, order = c(1,1,1), season = list( order = c( 1,1,1), period=12))
mod4 <- arima(wineind, order = c(2,2,2), season = list( order = c( 1,1,1), period=12))
mod1
##
## Call:
## arima(x = wineind, order = c(1, 0, 1), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.9922 -0.8978 0.1994 -0.7746
## s.e. 0.0109 0.0323 0.1299 0.1055
##
## sigma^2 estimated as 5295670: log likelihood = -1505.84, aic = 3021.67
mod2
##
## Call:
## arima(x = wineind, order = c(2, 0, 2), seasonal = list(order = c(1, 1, 1), period = 12))
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## 0.0962 0.8891 0.0917 -0.8969 0.2023 -0.7744
## s.e. 0.0472 0.0463 0.0814 0.0747 0.1323 0.1090
##
## sigma^2 estimated as 5062547: log likelihood = -1503.66, aic = 3021.32
mydf <- length(coef(mod4)) - length(coef(mod3))
(teststat<- -2*as.numeric(logLik(mod3) - logLik(mod4)))
## [1] 6.824367
pchisq(teststat, df = mydf, lower.tail = FALSE)
## [1] 0.03296913
The p-value is .032969 which means that mod3 is the better model.