library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tsibble)
##
## Attaching package: 'tsibble'
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(tsibbledata)
library(ggplot2)
library(ggfortify)
library(dplyr)
library(fable)
## Loading required package: fabletools
library(fabletools)
library(feasts)
jan <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as.Date(Date)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
jan %>%
gather("Variable", "Value", Demand, Temperature) %>%
ggplot(aes(x = Date, y = Value)) +
geom_line() +
facet_grid(vars(Variable), scales = "free_y") +
labs(title = "Electricity Demand")
jan %>%
ggplot(aes(x=Temperature, y=Demand)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "Electricity Demand")
## `geom_smooth()` using formula 'y ~ x'
a.January is during the Australian Summer, increased temperatures will garner higher demand.
library(tidyverse)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(ggfortify)
library(dplyr)
library(fable)
library(fabletools)
library(feasts)
fit <- jan %>% model(TSLM(Demand ~ Temperature))
fit %>% gg_tsresiduals()
There doesnt’t appear to be any part of the data is is an outlier or influential
d15 <- jan %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan)
d35 <- jan %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan)
d15
d35
The 35 degree forceast looks possible, but the 15 degree forecast has the upper values all what would be close to all time lows in demand making it unlikely in my opinion.
library(stats)
fit <- lm(Demand ~ Temperature, data=jan)
newdata<-data.frame(Temperature=c(15,35))
predict(fit, newdata,interval="predict")
## fit lwr upr
## 1 151398.4 97951.22 204845.5
## 2 274484.2 222783.69 326184.8
plot(Demand~Temperature, data=vic_elec, main= "Demand vs. Temp")
Question 2.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
##
## Attaching package: 'forecast'
## The following objects are masked from 'package:fabletools':
##
## accuracy, forecast
autoplot(olympic_running)
## Plot variable not specified, automatically selected `.vars = Time`
olympic <- lm(Time ~ ., olympic_running)
olympic
##
## Call:
## lm(formula = Time ~ ., data = olympic_running)
##
## Coefficients:
## (Intercept) Year Length Sexwomen
## 734.9863 -0.3905 0.1766 32.9732
based off the coefficient, .39 seconds per year
checkresiduals(olympic)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 179.8, df = 10, p-value < 2.2e-16
because of the pattern, I dont think it is a good fit
Question 3. For some reason I am having a very frustating issue where my code runs perfectly in chunks, but when I try to knit it says it cannot find the object “souvenirs” I took screen shot of
library(forecast)
library(tseries)
#autoplot(souvenirs)
There is a very obvious trend and seasonal pattern
#souv<- ts(souvenirs$Sales,start=c(1987,1), end=c(1993,6), frequency=12)
#log.souvenirs = log(souv)
#dummy.fest = rep(0, length(souv))
#dummy.fest[seq_along(dummy.fest)%%12 == 3] = 1
#dummy.fest[3] = 0
#dummy.fest = ts(dummy.fest, freq = 12, start=c(1987,1))
#new.data = data.frame(log.souvenirs, dummy.fest)
#fit = tslm(log.souvenirs ~ trend + season + dummy.fest, data=new.data)
#fore.data = data.frame(dummy.fest = rep(0, 12))
#fore.data[3,] = 1
#forecast(fit, newdata=fore.data)
#autoplot(fit$residuals)
boxplot(residuals(fit)~cycle(residuals(fit)))
e. On some months, the residuals are a lot more varied than others. This
can indicate some problems with the model.
fit$coefficients
## (Intercept) Temperature
## 59083.929 6154.295
checkresiduals(fit)
##
## Breusch-Godfrey test for serial correlation of order up to 6
##
## data: Residuals
## LM test = 6.8794, df = 6, p-value = 0.3321
#fore.data = data.frame(dummy.fest = rep(0, 36))
#reds = forecast(fit, newdata=fore.data)
#preds
Question 4.
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ lubridate 1.8.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
autoplot(us_gasoline)
## Plot variable not specified, automatically selected `.vars = Barrels`
gas <- ts(us_gasoline$Barrels, start=(1991-1), end=(2017-1),frequency = 52)
gas <- window(gas, end = 2005)
for(num in c(1, 2, 3, 5, 10)){
var_name <- paste("fit",
as.character(num),
"_gasoline_2004",
sep = "")
assign(var_name,
tslm(gas ~ trend + fourier(
gas,
K = num
))
)
print(
autoplot(gas) +
autolayer(get(var_name)$fitted.values,
series = as.character(num)) +
ggtitle(var_name) +
ylab("gas") +
guides(colour = guide_legend(title = " Fourier "))
)}
for(i in c(1, 2, 3, 5, 10)){
fit.name <- paste(
"fit", as.character(i), "_gasoline_2004",
sep = ""
)
writeLines(
paste(
"\n", fit.name, "\n"
)
)
print(CV(get(fit.name)))
}
##
## fit1_gasoline_2004
##
## CV AIC AICc BIC AdjR2
## 8.012058e-02 -1.969129e+03 -1.969052e+03 -1.945827e+03 8.427750e-01
##
## fit2_gasoline_2004
##
## CV AIC AICc BIC AdjR2
## 7.919464e-02 -1.978217e+03 -1.978072e+03 -1.945593e+03 8.449887e-01
##
## fit3_gasoline_2004
##
## CV AIC AICc BIC AdjR2
## 7.470266e-02 -2.023838e+03 -2.023604e+03 -1.981892e+03 8.541546e-01
##
## fit5_gasoline_2004
##
## CV AIC AICc BIC AdjR2
## 7.365185e-02 -2.034993e+03 -2.034518e+03 -1.974405e+03 8.569479e-01
##
## fit10_gasoline_2004
##
## CV AIC AICc BIC AdjR2
## 7.173572e-02 -2.056054e+03 -2.054595e+03 -1.948861e+03 8.624864e-01
10 has the best AICC value
checkresiduals(fit10_gasoline_2004)
##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 128.54, df = 104, p-value = 0.05168
fc_gasoline_2005 <- forecast(
fit10_gasoline_2004,
newdata=data.frame(fourier(
gas, K = 10, h = 52)
)
)
fc_gasoline_2005
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2005.019 8.901276 8.557367 9.245186 8.374931 9.427622
## 2005.038 8.940132 8.596126 9.284138 8.413638 9.466626
## 2005.058 8.982049 8.638038 9.326060 8.455548 9.508551
## 2005.077 9.056949 8.712952 9.400946 8.530469 9.583429
## 2005.096 9.143925 8.799930 9.487920 8.617448 9.670401
## 2005.115 9.192581 8.848582 9.536579 8.666099 9.719062
## 2005.135 9.185356 8.841349 9.529363 8.658862 9.711851
## 2005.154 9.160913 8.816906 9.504919 8.634419 9.687406
## 2005.173 9.169559 8.825559 9.513560 8.643075 9.696044
## 2005.192 9.216866 8.872868 9.560865 8.690384 9.743348
## 2005.212 9.264222 8.920221 9.608223 8.737736 9.790708
## 2005.231 9.281951 8.937946 9.625957 8.755458 9.808444
## 2005.250 9.285826 8.941821 9.629831 8.759334 9.812318
## 2005.269 9.312227 8.968226 9.656228 8.785741 9.838713
## 2005.288 9.367159 9.023159 9.711159 8.840675 9.893643
## 2005.308 9.416957 9.072955 9.760958 8.890470 9.943444
## 2005.327 9.433266 9.089261 9.777272 8.906774 9.959758
## 2005.346 9.433725 9.089720 9.777729 8.907234 9.960216
## 2005.365 9.463951 9.119950 9.807952 8.937465 9.990437
## 2005.385 9.540677 9.196677 9.884678 9.014193 10.067162
## 2005.404 9.625983 9.281981 9.969985 9.099496 10.152471
## 2005.423 9.665636 9.321631 10.009640 9.139144 10.192127
## 2005.442 9.647059 9.303055 9.991063 9.120569 10.173550
## 2005.462 9.610131 9.266130 9.954132 9.083645 10.136617
## 2005.481 9.601794 9.257794 9.945795 9.075310 10.128279
## 2005.500 9.629756 9.285754 9.973759 9.103268 10.156244
## 2005.519 9.664433 9.320428 10.008438 9.137941 10.190924
## 2005.538 9.676302 9.332298 10.020306 9.149812 10.202792
## 2005.558 9.658713 9.314712 10.002714 9.132227 10.185199
## 2005.577 9.615773 9.271772 9.959773 9.089288 10.142258
## 2005.596 9.544821 9.200818 9.888823 9.018333 10.071309
## 2005.615 9.445303 9.101298 9.789309 8.918811 9.971795
## 2005.635 9.341433 8.997429 9.685437 8.814943 9.867923
## 2005.654 9.279412 8.935411 9.623412 8.752926 9.805897
## 2005.673 9.290336 8.946336 9.634336 8.763851 9.816820
## 2005.692 9.357646 9.013643 9.701648 8.831157 9.884134
## 2005.712 9.428370 9.084364 9.772376 8.901877 9.954863
## 2005.731 9.457460 9.113457 9.801464 8.930971 9.983950
## 2005.750 9.437719 9.093720 9.781719 8.911235 9.964203
## 2005.769 9.389996 9.045997 9.733996 8.863513 9.916480
## 2005.788 9.337513 8.993510 9.681516 8.811024 9.864001
## 2005.808 9.298876 8.954869 9.642883 8.772382 9.825371
## 2005.827 9.296233 8.952230 9.640237 8.769744 9.822723
## 2005.846 9.346559 9.002561 9.690557 8.820078 9.873040
## 2005.865 9.430551 9.086553 9.774549 8.904070 9.957032
## 2005.885 9.480414 9.136411 9.824418 8.953925 10.006904
## 2005.904 9.423219 9.079208 9.767229 8.896719 9.949719
## 2005.923 9.251762 8.907762 9.595762 8.725278 9.778246
## 2005.942 9.047458 8.703470 9.391446 8.520993 9.573924
## 2005.962 8.918927 8.574940 9.262915 8.392462 9.445392
## 2005.981 8.911344 8.567381 9.255307 8.384917 9.437771
## 2006.000 8.978870 8.634891 9.322849 8.452418 9.505323
autoplot(fc_gasoline_2005)
gas_05 <- ts(us_gasoline$Barrels, start=(1991-1), end=(2005-1),frequency = 52)
autoplot(gas_05)
The prediction forecast was very accurate in predicting 2005.
Question 5.
global_economy %>%
filter(Country=="Afghanistan") %>%
tsibble(key = Code, index = Year)%>%
autoplot(Population, show.legend = FALSE) +
labs(title= "Afghanistan Population",
y = "Population")
yes, there is a decrease in population
global_economy %>%
filter(Country=="Afghanistan")%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -5794518 -2582559 744761 2259222 6036280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -829292529 49730866 -16.68 <2e-16 ***
## Year 425774 25008 17.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3188000 on 56 degrees of freedom
## Multiple R-squared: 0.8381, Adjusted R-squared: 0.8352
## F-statistic: 289.9 on 1 and 56 DF, p-value: < 2.22e-16
global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year<1980)%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -146380.8 -110290.6 -451.2 105877.8 202881.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -470734527 8787062 -53.57 <2e-16 ***
## Year 244657 4462 54.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 115100 on 18 degrees of freedom
## Multiple R-squared: 0.994, Adjusted R-squared: 0.9937
## F-statistic: 3007 on 1 and 18 DF, p-value: < 2.22e-16
global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year>1989)%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -619234 -212927 6598 234280 612277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.670e+09 1.640e+07 -101.8 <2e-16 ***
## Year 8.451e+05 8.184e+03 103.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 349800 on 26 degrees of freedom
## Multiple R-squared: 0.9976, Adjusted R-squared: 0.9975
## F-statistic: 1.066e+04 on 1 and 26 DF, p-value: < 2.22e-16
It is noticably lower before 1989 and noticable higher after 1989
for1<-global_economy %>%
filter(Country=="Afghanistan")%>%
model(TSLM(Population ~ Year))
for2<-global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year>1989)%>%
model(TSLM(Population ~ Year))
Question 6. A. 1. The linear and error terms have linearity - this means that all terms must be constants and made up of the multiplication of an independant variable and a parameter 2. The error term’s population mean in 0. - This is necessary because otherwise your model is not a true prediction of the population 3. There is no correlation between the independant variables and the error terms - This can be caused by variables with measurement erros or biases variables
Each observation of ther error term is independant of others(serial correlation) - with time series data, if past data has an impact on future observations it will create this problem
The error terms variance is constant(homoscedacity)- if the spread of the residuals gets larger in one direction then the model no longer meets this expectation.
There are no independent variables that are perfect linear functions of other variables(no perfect multicolinearity) - basically when two independant variables are capturing the entirity of the other in some way
The error term adheres to a normal distribution pattern - this assumption is actually optional, but necessary for producing reliable things like prediction interval and confience intervals.