1– Plot the winning time against the year.
library(fma)
library(ggplot2)
library(fpp2)
autoplot(mens400)
Features: (1) there are missing values; (2) the slope is negative, i.e. the winning times decrease over the years.
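A quick check of the missing values (a minimal sketch using base R):
# Count and locate the missing winning times
sum(is.na(mens400))
which(is.na(mens400))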
– Fit a regression line
time_400 <- time(mens400)
# mens400 is a ts, so no data argument is needed; tslm() finds the variables itself
tslm_400 <- tslm(mens400 ~ time_400)
autoplot(mens400) +
  geom_abline(slope = tslm_400$coefficients[2],
              intercept = tslm_400$coefficients[1],
              colour = "blue")
tslm_400$coefficients[2]
## time_400
## -0.06457385
The winning time decreases at an average rate of about 0.065 seconds per year (slope = -0.06457385).
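Since the Games are held every four years, the slope translates into roughly a quarter-second drop per Olympiad (a quick check using the fitted object):
# Expected change over one four-year Olympiad: about -0.258 seconds
tslm_400$coefficients[2] * 4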
– Plot the residuals
cbind(Time = time_400,
      Residuals = tslm_400$residuals) %>%
  as.data.frame() %>%
  ggplot(aes(x = Time, y = Residuals)) +
  geom_point() +
  ylab("Residuals of Regression Line (Unit: s)")
## Warning: Removed 3 rows containing missing values (geom_point).
checkresiduals(tslm_400)
##
## Breusch-Godfrey test for serial correlation of order up to 6
##
## data: Residuals from Linear regression model
## LM test = 3.6082, df = 6, p-value = 0.7295
The residual plot and the checkresiduals() output show that the regression model fits the data well: the Breusch-Godfrey test (p-value = 0.7295) gives no evidence of serial correlation.
– Predict the winning time. I used a linear model (lm) to obtain the prediction intervals, with na.action = na.exclude to drop the missing values. This assumes that the missing values would not have a large effect on the prediction.
# lm() rather than tslm(), so that na.action = na.exclude can drop the missing values
lm_400 <- lm(mens400 ~ time_400, na.action = na.exclude)
fc_400 <- forecast(lm_400,
                   newdata = data.frame(time_400 = 2020))
autoplot(mens400) +
  autolayer(fc_400, PI = TRUE)
fc_400$upper
## [,1] [,2]
## [1,] 43.63487 44.53176
fc_400$lower
## [,1] [,2]
## [1,] 40.44975 39.55286
The 80% prediction interval is 40.45 to 43.63 seconds; the 95% interval is 39.55 to 44.53 seconds.
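For reference, the point forecast is the midpoint of these intervals:
# Point forecast for 2020: (40.44975 + 43.63487) / 2 ≈ 42.04 seconds
fc_400$mean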
2–
daily20 <- head(elecdaily, 20)
autoplot(daily20)
tslm_Temp <- tslm(Demand ~ Temperature, data = daily20)
tslm_Temp
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Coefficients:
## (Intercept) Temperature
## 39.212 6.757
The positive relationship is likely due to air conditioning: as the temperature rises, more people run air conditioners, which increases electricity demand.
# Pass the model itself so checkresiduals() can find the correct degrees of freedom
checkresiduals(tslm_Temp)
This model seems adequate because the residuals are not significantly autocorrelated, although there is one outlier.
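To locate the outlier (a minimal sketch), look for the largest absolute residual:
# Index of the observation with the largest absolute residual
which.max(abs(residuals(tslm_Temp)))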
fc_Temp <- forecast(tslm_Temp,
                    newdata = data.frame(Temperature = c(15, 35)))
fc_Temp
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 3.857143 140.5701 108.6810 172.4591 90.21166 190.9285
## 4.000000 275.7146 245.2278 306.2014 227.57056 323.8586
I believe this forecast because the predictions fall within the normal range of the observed data.
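As a sanity check on the two temperatures themselves (a sketch using the daily20 object above), compare them with the observed range:
# Temperature range actually observed in the 20-day sample
range(daily20[, "Temperature"])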
## 80% intervals
fc_Temp$upper[, 1]
## Time Series:
## Start = c(3, 7)
## End = c(4, 1)
## Frequency = 7
## [1] 172.4591 306.2014
fc_Temp$lower[, 1]
## Time Series:
## Start = c(3, 7)
## End = c(4, 1)
## Frequency = 7
## [1] 108.6810 245.2278
## 95% intervals
fc_Temp$upper[, 2]
## Time Series:
## Start = c(3, 7)
## End = c(4, 1)
## Frequency = 7
## [1] 190.9285 323.8586
fc_Temp$lower[, 2]
## Time Series:
## Start = c(3, 7)
## End = c(4, 1)
## Frequency = 7
## [1] 90.21166 227.57056
elecdaily %>%
  as.data.frame() %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  ylab("Electricity Demand") +
  xlab("Temperature") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
I think this is not the right model: it was fitted to only 20 records, and the scatterplot of the full data shows a pattern that a single straight line cannot capture.
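One way to see this (a sketch, refitting on the full elecdaily series rather than the 20-day subset):
# Refit on the full year of data to compare the coefficients with the 20-day fit
tslm(Demand ~ Temperature, data = elecdaily)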
3–
autoplot(huron)
str(huron)
## Time-Series [1:98] from 1875 to 1972: 10.38 11.86 10.97 10.8 9.79 ...
head(huron)
## Time Series:
## Start = 1875
## End = 1880
## Frequency = 1
## [1] 10.38 11.86 10.97 10.80 9.79 10.39
h <- 8
tslm_huron <- tslm(huron ~ trend)
fc_tslm_huron <- forecast(tslm_huron, h=h)
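fc_tslm_huron holds the 8-year forecasts; to visualise them together with the data (a minimal sketch using the objects above):
# Plot the series with the forecasts to 1980 and their prediction intervals
autoplot(fc_tslm_huron)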
4–
autoplot(fancy)
head(fancy, 50)
## Jan Feb Mar Apr May Jun Jul
## 1987 1664.81 2397.53 2840.71 3547.29 3752.96 3714.74 4349.61
## 1988 2499.81 5198.24 7225.14 4806.03 5900.88 4951.34 6179.12
## 1989 4717.02 5702.63 9957.58 5304.78 6492.43 6630.80 7349.62
## 1990 5921.10 5814.58 12421.25 6369.77 7609.12 7224.75 8121.22
## 1991 4826.64 6470.23
## Aug Sep Oct Nov Dec
## 1987 3566.34 5021.82 6423.48 7600.60 19756.21
## 1988 4752.15 5496.43 5835.10 12600.08 28541.72
## 1989 8176.62 8573.17 9690.50 15151.84 34061.01
## 1990 7979.25 8093.06 8476.70 17914.66 30114.41
## 1991
Sales increase in December and in March, but the March increase is smaller than the one in December. It is necessary to take logarithms: for the model to fit well, the size of the seasonal variations should be roughly the same across the whole series, whereas the fancy data show seasonal variations that grow roughly exponentially with the level of the series. Taking logarithms of the data stabilises these variations.
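The effect of the transformation can be seen by plotting the raw and log-transformed series side by side (a minimal sketch; LogSales is an illustrative name):
# On the log scale the seasonal swings are roughly constant in size
autoplot(cbind(Sales = fancy, LogSales = log(fancy)), facets = TRUE)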
# Dummy variable for the surfing festival, held every March from 1988 onwards
Time <- time(fancy)
surfing_festival <- c()
for (i in 1:length(Time)) {
  # Recover the month (1-12) from the fractional part of the year
  month <- round(12 * (Time[i] - floor(Time[i]))) + 1
  year <- floor(Time[i])
  if (year >= 1988 & month == 3) {
    surfing_festival[i] <- 1
  } else {
    surfing_festival[i] <- 0
  }
}
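For reference, the same dummy can be built without a loop (a sketch using base R's cycle() and time(); surfing_festival2 is an illustrative name):
# 1 for every March from 1988 onwards, 0 otherwise
surfing_festival2 <- as.numeric(cycle(fancy) == 3 & floor(time(fancy)) >= 1988)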
tslm_log_fancy <- tslm(
  BoxCox(fancy, 0) ~ trend + season + surfing_festival  # lambda = 0 is the log transform
)
autoplot(tslm_log_fancy$residuals)
There is correlation between the residuals and time, so the model has not captured all of the information in the data.
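This can be confirmed formally with the same diagnostics used earlier (a sketch):
# ACF plot and Breusch-Godfrey test on the residuals of the log model
checkresiduals(tslm_log_fancy)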