Problem 1

For each of the following series (from the fma package), make a graph of the data. If transforming seems appropriate, do so and describe the effect.

#1a

Monthly total of people on unemployed benefits in Australia (January 1956–July 1992).

library("fma")
## Loading required package: forecast
head(dole,20)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 1956  4742  6128  6494  5379  6011  7003  9164 10333  9614  9545 10096
## 1957 15711 13135 13077 15453 15995 18071 20291 20175                  
##        Dec
## 1956 13277
## 1957
#First, a general plot
plot(dole,
     main="Unemployed benefits in Australia",
     xlab="Year", ylab="People")

#Seasonal plot: 
#no obvious seasonal patterns, other than some decline throughout the year with a correction in December
seasonplot(dole, ylab="People", xlab="Month",
           main="Season plot: Unemployed benefits in Australia",
           year.labels=TRUE, year.labels.left=TRUE, col=1:20, pch=19)
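
A seasonal subseries (month) plot gives another view of any within-year pattern; a minimal sketch using the base monthplot() function:

#Month plot: one panel per calendar month, showing its values across the years
monthplot(dole, ylab="People", xlab="Month",
          main="Month plot: Unemployed benefits in Australia")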

#Adjust for the differing number of days in each month
monthdays <- rep(c(31,28,31,30,31,30,31,31,30,31,30,31), 37)[1:length(dole)]
monthdays[2 + (4*12)*(0:9)] <- 29   #leap-year Februaries (1956, 1960, ..., 1992)
par(mfrow=c(2,1))
plot(dole, main="Monthly total of people on unemployed benefits in Australia",
     ylab="People", xlab="Years")
plot(dole/monthdays, main="Average number of people on unemployed benefits in Australia per day",
     ylab="People", xlab="Years")
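
If the installed version of the forecast package provides monthdays(), the days-in-month adjustment (including leap years) can be derived directly from the ts object; a minimal sketch:

#monthdays() returns the number of days in each month of the series
plot(dole/forecast::monthdays(dole),
     main="Average number of people on unemployed benefits in Australia per day",
     ylab="People per day", xlab="Years")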

#Correct for population growth (data from http://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/3105.0.65.0012014?OpenDocument)
yearpop <- c(4828846, 4930160, 5026095, 5132363, 5253073, 5374304, 5470040, 5571613, 5682962, 5793629, 5890642, 5992280, 6108185, 6238338, 6364877, 6632838, 6735679, 6835488, 6941940, 7002232, 7065779, 7145407, 7213574, 7293341, 7391427, 7514339, 7633179, 7730415, 7826368, 7940033, 8057320, 8181746, 8325068, 8446656, 8560533, 8659623, 8744793)

yearpopmon <- rep(yearpop, each=12)
mondiff <- length(yearpopmon) - length(dole)
yearpopmon <- yearpopmon[1:(length(yearpopmon)-mondiff)]

par(mfrow=c(2,1))
plot(dole, main="Monthly total of people on unemployed benefits in Australia",
     ylab="People", xlab="Years")
plot(dole/yearpopmon, main="Proportion of the population on unemployed benefits in Australia",
     ylab="Proportion", xlab="Years")

#1b

Monthly total of accidental deaths in the United States (January 1973–December 1978).

usdeaths
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 1973  9007  8106  8928  9137 10017 10826 11317 10744  9713  9938  9161
## 1974  7750  6981  8038  8422  8714  9512 10120  9823  8743  9129  8710
## 1975  8162  7306  8124  7870  9387  9556 10093  9620  8285  8433  8160
## 1976  7717  7461  7776  7925  8634  8945 10078  9179  8037  8488  7874
## 1977  7792  6957  7726  8106  8890  9299 10625  9302  8314  8850  8265
## 1978  7836  6892  7791  8129  9115  9434 10484  9827  9110  9070  8633
##        Dec
## 1973  8927
## 1974  8680
## 1975  8034
## 1976  8647
## 1977  8796
## 1978  9240
plot(usdeaths)

#Seasonal plot: 
#more accidental deaths around the summer months, peaking in July
seasonplot(usdeaths, ylab="People", xlab="Month",
           main="Season plot: Accidental deaths in the United States",
           year.labels=TRUE, year.labels.left=TRUE, col=1:20, pch=19)
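
The monthly distributions can also be compared with a boxplot by calendar month; a minimal sketch:

#Boxplot of accidental deaths grouped by calendar month
boxplot(split(usdeaths, cycle(usdeaths)), names=month.abb,
        ylab="People", xlab="Month",
        main="Accidental deaths in the United States by month")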

#Adjust for the number of days in each month (the effect is small)
monthdays <- rep(c(31,28,31,30,31,30,31,31,30,31,30,31),6)
monthdays[3*12 + 2] <- 29   #Feb 1976 (leap year)
par(mfrow=c(2,1))
plot(usdeaths/monthdays, main="Average accidental deaths per day",
     ylab="People", xlab="Years")
plot(usdeaths, main="Monthly total of accidental deaths",
     ylab="People", xlab="Years")

#1c

Quarterly production of bricks (in millions of units) at Portland, Australia (March 1956–September 1994).

par(mfrow=c(3,1))
plot(bricksq, ylab="Bricks (Millions)", xlab="Year", main="Quarterly production of bricks at Portland, Australia")
#Log transformation
plot(log(bricksq), ylab="Transformed brick production", xlab="Year", main="Transformed quarterly brick production")
title(main="Log",line=-1)
#Box-Cox transformation with automatically selected lambda
lambda <- BoxCox.lambda(bricksq)
plot(BoxCox(bricksq,lambda), ylab="Transformed brick production", xlab="Year")
title(main="BoxCox",line=-1)

Problem 2

Consider the daily closing IBM stock prices (data set ibmclose). Produce some plots of the data in order to become familiar with it.

par(mfrow=c(2,1))
plot(ibmclose, main="Closing IBM stock prices", ylab="IBM stock price", xlab="Day")
#Box-Cox transformation
lambda <- BoxCox.lambda(ibmclose)
plot(BoxCox(ibmclose,lambda), main="Box-Cox transformed IBM stock prices",
     ylab="Transformed price", xlab="Day")

Split the data into a training set of 300 observations and a test set of 69 observations.

#A purely random split (shown for comparison only; random sampling breaks the
#temporal ordering, so it is not an appropriate train/test split for time series)
set.seed(101) 
sample <- sample.int(n = 369, size = 300, replace = FALSE)
fulllist <- 1:369
rsample <- fulllist[!(fulllist %in% sample)]
train <- ibmclose[sort(sample)]
test <- ibmclose[sort(rsample)]


#Sequential split: first 300 observations for training, last 69 for testing
train1 <- ibmclose[1:300]
test1 <- ibmclose[301:369]

Try various benchmark methods to forecast the training set and

ibm2 <- window(ibmclose, start=1, end=300)

ibmfit1 <- meanf(ibm2, h=69)
ibmfit2 <- rwf(ibm2, h=69)
ibmfit3 <- snaive(ibm2, h=69)

plot(ibmfit1)
lines(ibmfit2$mean,col=2)
lines(ibmfit3$mean,col=3)
lines(ibmclose)

#The naive and seasonal naive forecasts coincide here: ibmclose has no seasonal period, so snaive reduces to the naive method
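
For completeness, a drift forecast can be added with rwf(..., drift=TRUE); a minimal sketch (ibmfit4 is a name introduced here):

#Drift method: naive forecast plus the average historical change in the training data
ibmfit4 <- rwf(ibm2, drift=TRUE, h=69)
lines(ibmfit4$mean, col=4)   #adds the drift forecast to the benchmark plot above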

compare the results on the test set. Which method did best?

ibm3 <- window(ibmclose, start=301)
accuracy(ibmfit1, ibm3)
##                         ME      RMSE       MAE        MPE     MAPE
## Training set  1.660438e-14  73.61532  58.72231  -2.642058 13.03019
## Test set     -1.306180e+02 132.12557 130.61797 -35.478819 35.47882
##                  MASE      ACF1 Theil's U
## Training set 11.52098 0.9895779        NA
## Test set     25.62649 0.9314689  19.05515
accuracy(ibmfit2, ibm3)
##                      ME      RMSE      MAE         MPE     MAPE     MASE
## Training set -0.2809365  7.302815  5.09699 -0.08262872 1.115844 1.000000
## Test set     -3.7246377 20.248099 17.02899 -1.29391743 4.668186 3.340989
##                   ACF1 Theil's U
## Training set 0.1351052        NA
## Test set     0.9314689  2.973486
accuracy(ibmfit3, ibm3)
##                      ME      RMSE      MAE         MPE     MAPE     MASE
## Training set -0.2809365  7.302815  5.09699 -0.08262872 1.115844 1.000000
## Test set     -3.7246377 20.248099 17.02899 -1.29391743 4.668186 3.340989
##                   ACF1 Theil's U
## Training set 0.1351052        NA
## Test set     0.9314689  2.973486
#The naive (and, here, identical seasonal naive) forecasts perform far better than the mean forecast
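
A compact comparison collects the test-set RMSE of each benchmark (a sketch; ibmfit4 is the drift fit from the sketch above):

#Test-set RMSE for each benchmark method (lower is better)
c(mean   = accuracy(ibmfit1, ibm3)["Test set", "RMSE"],
  naive  = accuracy(ibmfit2, ibm3)["Test set", "RMSE"],
  snaive = accuracy(ibmfit3, ibm3)["Test set", "RMSE"],
  drift  = accuracy(ibmfit4, ibm3)["Test set", "RMSE"])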