#Loading necessary libraries:
library(forecast)
library(tidytext)
library(stringr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(pander)
library(ggfortify)
library(rgdal)

Setting the working directory

setwd("C:/Users/Marissa.Valente/Documents/RStuff/MBA_678")

Chapter 2: Time Series Data

For this assignment I will be completing the assigned problems from Chapter 2 in the Practical Time Series Forecasting with R textbook by, Galit Schmueli and Kenneth C. Lichtendahl Jr.

Problem 1: Impact of September 11 on Air Travel in the United States

First I will import the September 11 Travel csv file used for this problem and look at the \(head()\) and \(glimpse()\).

Sep_11 <- read.csv("Sept11Travel.csv")
str(Sep_11)
## 'data.frame':    172 obs. of  4 variables:
##  $ Month: Factor w/ 172 levels "1-Apr","1-Aug",..: 86 75 119 42 130 108 97 53 163 152 ...
##  $ Air  : int  35153577 32965187 39993913 37981886 38419672 42819023 45770315 48763670 38173223 39051877 ...
##  $ Rail : int  454115779 435086002 568289732 568101697 539628385 570694457 618571581 609210368 488444939 514253920 ...
##  $ VMT  : num  163 153 178 179 189 ...
head(Sep_11)
##    Month      Air      Rail    VMT
## 1 Jan-90 35153577 454115779 163.28
## 2 Feb-90 32965187 435086002 153.25
## 3 Mar-90 39993913 568289732 178.42
## 4 Apr-90 37981886 568101697 178.68
## 5 May-90 38419672 539628385 188.88
## 6 Jun-90 42819023 570694457 189.16
glimpse(Sep_11)
## Observations: 172
## Variables: 4
## $ Month <fctr> Jan-90, Feb-90, Mar-90, Apr-90, May-90, Jun-90, Jul-90,...
## $ Air   <int> 35153577, 32965187, 39993913, 37981886, 38419672, 428190...
## $ Rail  <int> 454115779, 435086002, 568289732, 568101697, 539628385, 5...
## $ VMT   <dbl> 163.28, 153.25, 178.42, 178.68, 188.88, 189.16, 195.09, ...

Variables
Air - Actual airline revenue passenger miles
Rail - Rail Passenger Miles
VMT - Vehicle Miles Traveled (Auto)

After looking at the minimum and maximum values for the Air Passenger Miles, Rail Passenger Miles, and VMT monthly values, I converted the values to millions for Air Passgener Miles and Rail Passenger Miles to make it easier to visualize. (See Appendix for calculations)

Air2 <- Sep_11$Air/1000000
Rail2 <- Sep_11$Rail/1000000
VMT <- Sep_11$VMT

Plot each of the three pre-event time series (Air, Rail, Car)

Next I will create time series objects and plot the three pre-event time series.

Figure 1. Air Passenger Miles (in millions) Prior to 9/11 by Date (monthly)

Air_PreSep_11.ts <- ts(Air2, start=c(1990,1), end=c(2001,8), freq=12)
plot(Air_PreSep_11.ts, xlab="Date", ylab="Air Passenger Miles (in millions)", ylim=c(25,70), bty="l")

Figure 2. Rail Passenger Miles (in millions) Prior to 9/11 by Date (monthly)

Rail_PreSep_11.ts <- ts(Rail2, start=c(1990,1), end=c(2001,8), freq=12)
plot(Rail_PreSep_11.ts, xlab="Date", ylab="Rail Passenger Miles (in millions)", ylim=c(300,700), bty="l")

Figure 3. Vehicle Miles Traveled (VMT) Prior to 9/11 by Date (monthly)

VMT_PreSep_11.ts <- ts(VMT, start=c(1990,1), end=c(2001,8), freq=12)
plot(VMT_PreSep_11.ts, xlab="Date", ylab="Vehicle Miles Traveled", ylim=c(140,300), bty="l")

(a) What time series components appear from the plot?

Figures 1, 2, and 3, are the plots for each of the pre event time series (Air, Rail, and VMT), the time series components that appear include level, trend, seasonality, and noise.

Figures 1 and 3 appear to have an upward linear trend with additive seasonality. While Figure 2 appears to be a downward, nonlinear, quadratic-like trend.

Through further analysis we will be able to determine if seasonality and trend truly exist for each one of the three time series.

(b) What type of trend appears? Change the scale of the series, add trend lines, and suppress seasonality to better visualize the trend pattern

Changing the Scale of the Three Series

Figure 4. Air Passenger Miles (in millions, log scale) Prior to 9/11 by Date (monthly)

plot(Air_PreSep_11.ts, log="y", xlab="Date", ylab="Air Passenger  Miles(in millions, log scale)", ylim=c(25,70), bty="l")

Figure 5. Air Passenger Miles (in millions, log scale) Prior to 9/11 by Date (monthly, log scale)

plot(Air_PreSep_11.ts, log="xy", xlab="Date (log scale)", ylab="Air Passenger Miles (in millions, log scale)", ylim=c(25,70), bty="l")

Figure 6. Rail Passenger Miles (in millions, log scale) Prior to 9/11 by Date (monthly)

plot(Rail_PreSep_11.ts, log="y", xlab="Date", ylab="Rail Passenger Miles (in millions, log scale)", ylim=c(300,700), bty="l")

Figure 7. Rail Passenger Miles (in millions, log scale) Prior to 9/11 by Date (monthly, log scale)

plot(Rail_PreSep_11.ts, log="xy", xlab="Date (log scale)", ylab="Rail Passenger Miles (in millions, log scale)", ylim=c(300,700), bty="l")

Figure 8. Vehicle Miles Traveled (VMT) (log scale) Prior to 9/11 by Date (monthly)

plot(VMT_PreSep_11.ts, log="y", xlab="Date", ylab="VMT (log scale)", ylim=c(140,300), bty="l")

Figure 9. Vehicle Miles Traveled (VMT) (log scale) Prior to 9/11 by Date (monthly, log scale)

plot(VMT_PreSep_11.ts, log="xy", xlab="Date (log scale)", ylab="Vehicle Miles Traveled (log scale)", ylim=c(140,300), bty="l")

None of the log scales for any of the three times series objects seems to be more linear. Next we will look at trend lines to determine if/where trends exist in these times series.

Adding Trend Lines

Air_Linear <- tslm(Air_PreSep_11.ts ~ trend)
summary(Air_Linear)
## 
## Call:
## tslm(formula = Air_PreSep_11.ts ~ trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.4664 -3.4106 -0.6812  3.3608 11.8235 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 35.72843    0.83475   42.80   <2e-16 ***
## trend        0.17710    0.01027   17.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.912 on 138 degrees of freedom
## Multiple R-squared:  0.6829, Adjusted R-squared:  0.6806 
## F-statistic: 297.2 on 1 and 138 DF,  p-value: < 2.2e-16

Figure 10. Air Passenger Miles (in millions) Prior to 9/11 by Date (monthly) with Linear and Quadratic Trend Lines

plot(Air_PreSep_11.ts, xlab="Date", ylab="Air Passenger Miles (in millions)", ylim=c(25,70), bty="l")
lines(Air_Linear$fitted, lwd=2)

Air_Quad <- tslm(Air_PreSep_11.ts ~ trend + I(trend^2))
summary(Air_Quad)
## 
## Call:
## tslm(formula = Air_PreSep_11.ts ~ trend + I(trend^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7066  -3.3855  -0.4943   3.3340  11.9016 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.745e+01  1.253e+00  29.897   <2e-16 ***
## trend       1.042e-01  4.102e-02   2.541   0.0122 *  
## I(trend^2)  5.169e-04  2.818e-04   1.834   0.0688 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.87 on 137 degrees of freedom
## Multiple R-squared:  0.6905, Adjusted R-squared:  0.686 
## F-statistic: 152.8 on 2 and 137 DF,  p-value: < 2.2e-16
lines(Air_Quad$fitted, lty=2, lwd=3)
title("Air Passenger Miles (in millions) 
  Linear and Quadratic Trend Lines")

Looking at the adjusted \(R^2\) value for the Air Passenger Miles prior to 9/11 the Linear trend line can explain 68.1% of the variance, while the Quadratic trend line can explain 68.6% of the variance. Therefore, the quadratic trend line is more accurate, although not by much.

Rail_Linear <- tslm(Rail_PreSep_11.ts ~ trend)
summary(Rail_Linear)
## 
## Call:
## tslm(formula = Rail_PreSep_11.ts ~ trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -142.23  -40.75   -4.37   41.27  133.14 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 547.6329    10.5118  52.097  < 2e-16 ***
## trend        -0.8377     0.1294  -6.476 1.53e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.86 on 138 degrees of freedom
## Multiple R-squared:  0.2331, Adjusted R-squared:  0.2275 
## F-statistic: 41.94 on 1 and 138 DF,  p-value: 1.532e-09

Figure 11. Rail Passenger Miles (in millions) Prior to 9/11 by Date (monthly) with Linear and Quadratic Trend Lines

plot(Rail_PreSep_11.ts, xlab="Date", ylab="Rail Passenger Miles (in millions)", ylim=c(300,700), bty="l")
lines(Rail_Linear$fitted, lwd=2)

Rail_Quad <- tslm(Rail_PreSep_11.ts ~ trend + I(trend^2))
summary(Rail_Quad)
## 
## Call:
## tslm(formula = Rail_PreSep_11.ts ~ trend + I(trend^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -137.635  -37.328   -1.559   41.652  125.113 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 576.828666  15.618390   36.93  < 2e-16 ***
## trend        -2.071369   0.511394   -4.05 8.52e-05 ***
## I(trend^2)    0.008749   0.003513    2.49    0.014 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.72 on 137 degrees of freedom
## Multiple R-squared:  0.2663, Adjusted R-squared:  0.2556 
## F-statistic: 24.86 on 2 and 137 DF,  p-value: 6.14e-10
lines(Rail_Quad$fitted, lty=2, lwd=3)
title("Rail Passenger Miles (in millions) 
  Linear and Quadratic Trend Lines")

Looking at the adjusted \(R^2\) value for the Rail Passenger Miles prior to 9/11 the Linear trend line can explain 22.8% of the variance, while the Quadratic trend line can explain 25.6% of the variance. Therefore, the quadratic trend line is more accurate than the linear trend line.

VMT_Linear <- tslm(VMT_PreSep_11.ts ~ trend)
summary(VMT_Linear)
## 
## Call:
## tslm(formula = VMT_PreSep_11.ts ~ trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.554  -9.185   0.559  11.087  23.272 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 173.17137    2.42094   71.53   <2e-16 ***
## trend         0.44249    0.02979   14.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.25 on 138 degrees of freedom
## Multiple R-squared:  0.6152, Adjusted R-squared:  0.6124 
## F-statistic: 220.6 on 1 and 138 DF,  p-value: < 2.2e-16

Figure 12. Vehicle Miles Traveled (VMT) Prior to 9/11 by Date (monthly) with Linear and Quadratic Trend Lines

plot(VMT_PreSep_11.ts, xlab="Date", ylab="Vehicle Miles Traveled", ylim=c(140,300), bty="l")
lines(VMT_Linear$fitted, lwd=2)

VMT_Quad <- tslm(VMT_PreSep_11.ts ~ trend + I(trend^2))
summary(VMT_Quad)
## 
## Call:
## tslm(formula = VMT_PreSep_11.ts ~ trend + I(trend^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.503  -8.745   0.798  10.974  23.752 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.745e+02  3.674e+00  47.487  < 2e-16 ***
## trend       3.867e-01  1.203e-01   3.214  0.00163 ** 
## I(trend^2)  3.956e-04  8.266e-04   0.479  0.63297    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.29 on 137 degrees of freedom
## Multiple R-squared:  0.6158, Adjusted R-squared:  0.6102 
## F-statistic: 109.8 on 2 and 137 DF,  p-value: < 2.2e-16
lines(VMT_Quad$fitted, lty=2, lwd=3)
title("VMT Linear and Quadratic Trend Lines")

Looking at the adjusted \(R^2\) value for the VMT prior to 9/11 the Linear trend line can explain 61.2% of the variance, while the Quadratic trend line can explain 61.0% of the variance. Therefore, the linear trend line is more accurate, although not by much.

It seems as though all three of the time series objects can be explained by either linear or quadratic trend lines, although the Rail Passenger Miles has a very low adjusted \(R^2\) indicating that the model explains less of the variability of the response data than the other two time series objects.

However, trend still appears in all three of the time series.

Suppressing seasonality

As this chapter suggests, aggregating the time series can help us surpress seasonality, this may allow us to see more trends or seasonality in the three time series objects. I will be aggregating the monthly data for all three time series into years and then I will also create separate plots for each series.

Figure 13. Air Passenger Miles (in millions) Prior to 9/11 by Date (annually)

#aggregate by year

annually_Air <- aggregate(Air_PreSep_11.ts, nfrequency=1, FUN=sum)
plot(annually_Air, bty="l")

Figure 14. Rail Passenger Miles (in millions) Prior to 9/11 by Date (annually)

#aggregate by year

annually_Rail <- aggregate(Rail_PreSep_11.ts, nfrequency=1, FUN=sum)
plot(annually_Rail, bty="l")

Figure 15. Vehicle Miles Traveled (VMT) Prior to 9/11 by Date (annually)

#aggregate by year

annually_VMT <- aggregate(VMT_PreSep_11.ts, nfrequency=1, FUN=sum)
plot(annually_VMT, bty="l")

After looking at Figure 13, 14, and 15 it seems that Rail Passenger Miles were declining from 1991 until 1996, while Air Passenger Miles and Vehicle Miles Traveled (VMT) were increasing steadily during the same period of time.

Now we will look at Air Passenger Miles, Rail Passenger Miles, and VMT by month for the years prior to the 9/11 attack.

Figure 16. Air Passenger Miles (in millions) Prior to 9/11 Broken Out by Month

par(oma = c(0, 0, 0, 2))
Air_xrange <- c(1,14)
Air_yrange <- range(Air_PreSep_11.ts)

plot(Air_xrange, Air_yrange, type="n", xlab="Year", ylab="Air Passenger Miles (in millions)", bty="l")

colors <- rainbow(12)
linetype <- c(1:12)
plotchar <- c(1:12)

for (i in 1:12) {
  currentMonth <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==i)
  lines(currentMonth, type="b", lwd=1.5,
        lty=linetype[i], col=colors[i], pch=plotchar[i])
  
}

title("Air Passenger Miles (in millions) by Month")

legend("right", legend = 1:12, cex = 0.8, col = colors, pch = plotchar, lty = linetype, title = "Month", xpd=NA)

Figure 17. Rail Passenger Miles (in millions) Prior to 9/11 Broken Out by Month

par(oma = c(0, 0, 0, 2))
Rail_xrange <- c(1,14)
Rail_yrange <- range(Rail_PreSep_11.ts)

plot(Rail_xrange, Rail_yrange, type="n", xlab="Year", ylab="Rail Passenger Miles (in millions)", bty="l")

colors <- rainbow(12)
linetype <- c(1:12)
plotchar <- c(1:12)

for (i in 1:12) {
  currentMonth <- subset(Rail_PreSep_11.ts, cycle(Rail_PreSep_11.ts)==i)
  lines(currentMonth, type="b", lwd=1.5,
        lty=linetype[i], col=colors[i], pch=plotchar[i])
  
}

title("Rail Passenger Miles (in millions) by Month")

legend("right", legend = 1:12, cex = 0.8, col = colors, pch = plotchar, lty = linetype, title = "Month", xpd=NA)

Figure 18. VMT Prior to 9/11 Broken Out by Month

par(oma = c(0, 0, 0, 2))
VMT_xrange <- c(1,14)
VMT_yrange <- range(VMT_PreSep_11.ts)

plot(VMT_xrange, VMT_yrange, type="n", xlab="Year", ylab="VMT", bty="l")

colors <- rainbow(12)
linetype <- c(1:12)
plotchar <- c(1:12)

for (i in 1:12) {
  currentMonth <- subset(VMT_PreSep_11.ts, cycle(VMT_PreSep_11.ts)==i)
  lines(currentMonth, type="b", lwd=1.5,
        lty=linetype[i], col=colors[i], pch=plotchar[i])
  
}

title("Vehicle Miles Traveled by Month")

legend("right", legend = 1:12, cex = 0.8, col = colors, pch = plotchar, lty = linetype, title = "Month", xpd=NA)

After looking at Figure 16, 17, and 18 it becomes clear there is seasonality in the three time series data. Even though the data itself changes, the trend shows that the highest miles, whether it be Air Passenger Miles, Rail Passenger Miles, or Vehicle Miles Traveled (VMT), occurred in August over the years from 1990 to 2001 and the lowest month for miles was February over that same time period. Furthermore, in general, the summer months seem to be higher for travel across transportation type, while January and February are lower. See Appendix - Figures J, K, and L for more seasonal graphs.


Problem 3: Shipments of Household Appliances

For problem 3, I will import the Appliance Shipments csv file which contains a series of quarterly shipments (in millions of USD) of U.S. household appliances between 1985 and 1989. After importing I will look at the \(head()\) and \(tail()\) of the dataset.

#Importing the Appliance Shipments
App_Ship <- read.csv("ApplianceShipments.csv")
str(App_Ship)
## 'data.frame':    20 obs. of  2 variables:
##  $ Quarter  : Factor w/ 20 levels "Q1-1985","Q1-1986",..: 1 6 11 16 2 7 12 17 3 8 ...
##  $ Shipments: int  4009 4321 4224 3944 4123 4522 4657 4030 4493 4806 ...
head(App_Ship)
##   Quarter Shipments
## 1 Q1-1985      4009
## 2 Q2-1985      4321
## 3 Q3-1985      4224
## 4 Q4-1985      3944
## 5 Q1-1986      4123
## 6 Q2-1986      4522
tail(App_Ship)
##    Quarter Shipments
## 15 Q3-1988      4417
## 16 Q4-1988      4258
## 17 Q1-1989      4245
## 18 Q2-1989      4900
## 19 Q3-1989      4585
## 20 Q4-1989      4533

Variables
Quarter - Shows the quarter and year of the shipment
Shipments - Shows household appliances shipments (in millions of USD) occurring in the particular time period (Quarterly)

(a) Create a well-formatted time plot of the data

First I will create a time series object from the appliance shipments data. Then I will show the time plot on a quarterly basis, and then an annual basis to see if a trend or seasonality becomes obvious.

Figure 19. U.S. Household Appliance Shipments (in millions of USD) by Quarter

App_Ship.ts <- ts(App_Ship$Shipments, start=c(1985,1), end=c(1989,4), freq=4)
plot(App_Ship.ts, xlab="Year", ylab="Appliances Shipped (in millions of USD, $)", ylim=c(3100,5000), bty="l")

Figure 20. U.S. Household Appliance Shipments (in millions of USD) Annually

App_Ship_annually <- aggregate(App_Ship.ts, nfrequency = 1, FUN=sum)
plot(App_Ship_annually, bty="l")

It is difficult to tell from Figure 19 and Figure 20 if there is a trend or seasonality in the data set. Next I will look at linear and quadratic trend lines against the quarterly data and then look at the data by quarters.

(b) Which of the four components (level, trend, seasonality, noise) seem to be present in this series?

Like all time series data, both noise and level are present in this series. Additionally, the analysis below shows us that some seasonality exists within this dataset and that there is a slight quadratic trend is likely.

Adding Trend Lines

App_Ship_Linear <- tslm(App_Ship.ts ~trend)
summary(App_Ship_Linear)
## 
## Call:
## tslm(formula = App_Ship.ts ~ trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -339.06 -171.14    3.13  137.27  393.40 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4167.663    111.035  37.535   <2e-16 ***
## trend         24.494      9.269   2.643   0.0165 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 239 on 18 degrees of freedom
## Multiple R-squared:  0.2795, Adjusted R-squared:  0.2395 
## F-statistic: 6.983 on 1 and 18 DF,  p-value: 0.01655

Figure 21. U.S. Household Appliance Shipments Linear and Quadratic Trend Lines

plot(App_Ship.ts, xlab="Date", ylab="Appliance Shipments (in millions of USD)", ylim=c(3100,5000),bty="l")
lines(App_Ship_Linear$fitted, lwd=2)

App_Ship_Quad <- tslm(App_Ship.ts ~trend + I(trend^2))
summary(App_Ship_Quad)
## 
## Call:
## tslm(formula = App_Ship.ts ~ trend + I(trend^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -398.39 -155.53   32.63  181.62  346.62 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3982.931    173.764  22.921 3.19e-14 ***
## trend         74.876     38.109   1.965    0.066 .  
## I(trend^2)    -2.399      1.763  -1.361    0.191    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 233.6 on 17 degrees of freedom
## Multiple R-squared:  0.3503, Adjusted R-squared:  0.2739 
## F-statistic: 4.583 on 2 and 17 DF,  p-value: 0.02559
lines(App_Ship_Quad$fitted, lty=2, lwd=3)
title("Appliance Shipments (in millions of USD) 
      Linear and Quadratic Trend Lines")

Looking at the adjusted \(R^2\) value of the Linear trend line for U.S. household appliances shipments (in millions of USD) 24% of the variance can be explained, while the Quadratic trend line can explain 27% of the variance. Therefore, the quadratic trend line is more accurate.

Figure 22. Appliance Shipments Broken Down by Quarter

par(oma = c(0, 0, 0, 2))

App_Ship_xrange <- c(1,6)
App_Ship_yrange <- range(App_Ship.ts)

plot(App_Ship_xrange, App_Ship_yrange, type="n", xlab="Year", ylab="Appliance Shipments (in millions of USD)", bty="l")

colors <- c(1:4)
linetype <- c(1:4)
plotchar <-c(1:4)

for (i in 1:4) {
  currentQuarter <- subset(App_Ship.ts, cycle(App_Ship.ts)==i)
  lines(currentQuarter, type="b", lwd=1.5,
        lty=linetype[i], col=colors[i], pch=plotchar[i])
}
title("Appliance Shipments Broken Down by Quarter")
legend("right", legend = 1:4, cex = 0.8, col = colors, pch = plotchar, lty = linetype, title = "Quarter", xpd=NA)

Figure 22 shows the breakout of appliance shipments by quarter, over the 5 years the data spans (Q1 1985 - Q4 1989). Although, there is some overlap, it does seem that Q2 had the highest appliance shipments (in millions of USD) across 1985 to 1989, which means seasonality does exist to some extent.

Figure 23. Appliance Shipments Broken Down by Quarter

ggseasonplot(App_Ship.ts)

Figure 23, allows us to further see that although the general seasonality of Q2 exists, it is not the case for 1986. We would likely need more data to see if this seasonal pattern continues.


Problem 6: Forecasting Shampoo Sales

For the last problem in this assignment, I will import the Shampoo Sales csv file which contains data on the monthly sales of a certain shampoo over a 3-year period. I will then look at the \(head()\) and \(tail()\)

Shampoo_Sales<- read.csv("ShampooSales.csv")
str(Shampoo_Sales)
## 'data.frame':    36 obs. of  2 variables:
##  $ Month        : Factor w/ 36 levels "Apr-95","Apr-96",..: 13 10 22 1 25 19 16 4 34 31 ...
##  $ Shampoo.Sales: num  266 146 183 119 180 ...
head(Shampoo_Sales)
##    Month Shampoo.Sales
## 1 Jan-95         266.0
## 2 Feb-95         145.9
## 3 Mar-95         183.1
## 4 Apr-95         119.3
## 5 May-95         180.3
## 6 Jun-95         168.5
tail(Shampoo_Sales)
##     Month Shampoo.Sales
## 31 Jul-97         575.5
## 32 Aug-97         407.6
## 33 Sep-97         682.0
## 34 Oct-97         475.3
## 35 Nov-97         581.3
## 36 Dec-97         646.9

Variables
Month - Shows the month and year of the sales data captured Sales - Monthly sales for a certain type of shampoo over a 3-year period

(a) Create a well-formatted time plot of the data

First I will create a time series object from the shampoo sales data. Then I will show the time plot on a monthly basis, and then an aggregate on a quarterly as well as, annual basis to see if a trend or seasonality becomes obvious.

Figure 24. Shampoo Sales by Month

Shampoo.ts <- ts(Shampoo_Sales$Shampoo.Sales, start=c(1995,1), end=c(1997,12), freq=12)
plot(Shampoo.ts, xlab="Year", ylab="Monthly Shampoo Sales", ylim=c(0,685), bty="l")

Figure 25. Shampoo Sales by Quarter

Quarterly_Shampoo <- aggregate(Shampoo.ts, nfrequency = 4, FUN=sum)
plot(Quarterly_Shampoo, bty="l")

Figure 26. Shampoo Sales by Year

Annual_Shampoo <- aggregate(Shampoo.ts, nfrequency = 1, FUN=sum)
plot(Annual_Shampoo, bty="l")

Figures 24, 25, and 26 allow us to see a general trend, this trend is an upward trend that could be either linear or quadratic, again it is hard to tell from just these figures. However, it does not appear to have seasonality.

Further analysis is conducted below to determine the type of trend, as well as, if seasonality does exist.

(b) Which of the four components (level, trend, seasonality, noice) seem to be present in this series?

Like all time series data, both noise and level are present in this series. There is a trend occurring in the dataset, although further analysis will be conducted to determine what type of trend. At this point it does not look like there is any seasonality in the dataset but further analysis will determine if that is the case.

Adding Trend Lines

Shampoo_Linear <- tslm(Shampoo.ts ~trend)
summary(Shampoo_Linear)
## 
## Call:
## tslm(formula = Shampoo.ts ~ trend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -108.74  -52.12  -16.13   43.48  194.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    89.14      26.72   3.336  0.00207 ** 
## trend          12.08       1.26   9.590 3.37e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.5 on 34 degrees of freedom
## Multiple R-squared:  0.7301, Adjusted R-squared:  0.7222 
## F-statistic: 91.97 on 1 and 34 DF,  p-value: 3.368e-11

Figure 27. Shampoo Sales Linear and Quadratic Trend Lines

plot(Shampoo.ts, xlab="Date", ylab="Shampoo Sales", ylim=c(0,685),bty="l")
lines(Shampoo_Linear$fitted, lwd=2)

Shampoo_Quad <- tslm(Shampoo.ts ~trend + I(trend^2))
summary(Shampoo_Quad)
## 
## Call:
## tslm(formula = Shampoo.ts ~ trend + I(trend^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -104.148  -42.075   -8.438   33.924  144.582 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 202.8789    33.3002   6.092 7.35e-07 ***
## trend        -5.8801     4.1501  -1.417    0.166    
## I(trend^2)    0.4854     0.1088   4.461 8.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.93 on 33 degrees of freedom
## Multiple R-squared:  0.8316, Adjusted R-squared:  0.8214 
## F-statistic: 81.51 on 2 and 33 DF,  p-value: 1.708e-13
lines(Shampoo_Quad$fitted, lty=2, lwd=3)
title("Monthly Shampoo Sales  
      Linear and Quadratic Trend Lines")

Looking at the adjusted \(R^2\) value of the Linear trend line for Shampoo sales 72% of the variance can be explained, while the Quadratic trend line can explain 82% of the variance. Therefore, the quadratic trend line is more accurate and more representative of the existing trend.

(c) Do you expect to see seasonlity in sales of shampoo? Why?

I wouldn’t expect there to be seasonality in sales of shampoo, mostly because I would think people using shampoo would have their own routine for when they use shampoo, generally daily. Therefore, I wouldn’t think there would have a seasonality unless the data could be broken down into hours of the day and then we may see more people using shampoo at a certain time in the day. However, I do think there may be seasonality in the types of shampoo purchased, unfortunately this data series only includes the sales for one particular type of shampoo, but I would be curious to look at different types and weather patterns with a hypothesis that more shampoos focused on moisturizing are sold in the drier months. It would also be interesting to see the difference between gender and shampoo sales for different types of shampoo.

Figure 28. Shampoo Sales by Month

par(oma = c(0,0,0,2))

plot(c(1,4), c(115,685), type="n", bty="l", xlab="Date", ylab="Shampoo Sales")

Shampoo_Jan <- subset(Shampoo.ts, cycle(Shampoo.ts)==1)
lines(Shampoo_Jan, col=1, pch=1, type="b")
Shampoo_Feb <- subset(Shampoo.ts, cycle(Shampoo.ts)==2)
lines(Shampoo_Feb, col=2, pch=2, type="b")
Shampoo_Mar <- subset(Shampoo.ts, cycle(Shampoo.ts)==3)
lines(Shampoo_Mar, col=3, pch=3, type="b")
Shampoo_Apr <- subset(Shampoo.ts, cycle(Shampoo.ts)==4)
lines(Shampoo_Apr, col=4, pch=4, type="b")
Shampoo_May <- subset(Shampoo.ts, cycle(Shampoo.ts)==5)
lines(Shampoo_May, col=5, pch=5, type="b")
Shampoo_Jun <- subset(Shampoo.ts, cycle(Shampoo.ts)==6)
lines(Shampoo_Jun, col=6, pch=6, type="b")
Shampoo_Jul <- subset(Shampoo.ts, cycle(Shampoo.ts)==7)
lines(Shampoo_Jul, col=7, pch=7, type="b")
Shampoo_Aug <- subset(Shampoo.ts, cycle(Shampoo.ts)==8)
lines(Shampoo_Aug, col=8, pch=8, type="b")
Shampoo_Sep <- subset(Shampoo.ts, cycle(Shampoo.ts)==9)
lines(Shampoo_Sep, col=9, pch=9, type="b")
Shampoo_Oct <- subset(Shampoo.ts, cycle(Shampoo.ts)==10)
lines(Shampoo_Oct, col=10, pch=10, type="b")
Shampoo_Nov <- subset(Shampoo.ts, cycle(Shampoo.ts)==11)
lines(Shampoo_Feb, col=11, pch=11, type="b")
Shampoo_Dec <- subset(Shampoo.ts, cycle(Shampoo.ts)==12)
lines(Shampoo_Dec, col=12, pch=12, type="b")

#add a legend
legend("right", legend = 1:12, cex=0.8, col=1:12, pch=1:12, lty=linetype, title="Month", xpd=NA)

Figure 28 shows the break out of shampoo sales by month, over the 3 years the data spans (January 1995 to December 1997). It appears there is no clear seasonality within this data series.


Appendix: For Practice Problems 1, 3, and 6.

Problem 1:

min(Sep_11$Air)
## [1] 29672427
max(Sep_11$Air)
## [1] 69003617
min(Sep_11$Rail)
## [1] 326874247
max(Sep_11$Rail)
## [1] 664013874
min(Sep_11$VMT)
## [1] 153.25
max(Sep_11$VMT)
## [1] 261.3

Figure A. Air Passenger Miles - Autoplot Calculation

autoplot(Air_PreSep_11.ts, ts.color = 'red', ts.linetype = 'dashed', xlab="Date", ylab="Air Passenger Miles (in millions)")

Figure B. Rail Passenger Miles - Autoplot Calculation

autoplot(Rail_PreSep_11.ts, ts.color = 'red', ts.linetype = 'dashed', xlab="Date", ylab="Rail Passenger Miles (in millions)")

Figure C. VMT - Autoplot Calculation

autoplot(VMT_PreSep_11.ts, ts.color = 'red', ts.linetype = 'dashed', xlab="Date", ylab="Vehicle Miles Traveled")

Figure D. Zooming in on the Air Passenger Miles (in millions) by Date for the 12 Months Prior to 9/11

Air_PreSep_12MonthsPrior <- window(Air_PreSep_11.ts, 2000, c(2001,8))
plot(Air_PreSep_12MonthsPrior, ylab="Air Passenger Miles (in millions)", ylim=c(25,70), bty="l")

Figure E. Air Passenger Miles (in millions) for January 2001 through April 2004

Year_2001_Sep_11.ts <- ts(Air2, start=c(2001,1), end=c(2004,4), freq=12)
plot(Year_2001_Sep_11.ts, xlab="Date", ylab="Air Passenger Miles (in millions)", ylim=c(25,70), bty="l")

Figure F. Air Passenger Miles (in millions) for July 2001 through September 2002

Before_After_Sep_11_Months.ts <- ts(Air2, start=c(2001,7), end=c(2002,7), freq=12)
plot(Before_After_Sep_11_Months.ts, xlab="Date", ylab="Air Passenger Miles (in millions)", ylim=c(30,55), bty="l")

Figure G. Air Passenger Miles (in millions) Before and After 9/11 (January 1990 through April 2004)

Air.ts <- ts(Air2, start=c(1990,1), end=c(2004,4), freq=12)
plot(Air.ts, xlab="Date", ylab="Air Passenger Miless (in millions)", ylim=c(25,70), bty="l")

Figure H. Air Passenger Miles (in millions) Before and After 9/11 (January 1990 through April 2004)

Air.ts <- ts(Air2, start=c(1990,1), end=c(2004,4), freq=1)
plot(Air.ts, xlab="Date", ylab="Air Passenger Miless (in millions)", ylim=c(25,70), bty="l")

Figure I. Air Passenger Miles (in millions) Prior to 9/11 Broken Out by Month

par(oma = c(0,0,0,2))

plot(c(1,14), c(25,70), type="n", bty="l", xlab="Date", ylab="Air Passenger Miles (in millions)")

Air_janRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==1)
lines(Air_janRiders, col=1, pch=1, type="b")
Air_febRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==2)
lines(Air_febRiders, col=2, pch=2, type="b")
Air_marRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==3)
lines(Air_marRiders, col=3, pch=3, type="b")
Air_aprRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==4)
lines(Air_aprRiders, col=4, pch=4, type="b")
Air_mayRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==5)
lines(Air_mayRiders, col=5, pch=5, type="b")
Air_junRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==6)
lines(Air_junRiders, col=6, pch=6, type="b")
Air_julRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==7)
lines(Air_julRiders, col=7, pch=7, type="b")
Air_augRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==8)
lines(Air_augRiders, col=8, pch=8, type="b")
Air_sepRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==9)
lines(Air_sepRiders, col=9, pch=9, type="b")
Air_octRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==10)
lines(Air_octRiders, col=10, pch=10, type="b")
Air_novRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==11)
lines(Air_novRiders, col=11, pch=11, type="b")
Air_decRiders <- subset(Air_PreSep_11.ts, cycle(Air_PreSep_11.ts)==12)
lines(Air_decRiders, col=12, pch=12, type="b")

#add a legend
legend("right", legend = 1:12, cex=0.8, col=1:12, pch=1:12, lty=linetype, title="Month", xpd=NA)

Figure J. Air Passenger Miles (in millions) Prior to 9/11 Broken Out by Month

ggseasonplot(Air_PreSep_11.ts)

Figure K. Rail Passenger Miles (in millions) Prior to 9/11 Broken Out by Month

ggseasonplot(Rail_PreSep_11.ts)

Figure L. VMT Prior to 9/11 Broken Out by Month

ggseasonplot(VMT_PreSep_11.ts)

Problem 3:

Calculating the minimum and maximum values for Shipments (in millions of USD).

min(App_Ship$Shipments)
## [1] 3944
max(App_Ship$Shipments)
## [1] 4900

Figure M. Appliance Shipments (in millions of USD, log scale) by Year (log scale)

plot(App_Ship.ts, log="xy", xlab="Year (log scale)", ylab="Appliances Shipped (log scale)", ylim=c(3100,5000), bty="l")

Problem 6:

Calculating the minimum and maximum values for Shampoo Sales.

min(Shampoo_Sales$Shampoo.Sales)
## [1] 119.3
max(Shampoo_Sales$Shampoo.Sales)
## [1] 682