## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [1] "2025-03-30 17:09:46 EDT"
Consider the GDP information in the data set called global_economy, which is already included in the fpp3 package (there is no need to load it from an external file).
## # A tsibble: 15,150 x 9 [1Y]
## # Key: Country [263]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
## 7 Afghanistan AFG 1966 1399999967. NA NA 18.6 8.57 10152331
## 8 Afghanistan AFG 1967 1673333418. NA NA 14.2 6.77 10372630
## 9 Afghanistan AFG 1968 1373333367. NA NA 15.2 8.90 10604346
## 10 Afghanistan AFG 1969 1408888922. NA NA 15.0 10.1 10854428
## # ℹ 15,140 more rows
# 1.Answer:
# Restrict global_economy to the rows for Austria.
At_data <- filter(global_economy, Country == "Austria")

# Plot the GDP Per Capita: GDP divided by Population, drawn as a line over Year.
ggplot(data = At_data, mapping = aes(x = Year, y = GDP / Population)) +
  geom_line() +
  labs(
    title = "GDP per Capita for Austria over time",
    x = "Year",
    y = "GDP per Capita"
  )
For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect. Comment below in answer:
# 2a.Answer:
# The series asked for here is United States GDP, so select the United States
# rows and plot raw GDP over Year (the previous code repeated the Austrian
# GDP-per-capita plot from question 1).
us_economy <- global_economy %>%
  filter(Country == "United States")

ggplot(us_economy, aes(x = Year, y = GDP)) +
  geom_line() +
  labs(
    title = "United States GDP over time",
    x = "Year",
    y = "GDP"
  )
United States GDP from global_economy.
Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock
# 2c.Answer:
# Keep the Victorian "Bulls, bullocks and steers" series, then plot its Count.
victorian_bulls <- filter(
  aus_livestock,
  State == "Victoria",
  Animal == "Bulls, bullocks and steers"
)
autoplot(victorian_bulls, Count)
Victorian Electricity Demand from vic_elec.
# transformation
# Aggregate the electricity demand to weekly totals and plot them.
# The weekly index gets its own name (Week) so it does not reuse the name of
# the Date column that vic_elec already has, and the earlier group_by(Date)
# is dropped because index_by() already defines the grouping that
# summarise() collapses over.
vic_elec %>%
  index_by(Week = yearweek(Time)) %>%
  summarise(Demand = sum(Demand)) %>%
  autoplot(Demand)
# 3a.Answer:
# Title and y-axis label shared by the three Canadian gas plots below.
gas_labels <- labs(
  title = "Canadian Gas Production",
  y = "billions of cubic meter"
)

# Time plot of Volume.
autoplot(canadian_gas, Volume) + gas_labels

# Seasonal sub-series plot of Volume.
gg_subseries(canadian_gas, Volume) + gas_labels

# Seasonal plot of Volume.
gg_season(canadian_gas, Volume) + gas_labels
Do an STL decomposition of the data. You will need to choose a seasonal window to allow for the changing shape of the seasonal component.
# 3b.Answer:
# Robust STL decomposition of Volume, plotted component by component.
# stats::stl documents that the loess windows should be odd, so the seasonal
# window is written as 17 rather than the previous even value 16.
# NOTE(review): 17 was picked as the odd value next to 16 - confirm the
# intended amount of seasonal smoothing.
canadian_gas %>%
  model(
    STL(
      Volume ~ trend(window = 25) + season(window = 17),
      robust = TRUE
    )
  ) %>%
  components() %>%
  autoplot() +
  labs(title = "Canadian Gas STL decomposition ")
How does the seasonal shape change over time? [Hint: Try plotting the seasonal component using gg_season().]
# 3c.Answer:
# Plot the seasonal component of an STL decomposition (season_year) with
# gg_season(), as the hint asks, instead of the raw Volume series, so that
# the change in the seasonal shape from year to year can be read off.
canadian_gas %>%
  model(
    STL(
      Volume ~ trend(window = 25) + season(window = 17),
      robust = TRUE
    )
  ) %>%
  components() %>%
  gg_season(season_year) +
  labs(
    title = "Canadian Gas Production: STL seasonal component",
    y = "billions of cubic meter"
  )
produce a plausible seasonally adjusted series? What are these numbers, plot the series.
# 3d.Answer:
# Fit a robust STL model and keep its components for plotting.
gas_components <- canadian_gas %>%
  model(
    STL(Volume ~ trend(window = 21) + season(window = 13), robust = TRUE)
  ) %>%
  components()

# Legend entries and their colours, in matching order.
series_names <- c("Data", "Seasonally Adjusted", "Trend")
series_colours <- c("red", "green", "purple")

# Overlay the data, the seasonally adjusted series and the trend.
ggplot(gas_components, aes(x = Month)) +
  geom_line(aes(y = Volume, colour = "Data")) +
  geom_line(aes(y = season_adjust, colour = "Seasonally Adjusted")) +
  geom_line(aes(y = trend, colour = "Trend")) +
  scale_colour_manual(values = series_colours, breaks = series_names) +
  labs(title = "STL decomposition of Canadian Gas Production")
For retail time series, use the below code:
# run the code
# Fix the RNG seed so the same series is drawn on every run.
set.seed(12345678)
# Draw one Series ID at random, then keep only the rows of that series.
chosen_id <- sample(aus_retail$`Series ID`, 1)
myseries <- filter(aus_retail, `Series ID` == chosen_id)
Create a training dataset consisting of observations before 2011
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W 2010 Jul 16.1
## 2 Northern Territory Clothing, footwear and perso… A3349767W 2010 Aug 13.8
## 3 Northern Territory Clothing, footwear and perso… A3349767W 2010 Sep 13.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W 2010 Oct 12.3
## 5 Northern Territory Clothing, footwear and perso… A3349767W 2010 Nov 11.7
## 6 Northern Territory Clothing, footwear and perso… A3349767W 2010 Dec 17.9
Check that your data have been split appropriately by producing the following plot.
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
Check the residuals.
Produce forecasts for the test data with given code below:
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
Joining, by = c(“State”, “Industry”, “Series ID”, “Month”, “Turnover”)
Compare the accuracy of your forecasts against the actual values with given code below:
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
How sensitive are the accuracy measures to the amount of training data used?
# 4g Answer:
# Stack the accuracy of the fitted model and of the forecasts (scored against
# myseries), then drop the columns that only identify the series and model.
accuracy_rows <- bind_rows(accuracy(fit), accuracy(fc, myseries))
select(accuracy_rows, -State, -Industry, -.model)
## # A tibble: 2 × 9
## .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Training 0.439 1.21 0.915 5.23 12.4 1 1 0.768
## 2 Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set.
# 5a.Answer:
# NOTE(review): nothing in this chunk draws random numbers; the seed is kept
# only so the RNG state seen by later chunks is unchanged.
set.seed(2100)

# Total takeaway food turnover: sum Turnover over every series in the
# "Takeaway food services" industry.
takeaway_food_turnover <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  summarise(Tota_Turnover = sum(Turnover))

takeaway_food_turnover %>% autoplot(Tota_Turnover)

# Withhold the last four years (4 * 12 months) as the test set. The cut-off
# is derived from the last observed month rather than a hard-coded year: the
# previous filter, year(Month) < 2018, kept every month up to December 2017
# in the training set.
takeaway_train_set <- takeaway_food_turnover %>%
  filter(Month <= max(Month) - 4 * 12)

takeaway_train_set %>% autoplot(Tota_Turnover)
Fit all the appropriate benchmark methods to the training set and forecast the periods covered by the test set.
# 5b.Answer:
# STL decomposition of the training set, plotted without a legend.
takeaway_dcmp <- takeaway_train_set %>%
  model(stl = STL(Tota_Turnover))
components(takeaway_dcmp) %>% autoplot(show.legend = FALSE)

# Benchmark methods fitted to the training set.
takeaway_fit <- takeaway_train_set %>%
  model(
    Naive = NAIVE(Tota_Turnover),
    Seasonal_naive = SNAIVE(Tota_Turnover),
    Drift = RW(Tota_Turnover ~ drift())
  )

# Forecast every month that was withheld from the training set. The previous
# forecast(h = 4) produced four monthly steps only, not the whole test
# period; taking the withheld rows as new_data keeps the horizon tied to
# however the training set was cut.
takeaway_test_set <- anti_join(
  takeaway_food_turnover,
  takeaway_train_set,
  by = "Month"
)
takeaway_fc <- takeaway_fit %>% forecast(new_data = takeaway_test_set)
takeaway_fc %>% autoplot(takeaway_train_set)
Compute the accuracy of your forecasts. Which method does best?
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -244. 253. 244. -16.8 16.8 5.46 4.47 -0.594
## 2 Naive Test -235. 245. 235. -16.2 16.2 5.26 4.32 -0.552
## 3 Seasonal_naive Test 9.28 26.8 22.8 0.650 1.55 0.511 0.472 -0.153
Using the code below, get a series (it gets a series randomly by using sample() function):
# Fix the RNG seed so the same series is drawn on every run.
set.seed(12345678)
# Draw one Series ID at random, then keep only the rows of that series.
chosen_id <- sample(aus_retail$`Series ID`, 1)
myseries <- filter(aus_retail, `Series ID` == chosen_id)
see head of your series to check it is a tsibble data, and remove NA’s if there is any with these commands:
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W 1988 Apr 2.3
## 2 Northern Territory Clothing, footwear and perso… A3349767W 1988 May 2.9
## 3 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jun 2.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jul 2.8
## 5 Northern Territory Clothing, footwear and perso… A3349767W 1988 Aug 2.9
## 6 Northern Territory Clothing, footwear and perso… A3349767W 1988 Sep 3
What is the name of the series you randomly choose? Write it.
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W 1988 Apr 2.3
## 2 Northern Territory Clothing, footwear and perso… A3349767W 1988 May 2.9
## 3 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jun 2.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jul 2.8
## 5 Northern Territory Clothing, footwear and perso… A3349767W 1988 Aug 2.9
## 6 Northern Territory Clothing, footwear and perso… A3349767W 1988 Sep 3
Run a linear regression of Turnover on trend.(Hint: use TSLM() and trend() functions)
# 6b.Answer:
# Regress Turnover on a linear trend, store the fit, and print its summary.
myseries_Tslm <- model(myseries, trend_model = TSLM(Turnover ~ trend()))
report(myseries_Tslm)
## Series: Turnover
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0795 -1.1704 -0.1640 0.9683 7.4514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5313376 0.1983464 17.80 <2e-16 ***
## trend() 0.0307747 0.0009291 33.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 367 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7486
## F-statistic: 1097 on 1 and 367 DF, p-value: < 2.22e-16
See the regression result by report() command.
# 6c.Answer:
# Print the regression summary of the trend model already stored in
# myseries_Tslm by answer 6b, instead of estimating the identical model a
# second time.
report(myseries_Tslm)
## Series: Turnover
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0795 -1.1704 -0.1640 0.9683 7.4514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5313376 0.1983464 17.80 <2e-16 ***
## trend() 0.0307747 0.0009291 33.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 367 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7486
## F-statistic: 1097 on 1 and 367 DF, p-value: < 2.22e-16
By using this model, forecast it for the next 3 years. What are the values of the next 3 years, monthly values?
## # A fable: 3 x 6 [1M]
## # Key: State, Industry, .model [1]
## State Industry .model Month
## <chr> <chr> <chr> <mth>
## 1 Northern Territory Clothing, footwear and personal accessory … trend… 2019 Jan
## 2 Northern Territory Clothing, footwear and personal accessory … trend… 2019 Feb
## 3 Northern Territory Clothing, footwear and personal accessory … trend… 2019 Mar
## # ℹ 2 more variables: Turnover <dist>, .mean <dbl>
Plot the forecast values along with the original data.
Get the residuals from the model, then check whether they satisfy the requirements for white-noise error terms. (Hint: use the augment() and gg_tsresiduals() functions.)
## # A tsibble: 369 x 8 [1M]
## # Key: State, Industry, .model [1]
## State Industry .model Month Turnover .fitted .resid .innov
## <chr> <chr> <chr> <mth> <dbl> <dbl> <dbl> <dbl>
## 1 Northern Territory Clothing, … trend… 1988 Apr 2.3 3.56 -1.26 -1.26
## 2 Northern Territory Clothing, … trend… 1988 May 2.9 3.59 -0.693 -0.693
## 3 Northern Territory Clothing, … trend… 1988 Jun 2.6 3.62 -1.02 -1.02
## 4 Northern Territory Clothing, … trend… 1988 Jul 2.8 3.65 -0.854 -0.854
## 5 Northern Territory Clothing, … trend… 1988 Aug 2.9 3.69 -0.785 -0.785
## 6 Northern Territory Clothing, … trend… 1988 Sep 3 3.72 -0.716 -0.716
## 7 Northern Territory Clothing, … trend… 1988 Oct 3.1 3.75 -0.647 -0.647
## 8 Northern Territory Clothing, … trend… 1988 Nov 3 3.78 -0.778 -0.778
## 9 Northern Territory Clothing, … trend… 1988 Dec 4.2 3.81 0.392 0.392
## 10 Northern Territory Clothing, … trend… 1989 Jan 2.7 3.84 -1.14 -1.14
## # ℹ 359 more rows
Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures. Run the code below:
# Keep only the observations that fall in January 2014.
jan_2014 <- filter(vic_elec, yearmonth(Time) == yearmonth("2014 Jan"))

# Collapse to one row per calendar day: total Demand and maximum Temperature.
jan_vic_elec <- jan_2014 %>%
  index_by(Date = as_date(Time)) %>%
  summarise(
    Demand = sum(Demand),
    Temperature = max(Temperature)
  )
Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
# Scatter plot of Demand against Temperature with a fitted straight line
# (no confidence band).
demand_scatter <- ggplot(jan_vic_elec, aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

# Add the title and centre it.
demand_scatter +
  labs(title = "Electricity Demand") +
  theme(plot.title = element_text(hjust = 0.5))
## `geom_smooth()` using formula = 'y ~ x'
Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?
Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15°C, and compare it with the forecast if the maximum temperature was 35°C. Do you believe these forecasts?
# Fit the temperature regression once and reuse it for both scenarios.
demand_fit <- jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature))

# One future row with Temperature set to 15, forecast and plotted on the data.
day_at_15 <- new_data(jan_vic_elec, 1) %>%
  mutate(Temperature = 15)
demand_fit %>%
  forecast(day_at_15) %>%
  autoplot(jan_vic_elec)

# The same with Temperature set to 35.
day_at_35 <- new_data(jan_vic_elec, 1) %>%
  mutate(Temperature = 35)
demand_fit %>%
  forecast(day_at_35) %>%
  autoplot(jan_vic_elec)
Do you believe these forecasts? The following R code will get you started:
# One future row appended after jan_vic_elec, with Temperature fixed at 15.
next_day_15 <- mutate(new_data(jan_vic_elec, 1), Temperature = 15)

# Fit Demand ~ Temperature, forecast that row, and plot it on the data.
jan_vic_elec %>%
  model(TSLM(Demand ~ Temperature)) %>%
  forecast(new_data = next_day_15) %>%
  autoplot(jan_vic_elec)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Read the shampoo data given in excel (Import Dataset as Excel)
#a. View the shampoo sales data. How many variables are there? Find how many rows and columns in the data?
library(readxl)
# Read the workbook (path is relative to the working directory) and preview
# its first rows.
shampoo <- read_excel(path = "shampoo-2.xlsx")
head(x = shampoo)
## # A tibble: 6 × 2
## Month sales
## <dttm> <dbl>
## 1 1995-01-01 00:00:00 266
## 2 1995-02-01 00:00:00 146.
## 3 1995-03-01 00:00:00 183.
## 4 1995-04-01 00:00:00 119.
## 5 1995-05-01 00:00:00 180.
## 6 1995-06-01 00:00:00 168.
## [1] "there are 36 rows and 2 columns"
## tibble [36 × 2] (S3: tbl_df/tbl/data.frame)
## $ Month: POSIXct[1:36], format: "1995-01-01" "1995-02-01" ...
## $ sales: num [1:36] 266 146 183 119 180 ...
## [1] "there are two variables one is month and one is sales"
## [1] "the dataset contains monthly data"
#c. Convert the data into tibble , then tsibble
# Turn the Month column of shampoo into plain dates (updates shampoo itself).
shampoo$Month <- as_date(shampoo$Month)

# tibble step
shampoo_new <- as_tibble(shampoo)

# tsibble step: add a year-month column Months, drop Month, and index by Months.
shampoo_monthly <- select(mutate(shampoo_new, Months = yearmonth(Month)), -Month)
shampoo_new <- as_tsibble(shampoo_monthly, index = Months, validate = TRUE)
#d. Plot the shampoo sales. What do you see from the data pattern? What does x-axis represent?
# Comment here. Use plot() and autoplot().Put the name for y axis, and a title for the graph.
# Base-graphics line plot of the sales values against their position.
sales_values <- shampoo_new$sales
plot(sales_values, type = "l", main = "Sales_years", ylab = "Sales")
## [1] "From the plot graph, I don't see any pattern that follows trend or seasonality."
## [1] "From the autoplot graph, I see that there is a seasonality followed."
#e. What is the average, and median of shampoo sales. Put it on a histogram.
# x holds the mean and y the median of the sales column.
x <- mean(shampoo_new$sales)
y <- median(shampoo_new$sales)

# Keep the histogram object so the labels can be positioned from its counts.
sales_hist <- hist(shampoo_new$sales)

# Vertical reference lines: mean in green, median in blue.
abline(v = x, col = "green", lwd = 3)
abline(v = y, col = "blue", lwd = 3)

# Label each line just to its right, near the top of the tallest bar.
# The previous code used x * 1.5 and y * 1.5 as the vertical position; those
# are values on the sales scale, not on the frequency scale of the y-axis.
label_top <- max(sales_hist$counts)
text(
  x = x,
  y = label_top,
  labels = paste("Mean = ", x),
  col = "green",
  cex = 1,
  pos = 4
)
text(
  x = y,
  y = label_top * 0.9,
  labels = paste("Median = ", y),
  col = "blue",
  cex = 1,
  pos = 4
)
## [1] "The average sale of shampoo is 312.6 and median of shampoo sale is 280.15."
#f. Get seasonal plot. What do you see / is there any pattern, is there any seasonality.
# Seasonal plot of the sales column.
gg_season(shampoo_new, sales)
## [1] "there is no seasonality from the graph."
#g. Get a linear regression line with trend and dummy for each month (Hint: use trend and season in regression equation).
# Regress sales on a linear trend plus seasonal dummies, then print the fit.
shampoo_fc <- model(shampoo_new, TSLM(sales ~ trend() + season()))
shampoo_fc %>% report()
## Series: sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.60 -62.32 -4.84 53.76 152.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.867 55.740 2.043 0.0527 .
## trend() 11.754 1.534 7.664 8.88e-08 ***
## season()year2 -33.154 73.630 -0.450 0.6567
## season()year3 -53.808 73.678 -0.730 0.4726
## season()year4 -24.628 73.757 -0.334 0.7415
## season()year5 -56.015 73.869 -0.758 0.4560
## season()year6 -27.802 74.012 -0.376 0.7106
## season()year7 7.244 74.187 0.098 0.9231
## season()year8 -37.043 74.393 -0.498 0.6233
## season()year9 27.536 74.629 0.369 0.7155
## season()year10 -32.518 74.897 -0.434 0.6682
## season()year11 9.895 75.194 0.132 0.8964
## season()year12 -4.259 75.522 -0.056 0.9555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.16 on 23 degrees of freedom
## Multiple R-squared: 0.7592, Adjusted R-squared: 0.6336
## F-statistic: 6.043 on 12 and 23 DF, p-value: 0.00011612
# Observed sales and the model's fitted values drawn against Months.
fitted_vs_data <- augment(shampoo_fc)
series_colours <- c(Data = "brown", Fitted = "#D55E00")

ggplot(fitted_vs_data, aes(x = Months)) +
  geom_line(aes(y = sales, colour = "Data")) +
  geom_line(aes(y = .fitted, colour = "Fitted")) +
  scale_colour_manual(values = series_colours) +
  labs(y = " Sales", title = "Month Sales")
#h. Comment on each estimated coefficient of the model.Are they statistically significant at 5 % significance level?
# Print the coefficient table of the fitted model.
shampoo_fc %>% report()
## Series: sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.60 -62.32 -4.84 53.76 152.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.867 55.740 2.043 0.0527 .
## trend() 11.754 1.534 7.664 8.88e-08 ***
## season()year2 -33.154 73.630 -0.450 0.6567
## season()year3 -53.808 73.678 -0.730 0.4726
## season()year4 -24.628 73.757 -0.334 0.7415
## season()year5 -56.015 73.869 -0.758 0.4560
## season()year6 -27.802 74.012 -0.376 0.7106
## season()year7 7.244 74.187 0.098 0.9231
## season()year8 -37.043 74.393 -0.498 0.6233
## season()year9 27.536 74.629 0.369 0.7155
## season()year10 -32.518 74.897 -0.434 0.6682
## season()year11 9.895 75.194 0.132 0.8964
## season()year12 -4.259 75.522 -0.056 0.9555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.16 on 23 degrees of freedom
## Multiple R-squared: 0.7592, Adjusted R-squared: 0.6336
## F-statistic: 6.043 on 12 and 23 DF, p-value: 0.00011612
## [1] "Only the trend() coefficient is statistically significant at the 5% level; the intercept (p = 0.0527) and all of the monthly seasonal dummies (p > 0.45) are not."
#i. Which month has the highest sales?
# Total the sales by calendar month number and keep the month with the
# largest total. The data is converted to a plain tibble first, so the
# year-month index does not have to be dropped from a tsibble, and
# slice_max() replaces the superseded top_n().
shampoo_new %>%
  as_tibble() %>%
  mutate(only_months = month(Months)) %>%
  group_by(only_months) %>%
  summarise(highest_sales = sum(sales)) %>%
  slice_max(highest_sales, n = 1)
## Selecting by highest_sales
## # A tsibble: 1 x 2 [1]
## only_months highest_sales
## <dbl> <dbl>
## 1 11 1182.
## [1] "The 11th month has the highest sales, with a total of about 1182."
#j. Forecast it for the next year. What are the values
# Forecast 12 steps ahead from the fitted model and show the result.
forecast_trends <- forecast(shampoo_fc, h = 12)
print(forecast_trends)
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model Months
## <chr> <mth>
## 1 TSLM(sales ~ trend() + season()) 1998 Jan
## 2 TSLM(sales ~ trend() + season()) 1998 Feb
## 3 TSLM(sales ~ trend() + season()) 1998 Mar
## 4 TSLM(sales ~ trend() + season()) 1998 Apr
## 5 TSLM(sales ~ trend() + season()) 1998 May
## 6 TSLM(sales ~ trend() + season()) 1998 Jun
## 7 TSLM(sales ~ trend() + season()) 1998 Jul
## 8 TSLM(sales ~ trend() + season()) 1998 Aug
## 9 TSLM(sales ~ trend() + season()) 1998 Sep
## 10 TSLM(sales ~ trend() + season()) 1998 Oct
## 11 TSLM(sales ~ trend() + season()) 1998 Nov
## 12 TSLM(sales ~ trend() + season()) 1998 Dec
## # ℹ 2 more variables: sales <dist>, .mean <dbl>
#k. Plot the forecast with original data.
# Original sales, the model's fitted values, and the forecasts with a 95%
# interval, all on one plot.
fitted_values <- fitted(shampoo_fc)
autoplot(shampoo_new, sales) +
  geom_line(
    data = fitted_values,
    mapping = aes(y = .fitted, colour = .model)
  ) +
  autolayer(forecast_trends, level = 95, alpha = 0.5) +
  labs(title = "Monthly Sales", y = "Sales")
## [1] "Yes,the residuals of the model is white noise."
#m. By using the regression model, forecast the 1 year ahead, and then check the accuracy of the forecast. What is MSE, RMSE values?
# Forecast one year ahead from the fitted model and show the result.
forecast_trends <- forecast(shampoo_fc, h = "1 year")
print(forecast_trends)
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model Months
## <chr> <mth>
## 1 TSLM(sales ~ trend() + season()) 1998 Jan
## 2 TSLM(sales ~ trend() + season()) 1998 Feb
## 3 TSLM(sales ~ trend() + season()) 1998 Mar
## 4 TSLM(sales ~ trend() + season()) 1998 Apr
## 5 TSLM(sales ~ trend() + season()) 1998 May
## 6 TSLM(sales ~ trend() + season()) 1998 Jun
## 7 TSLM(sales ~ trend() + season()) 1998 Jul
## 8 TSLM(sales ~ trend() + season()) 1998 Aug
## 9 TSLM(sales ~ trend() + season()) 1998 Sep
## 10 TSLM(sales ~ trend() + season()) 1998 Oct
## 11 TSLM(sales ~ trend() + season()) 1998 Nov
## 12 TSLM(sales ~ trend() + season()) 1998 Dec
## # ℹ 2 more variables: sales <dist>, .mean <dbl>
## [1] 98031.53
## [1] 313.0999
## [1] "MSE = 98031.53\n RMSE = 313.0999"