## Warning: package 'fpp3' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.0
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'tsibble' was built under R version 4.3.3
## Warning: package 'tsibbledata' was built under R version 4.3.3
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## [1] "2025-03-28 10:33:17 EDT"
Consider the GDP information in the global_economy data set, which ships with the fpp3 package (there is no need to load it from an external file).
## # A tsibble: 15,150 x 9 [1Y]
## # Key: Country [263]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
## 7 Afghanistan AFG 1966 1399999967. NA NA 18.6 8.57 10152331
## 8 Afghanistan AFG 1967 1673333418. NA NA 14.2 6.77 10372630
## 9 Afghanistan AFG 1968 1373333367. NA NA 15.2 8.90 10604346
## 10 Afghanistan AFG 1969 1408888922. NA NA 15.0 10.1 10854428
## # ℹ 15,140 more rows
## Rows: 15,150
## Columns: 9
## Key: Country [263]
## $ Country <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan",…
## $ Code <fct> AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG, AFG,…
## $ Year <dbl> 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,…
## $ GDP <dbl> 537777811, 548888896, 546666678, 751111191, 800000044, 1006…
## $ Growth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ CPI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Imports <dbl> 7.024793, 8.097166, 9.349593, 16.863910, 18.055555, 21.4128…
## $ Exports <dbl> 4.132233, 4.453443, 4.878051, 9.171601, 8.888893, 11.258279…
## $ Population <dbl> 8996351, 9166764, 9345868, 9533954, 9731361, 9938414, 10152…
# Restrict global_economy to a single country ("India") and derive
# GDP per person (GDP / Population) in one pipeline.
india_gdp <- global_economy %>%
  filter(Country == "India") %>%
  mutate(GDP_per_capita = GDP / Population)

# Time plot of the derived per-capita series.
autoplot(india_gdp, GDP_per_capita) +
  labs(
    title = "India: GDP per Capita Over Time",
    x = "Year",
    y = "GDP per Capita (USD)"
  )
For each of the following series, make a graph of the data. If transforming seems appropriate, do so and describe the effect. Comment below in answer:
# 2a.Answer:
# GDP per capita on its original scale.
autoplot(india_gdp, GDP_per_capita) +
  labs(
    title = "India: GDP per Capita (Original)",
    x = "Year",
    y = "GDP per Capita"
  )

# The same series after a natural-log transform; the logged column is
# added on the fly, so india_gdp itself is left unchanged.
autoplot(
  mutate(india_gdp, log_GDP_per_capita = log(GDP_per_capita)),
  log_GDP_per_capita
) +
  labs(
    title = "India: Log of GDP per Capita",
    x = "Year",
    y = "Log(GDP per Capita)"
  )
United States GDP from global_economy.
# 2b.Answer:
# Keep only the United States rows of global_economy.
us_gdp <- global_economy %>%
  filter(Country == "United States")

# Plot GDP on the original scale. The GDP column holds raw US dollars
# (e.g. 5.43e11 for 1960), so it is divided by 1e9 here to make the
# "Billions USD" axis label accurate. us_gdp itself is not modified.
us_gdp %>%
  autoplot(GDP / 1e9) +
  labs(
    title = "United States: GDP (Original)",
    y = "GDP (in Billions USD)",
    x = "Year"
  )
## # A tsibble: 58 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 United States USA 1960 5.43e11 NA 13.6 4.20 4.97 180671000
## 2 United States USA 1961 5.63e11 2.30 13.7 4.03 4.90 183691000
## 3 United States USA 1962 6.05e11 6.10 13.9 4.13 4.81 186538000
## 4 United States USA 1963 6.39e11 4.40 14.0 4.09 4.87 189242000
## 5 United States USA 1964 6.86e11 5.80 14.2 4.10 5.10 191889000
## 6 United States USA 1965 7.44e11 6.40 14.4 4.24 4.99 194303000
## 7 United States USA 1966 8.15e11 6.50 14.9 4.55 5.02 196560000
## 8 United States USA 1967 8.62e11 2.50 15.3 4.63 5.05 198712000
## 9 United States USA 1968 9.42e11 4.80 16.0 4.94 5.08 200706000
## 10 United States USA 1969 1.02e12 3.10 16.8 4.95 5.09 202677000
## # ℹ 48 more rows
Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock
# 2c.Answer:
# aus_livestock is made available by attaching fpp3.
library(fpp3)

# Keep the Victorian "Bulls, bullocks and steers" series only.
vic_cattle <- filter(
  aus_livestock,
  Animal == "Bulls, bullocks and steers",
  State == "Victoria"
)

# Slaughter counts on the original scale.
autoplot(vic_cattle, Count) +
  labs(
    title = "Slaughter of Bulls, Bullocks and Steers in Victoria",
    x = "Year",
    y = "Count"
  )

# Slaughter counts after a natural-log transform.
autoplot(mutate(vic_cattle, log_Count = log(Count)), log_Count) +
  labs(
    title = "Log of Slaughter Count in Victoria",
    x = "Year",
    y = "Log(Count)"
  )
Victorian Electricity Demand from vic_elec.
# 2d.Answer:
# vic_elec is made available by attaching fpp3.
library(fpp3)

# Electricity demand on the original scale.
autoplot(vic_elec, Demand) +
  labs(
    title = "Victorian Electricity Demand (Original)",
    x = "Time",
    y = "Demand (MW)"
  )

# Electricity demand after a natural-log transform.
autoplot(mutate(vic_elec, log_Demand = log(Demand)), log_Demand) +
  labs(
    title = "Log of Victorian Electricity Demand",
    x = "Time",
    y = "Log(Demand)"
  )
Gas production from aus_production.
# 2e.Answer:
# aus_production is made available by attaching fpp3.
library(fpp3)

# Keep only the index (Quarter) and the Gas column. This is a column
# selection, not a row filter.
gas_data <- aus_production %>%
  select(Quarter, Gas)

# Gas production on the original scale. The tsibbledata documentation
# gives the unit of Gas as petajoules, so the axis is labelled accordingly.
gas_data %>%
  autoplot(Gas) +
  labs(
    title = "Australian Gas Production (Original)",
    y = "Gas (Petajoules)",
    x = "Year"
  )

# Gas production after a natural-log transform.
gas_data %>%
  mutate(log_Gas = log(Gas)) %>%
  autoplot(log_Gas) +
  labs(
    title = "Log of Australian Gas Production",
    y = "Log(Gas)",
    x = "Year"
  )
# 3a.Answer:
# Attach fpp3, which makes canadian_gas available.
library(fpp3)
# Show the column names, types and first values.
canadian_gas %>% glimpse()
## Rows: 542
## Columns: 2
## $ Month <mth> 1960 Jan, 1960 Feb, 1960 Mar, 1960 Apr, 1960 May, 1960 Jun, 196…
## $ Volume <dbl> 1.4306, 1.3059, 1.4022, 1.1699, 1.1161, 1.0113, 0.9660, 0.9773,…
# Time plot of the whole series (plot variable left for autoplot() to pick)
canadian_gas %>%
  autoplot() +
  labs(
    title = "Canadian Monthly Gas Production",
    x = "Year",
    y = "Billions of Cubic Metres"
  )
## Plot variable not specified, automatically selected `.vars = Volume`
# Seasonal plot: one line per year, to compare the within-year pattern
canadian_gas %>%
  gg_season() +
  labs(
    title = "Seasonal Plot: Canadian Gas Production",
    y = "Billions of Cubic Metres"
  )
## Plot variable not specified, automatically selected `y = Volume`
# Subseries plot: one panel per month, followed across the years
canadian_gas %>%
  gg_subseries() +
  labs(
    title = "Subseries Plot: Canadian Gas Production",
    y = "Billions of Cubic Metres"
  )
## Plot variable not specified, automatically selected `y = Volume`
Do an STL decomposition of the data. You will need to choose a seasonal window to allow for the changing shape of the seasonal component.
# 3b.Answer:
# STL decomposition with a finite seasonal window.
# window = "periodic" would force the seasonal component to be identical
# in every year, which defeats the purpose of the exercise (the seasonal
# shape is supposed to be allowed to change). A numeric window lets the
# seasonal pattern evolve; 13 matches the value used for the later
# decompositions of this series in this report.
library(fpp3)
canadian_gas %>%
  model(
    STL(Volume ~ season(window = 13))
  ) %>%
  components() %>%
  autoplot() +
  labs(
    title = "STL Decomposition of Canadian Gas Production",
    y = "Billions of Cubic Metres"
  )
How does the seasonal shape change over time? [Hint: Try plotting the seasonal component using gg_season().]
# 3c.Answer:
# STL decomposition with a numeric seasonal window (13 observations) so
# that the seasonal component is allowed to change shape over time.
decomp <- canadian_gas %>%
  model(
    STL(Volume ~ season(window = 13))
  ) %>%
  components()

# Follow the exercise hint: draw the estimated seasonal component with
# gg_season(), which overlays one line per year so that changes in the
# within-year shape are directly comparable. The decomposition table is
# first converted to a plain tsibble.
decomp %>%
  as_tsibble() %>%
  gg_season(season_year) +
  labs(
    title = "Seasonal Component Over Time: Canadian Gas Production",
    y = "Seasonal Effect",
    x = "Month"
  )
Can you produce a plausible seasonally adjusted series? What are these numbers? Plot the series.
# 3d.Answer:
# Fit STL with a 13-observation seasonal window and keep its components.
decomp <- components(
  model(canadian_gas, STL(Volume ~ season(window = 13)))
)

# Print the first 10 rows of the seasonally adjusted values.
print(select(decomp, Month, season_adjust), n = 10)
## # A tsibble: 542 x 2 [1M]
## Month season_adjust
## <mth> <dbl>
## 1 1960 Jan 0.878
## 2 1960 Feb 1.09
## 3 1960 Mar 1.07
## 4 1960 Apr 1.15
## 5 1960 May 1.24
## 6 1960 Jun 1.40
## 7 1960 Jul 1.40
## 8 1960 Aug 1.34
## 9 1960 Sep 1.39
## 10 1960 Oct 1.33
## # ℹ 532 more rows
# Plot the seasonally adjusted series.
# decomp is a decomposition table, and autoplot() on such an object uses
# the decomposition plot method rather than drawing one line. Converting
# to a plain tsibble first yields a single time plot of season_adjust.
decomp %>%
  as_tsibble() %>%
  autoplot(season_adjust) +
  labs(
    title = "Seasonally Adjusted Canadian Gas Production",
    y = "Seasonally Adjusted Volume (Billions of Cubic Metres)",
    x = "Year"
  )
For retail time series, use the below code:
# Fix the RNG seed so the randomly drawn series is reproducible, then
# keep the rows of aus_retail whose `Series ID` equals one sampled ID.
set.seed(12345678)
myseries <- filter(
  aus_retail,
  `Series ID` == sample(aus_retail[["Series ID"]], 1)
)
# Inspect the columns of the selected series
myseries %>% glimpse()
## Rows: 369
## Columns: 5
## Key: State, Industry [1]
## $ State <chr> "Northern Territory", "Northern Territory", "Northern Terr…
## $ Industry <chr> "Clothing, footwear and personal accessory retailing", "Cl…
## $ `Series ID` <chr> "A3349767W", "A3349767W", "A3349767W", "A3349767W", "A3349…
## $ Month <mth> 1988 Apr, 1988 May, 1988 Jun, 1988 Jul, 1988 Aug, 1988 Sep…
## $ Turnover <dbl> 2.3, 2.9, 2.6, 2.8, 2.9, 3.0, 3.1, 3.0, 4.2, 2.7, 2.5, 2.4…
# Time plot of Turnover for the selected retail series
autoplot(myseries, Turnover) +
  labs(
    title = "Randomly Selected Australian Retail Time Series",
    x = "Year",
    y = "Turnover (Million AUD)"
  )
Create a training dataset consisting of observations before 2011
# Training set: every observation whose calendar year is earlier than 2011
myseries_train <- filter(myseries, year(Month) < 2011)
# First and last month retained in the training set
range(myseries_train[["Month"]])
## <yearmonth[2]>
## [1] "1988 Apr" "2010 Dec"
# Time plot of the training portion only
autoplot(myseries_train, Turnover) +
  labs(
    title = "Training Data: Retail Turnover (Before 2011)",
    x = "Year",
    y = "Turnover (Million AUD)"
  )
Check that your data have been split appropriately by producing the following plot.
# Overlay the training data on the full series to check the split.
# The colours are mapped inside aes() so that ggplot2 builds a legend;
# a constant `colour = "red"` outside aes() creates no colour scale, which
# would leave scale_colour_manual() and guides() with nothing to act on.
ggplot(mapping = aes(x = Month, y = Turnover)) +
  geom_line(data = myseries, aes(colour = "Full Series")) +
  geom_line(data = myseries_train, aes(colour = "Training Data")) +
  scale_colour_manual(
    name = "Series",
    values = c("Full Series" = "black", "Training Data" = "red")
  ) +
  labs(
    title = "Retail Turnover: Full Series vs Training Data",
    y = "Turnover (Million AUD)",
    x = "Year"
  )
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
#Answer:
# Seasonal naive benchmark estimated on the training data only
fit <- model(myseries_train, SNAIVE(Turnover))
# Print the fitted model's summary
fit %>% report()
## Series: Turnover
## Model: SNAIVE
##
## sigma^2: 1.2856
Check the residuals.
# 4d Answer:
# Do the residuals appear to be uncorrelated and normally distributed?
# Answ:
# gg_tsresiduals() draws three diagnostics of the innovation residuals:
# a time plot, the ACF, and a histogram.
# - Time plot / mean: the training-set accuracy table in this report gives
#   ME = 0.439, so the residuals are not centred on zero. The seasonal
#   naive method ignores the upward trend and under-forecasts on average.
# - ACF: the same table reports ACF1 = 0.768, a large lag-1
#   autocorrelation, so the residuals are NOT uncorrelated.
# - Histogram: judge shape and symmetry from the plot. With a non-zero
#   mean, the distribution is at best shifted away from zero.
# Conclusion: the residuals do not resemble white noise. SNAIVE captures
# the seasonal pattern but leaves trend information in the residuals, so
# it is a benchmark rather than an adequate model.

# Visual residual diagnostics
fit %>% gg_tsresiduals()

# Formal portmanteau test for remaining autocorrelation. For seasonal
# data a common choice is lag = 2 * seasonal period (2 * 12 = 24 here).
# A small p-value confirms the residuals are distinguishable from white noise.
fit %>%
  augment() %>%
  features(.innov, ljung_box, lag = 24)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Produce forecasts for the test data with given code below:
# 4e Answer:
# The test set is whatever rows of myseries are absent from the training
# set (anti_join); the fitted SNAIVE model is forecast over those rows.
fc <- forecast(
  fit,
  new_data = anti_join(myseries, myseries_train)
)
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
# Draw the forecasts on top of the complete observed series
autoplot(fc, myseries) +
  labs(
    title = "SNaïve Forecast vs Actual Retail Turnover",
    x = "Year",
    y = "Turnover (Million AUD)"
  )
Joining, by = c(“State”, “Industry”, “Series ID”, “Month”, “Turnover”)
Compare the accuracy of your forecasts against the actual values with given code below:
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
#Interpretation:
#The first output (fit %>% accuracy()) shows how well the model fits the training data
#The second output (fc %>% accuracy(myseries)) shows how well the model predicts the test data (from 2011 onward).
#The out-of-sample errors (from fc) are generally larger than in-sample errors, which is expected. The SNaïve model captures the seasonal pattern but does not adjust for trend or other dynamics — so while simple, it serves as a benchmark model.
How sensitive are the accuracy measures to the amount of training data used?
# 4g Answer:
# Repeat the exercise with a shorter history: train on observations
# before 2006 and forecast every month from 2006 onward.
# NOTE(review): this hold-out window (2006 onward) is not the same as the
# one used for the earlier fit (2011 onward), so the two accuracy tables
# differ in test period and forecast horizon as well as training length.
short_train <- filter(myseries, year(Month) < 2006)
# Seasonal naive model on the reduced training set
short_fit <- model(short_train, SNAIVE(Turnover))
# Forecast over the remaining rows of the series
short_fc <- forecast(
  short_fit,
  new_data = filter(myseries, year(Month) >= 2006)
)
# Accuracy of those forecasts against the observed values
accuracy(short_fc, myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 3.49 3.86 3.49 26.6 26.6 3.79 3.09 0.529
#The accuracy measures are sensitive to the amount of training data:
#Less training data may not fully capture the seasonal pattern or trend, especially in time series with evolving behavior.
#Using more training data generally improves forecast accuracy because the model has a better understanding of recurring seasonal shapes and long-term changes.
#However, if the older data is outdated or not representative of current patterns, too much training data can sometimes reduce accuracy.
#Forecast accuracy typically improves with enough recent training data to capture relevant patterns, but overly long history may introduce outdated behavior. The ideal training window balances data quantity with relevance.
Create a training set for Australian takeaway food turnover (aus_retail) by withholding the last four years as a test set.
# 5a.Answer:
# Total takeaway turnover: keep the "Takeaway food services" industry and
# sum Turnover per month, giving one series indexed by Month.
takeaway <- summarise(
  filter(aus_retail, Industry == "Takeaway food services"),
  Turnover = sum(Turnover)
)
# Training set: everything up to and including calendar year 2014
train <- filter(takeaway, year(Month) <= 2014)
# Last rows of the training set, to confirm where it ends
train %>% tail()
## # A tsibble: 6 x 2 [1M]
## Month Turnover
## <mth> <dbl>
## 1 2014 Jul 1328.
## 2 2014 Aug 1335.
## 3 2014 Sep 1338.
## 4 2014 Oct 1390.
## 5 2014 Nov 1391.
## 6 2014 Dec 1494.
Fit all the appropriate benchmark methods to the training set and forecast the periods covered by the test set.
Compute the accuracy of your forecasts. Which method does best?
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 naive Test -12.4 119. 96.4 -1.49 6.66 2.30 2.25 0.613
## 2 drift Test -93.7 130. 108. -6.82 7.67 2.58 2.46 0.403
## 3 snaive Test 177. 192. 177. 11.7 11.7 4.22 3.64 0.902
## 4 mean Test 829. 838. 829. 55.7 55.7 19.8 15.8 0.613
Do the residuals from the best method resemble white noise?
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Using the code below, get a series (it gets a series randomly by using sample() function):
# Same seed as before, so the same `Series ID` is drawn again
set.seed(12345678)
myseries <- filter(
  aus_retail,
  `Series ID` == sample(aus_retail[["Series ID"]], 1)
)
Look at the head of your series to check that it is a tsibble, and remove any NAs with these commands:
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Clothing, footwear and perso… A3349767W 1988 Apr 2.3
## 2 Northern Territory Clothing, footwear and perso… A3349767W 1988 May 2.9
## 3 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jun 2.6
## 4 Northern Territory Clothing, footwear and perso… A3349767W 1988 Jul 2.8
## 5 Northern Territory Clothing, footwear and perso… A3349767W 1988 Aug 2.9
## 6 Northern Territory Clothing, footwear and perso… A3349767W 1988 Sep 3
Run a linear regression of Turnover on trend.(Hint: use TSLM() and trend() functions)
See the regression result by report() command.
## Series: Turnover
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0795 -1.1704 -0.1640 0.9683 7.4514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5313376 0.1983464 17.80 <2e-16 ***
## trend() 0.0307747 0.0009291 33.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 367 degrees of freedom
## Multiple R-squared: 0.7493, Adjusted R-squared: 0.7486
## F-statistic: 1097 on 1 and 367 DF, p-value: < 2.22e-16
By using this model, forecast it for the next 3 years. What are the values of the next 3 years, monthly values?
## # A fable: 36 x 6 [1M]
## # Key: State, Industry, .model [1]
## State Industry .model Month
## <chr> <chr> <chr> <mth>
## 1 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Jan
## 2 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Feb
## 3 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Mar
## 4 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Apr
## 5 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 May
## 6 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Jun
## 7 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Jul
## 8 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Aug
## 9 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Sep
## 10 Northern Territory Clothing, footwear and personal accessory… TSLM(… 2019 Oct
## # ℹ 26 more rows
## # ℹ 2 more variables: Turnover <dist>, .mean <dbl>
Plot the forecast values along with the original data.
# Passing the original series to autoplot(), as in autoplot(myseries),
# draws the forecasts together with the observed data, so no separate
# autolayer() call is needed.
# Calling autoplot() with no data argument would draw only the forecasts,
# which does not answer "plot the forecast values along with the
# original data". The original series is therefore supplied here.
fit %>%
  forecast(h = 36) %>%
  autoplot(myseries)
Get the residuals from the model. And check the residuals to check whether or not it satisfies the requirements for white noise error terms.(hint: augment() and gg_tsresiduals() functions)
Half-hourly electricity demand for Victoria, Australia is contained in vic_elec. Extract the January 2014 electricity demand, and aggregate this data to daily with daily total demands and maximum temperatures. Run the code below:
# January 2014 only, collapsed from the original Time index to one row
# per calendar date: total Demand and maximum Temperature for each day.
jan_vic_elec <- summarise(
  index_by(
    filter(vic_elec, yearmonth(Time) == yearmonth("2014 Jan")),
    Date = as_date(Time)
  ),
  Demand = sum(Demand),
  Temperature = max(Temperature)
)
Plot the data and find the regression model for Demand with temperature as a predictor variable. Why is there a positive relationship?
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
#It is clear that a positive relationship exists for this data. It is largely driven by days with high temperatures, which result in more electricity being demanded (presumably for cooling).
Produce a residual plot. Is the model adequate? Are there any outliers or influential observations?
Use the model to forecast the electricity demand that you would expect for the next day if the maximum temperature was 15°C, and compare it with the forecast if the maximum temperature was 35°C. Do you believe these forecasts?
# 7c.Answer:
library(fpp3)

# Step 1: January 2014 collapsed to daily totals / daily maxima
jan_vic_elec <- vic_elec %>%
  filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
  index_by(Date = as_date(Time)) %>%
  summarise(Demand = sum(Demand), Temperature = max(Temperature))

# Step 2: linear regression of daily Demand on daily maximum Temperature
fit <- model(jan_vic_elec, tslm_model = TSLM(Demand ~ Temperature))

# Step 3: two one-row scenario tables stacked together. Both start from
# the day after the data ends; the hot-day row is moved one further day
# ahead so that the two rows do not share the same index value.
next_day <- bind_rows(
  new_data(jan_vic_elec, 1) %>%
    mutate(Temperature = 15, Scenario = "Cold Day"),
  new_data(jan_vic_elec, 1) %>%
    mutate(Temperature = 35, Scenario = "Hot Day") %>%
    mutate(Date = Date + 1)
)

# Step 4: forecasts for both scenario rows
fc <- fit %>% forecast(new_data = next_day)

# Step 5: observed daily demand with the two point forecasts marked
jan_vic_elec %>%
  autoplot(Demand) +
  geom_point(
    data = fc,
    mapping = aes(x = Date, y = .mean, colour = Scenario),
    size = 3
  ) +
  labs(
    title = "Forecast: Electricity Demand on Cold vs Hot Days",
    x = "Date",
    y = "Daily Demand (MW)"
  ) +
  scale_colour_manual(values = c("Cold Day" = "blue", "Hot Day" = "red"))
## # A tibble: 2 × 3
## Scenario Temperature .mean
## <chr> <dbl> <dbl>
## 1 Cold Day 15 151398.
## 2 Hot Day 35 274484.
#Using the linear regression model Demand ~ Temperature, I forecasted electricity demand for two hypothetical next-day scenarios:
#Cold Day (15°C)
#Hot Day (35°C)
#The model predicts higher electricity demand on the hot day, which is consistent with real-world behavior in summer — when air conditioning usage increases sharply with temperature.
#Do I believe these forecasts?
#yes, they are reasonable. The model reflects a clear and logical pattern: hotter temperatures lead to higher demand. However, the model is based only on January 2014, so for more reliable forecasting, it would be better to train the model on a larger and more varied dataset.
Do you believe these forecasts? The following R code will get you started:
# Fit Demand ~ Temperature, forecast one step ahead assuming a maximum
# temperature of 15, and draw the forecast on top of the observed data.
autoplot(
  forecast(
    model(jan_vic_elec, TSLM(Demand ~ Temperature)),
    new_data = mutate(new_data(jan_vic_elec, 1), Temperature = 15)
  ),
  jan_vic_elec
)
Give prediction intervals for your forecasts.
## # A tsibble: 2 x 6 [1D]
## Date Demand
## <date> <dist>
## 1 2014-02-01 N(151398, 6.8e+08)
## 2 2014-02-02 N(274484, 6.4e+08)
## # ℹ 4 more variables: .mean <dbl>, Temperature <dbl>, Scenario <chr>,
## # `95%` <hilo>
Read the shampoo data given in excel (Import Dataset as Excel)
#a. View the shampoo sales data. How many variables are there? Find how many rows and columns in the data?
library(readxl)
# Read the workbook through its full path instead of calling setwd().
# Changing the working directory is a global side effect that silently
# alters how every later relative path is resolved.
# NOTE(review): the directory is still machine-specific; adjust it, or
# use a path relative to the project, when running elsewhere.
shampoo <- read_excel(
  file.path("C:/Users/pooja/OneDrive/Desktop/RAssignment2", "shampoo-2.xlsx")
)
#View(shampoo)
#b. Is the data annual, monthly, quarterly?
# str() shows the column types; the Month values step one month at a time.
str(shampoo) #data frame
## tibble [36 × 2] (S3: tbl_df/tbl/data.frame)
## $ Month: POSIXct[1:36], format: "1995-01-01" "1995-02-01" ...
## $ sales: num [1:36] 266 146 183 119 180 ...
## tibble [36 × 2] (S3: tbl_df/tbl/data.frame)
## $ Month: POSIXct[1:36], format: "1995-01-01" "1995-02-01" ...
## $ sales: num [1:36] 266 146 183 119 180 ...
#c. Convert the data into tibble , then tsibble
# Add a yearmonth index (MONTHLY) built from Month, declare it as the
# tsibble index, then drop the original Month column.
mydata <- shampoo %>%
  mutate(MONTHLY = yearmonth(Month)) %>%
  as_tsibble(index = MONTHLY) %>%
  select(-Month)
# Structure of the resulting tsibble
mydata %>% str()
## tbl_ts [36 × 2] (S3: tbl_ts/tbl_df/tbl/data.frame)
## $ sales : num [1:36] 266 146 183 119 180 ...
## $ MONTHLY: mth [1:36] 1995 Jan, 1995 Feb, 1995 Mar, 1995 Apr, 1995 May, 1995 Jun,...
## - attr(*, "key")= tibble [1 × 1] (S3: tbl_df/tbl/data.frame)
## ..$ .rows: list<int> [1:1]
## .. ..$ : int [1:36] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..@ ptype: int(0)
## - attr(*, "index")= chr "MONTHLY"
## ..- attr(*, "ordered")= logi TRUE
## - attr(*, "index2")= chr "MONTHLY"
## - attr(*, "interval")= interval [1:1] 1M
## ..@ .regular: logi TRUE
## # A tsibble: 36 x 2 [1M]
## sales MONTHLY
## <dbl> <mth>
## 1 266 1995 Jan
## 2 146. 1995 Feb
## 3 183. 1995 Mar
## 4 119. 1995 Apr
## 5 180. 1995 May
## 6 168. 1995 Jun
## 7 232. 1995 Jul
## 8 224. 1995 Aug
## 9 193. 1995 Sep
## 10 123. 1995 Oct
## # ℹ 26 more rows
#d. Plot the shampoo sales. What do you see from the data pattern? What does x-axis represent?
# Base graphics version. Month is supplied as the x variable so the axis
# shows calendar time (1995-1997) rather than the observation number
# 1..36, and the axes and title are labelled.
plot(
  shampoo$Month, shampoo$sales,
  type = "l",
  xlab = "Month",
  ylab = "Shampoo sales",
  main = "Monthly Shampoo Sales, 1995-1997"
)
# autoplot() version, using the tsibble built from the same data.
mydata %>%
  autoplot(sales) +
  labs(
    title = "Monthly Shampoo Sales, 1995-1997",
    y = "Shampoo sales",
    x = "Month"
  )
# Comment: sales rise over the three years (upward trend) with sizeable
# month-to-month fluctuation. The x-axis represents time in months.
#e. What is the average, and median of shampoo sales. Put it on a histogram.
meanSales <- mean(shampoo$sales)
meanSales
## [1] 312.6
## Warning in geom_histogram(bin = 10): Ignoring unknown parameters: `bin`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#f. Get seasonal plot. What do you see / is there any pattern, is there any seasonality?
# Seasonal plot. Yes, there is a trend: 1997 is higher than 1996, which is higher than 1995.
gg_season(mydata, sales)
# Subseries plot. There appears to be a monthly effect: sales are higher in the July-December interval.
gg_subseries(mydata, sales)
#g. Get a linear regression line with trend and dummy for each month (Hint: use trend and season in regression equation).
fit <- model(mydata, TSLM(sales ~ trend() + season()))
#h. Comment on each estimated coefficient of the model. Are they statistically significant at 5 % significance level?
fit %>% report()
## Series: sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -129.60 -62.32 -4.84 53.76 152.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.867 55.740 2.043 0.0527 .
## trend() 11.754 1.534 7.664 8.88e-08 ***
## season()year2 -33.154 73.630 -0.450 0.6567
## season()year3 -53.808 73.678 -0.730 0.4726
## season()year4 -24.628 73.757 -0.334 0.7415
## season()year5 -56.015 73.869 -0.758 0.4560
## season()year6 -27.802 74.012 -0.376 0.7106
## season()year7 7.244 74.187 0.098 0.9231
## season()year8 -37.043 74.393 -0.498 0.6233
## season()year9 27.536 74.629 0.369 0.7155
## season()year10 -32.518 74.897 -0.434 0.6682
## season()year11 9.895 75.194 0.132 0.8964
## season()year12 -4.259 75.522 -0.056 0.9555
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 90.16 on 23 degrees of freedom
## Multiple R-squared: 0.7592, Adjusted R-squared: 0.6336
## F-statistic: 6.043 on 12 and 23 DF, p-value: 0.00011612
#i. Which month has the highest sales?
# From the regression table: the largest seasonal coefficient is
# season()year9 (September, +27.5 relative to January), followed by
# season()year11 (November, +9.9). December's coefficient is slightly
# negative (-4.3). Once the monthly trend (+11.75 per month) is added,
# the highest fitted values within a year fall in November, then
# December, then September. The late-year peak is therefore driven mainly
# by the trend. None of the seasonal coefficients is significant at the
# 5% level (all p-values are above 0.45).
#j. Forecast it for the next year. What are the values
forecasts <- fit %>% forecast(h = 12)
forecasts
## # A fable: 12 x 4 [1M]
## # Key: .model [1]
## .model MONTHLY
## <chr> <mth>
## 1 TSLM(sales ~ trend() + season()) 1998 Jan
## 2 TSLM(sales ~ trend() + season()) 1998 Feb
## 3 TSLM(sales ~ trend() + season()) 1998 Mar
## 4 TSLM(sales ~ trend() + season()) 1998 Apr
## 5 TSLM(sales ~ trend() + season()) 1998 May
## 6 TSLM(sales ~ trend() + season()) 1998 Jun
## 7 TSLM(sales ~ trend() + season()) 1998 Jul
## 8 TSLM(sales ~ trend() + season()) 1998 Aug
## 9 TSLM(sales ~ trend() + season()) 1998 Sep
## 10 TSLM(sales ~ trend() + season()) 1998 Oct
## 11 TSLM(sales ~ trend() + season()) 1998 Nov
## 12 TSLM(sales ~ trend() + season()) 1998 Dec
## # ℹ 2 more variables: sales <dist>, .mean <dbl>
# it is autocorrelation at 1 lag.
### Answ: not white noise.
#m. By using the regression model, forecast the 1 year ahead, and then check the accuracy of the forecast. What is MSE, RMSE values?
# accuracy() on a fitted model reports training-set (in-sample) measures.
# No observations exist beyond the sample, so a one-year-ahead forecast
# cannot be scored out of sample without first holding data back.
# accuracy() does not return MSE, so it is derived as RMSE squared.
accuracy(fit) %>%
  mutate(MSE = RMSE^2)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(sales ~ tren… Trai… -5.53e-15 72.1 59.6 -4.55 23.1 0.388 0.378 -0.0363