library(forecast)
## Warning: package 'forecast' was built under R version 4.4.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.4.2
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.5.1 ✔ expsmooth 2.3
## ✔ fma 2.5
## Warning: package 'fma' was built under R version 4.4.2
## Warning: package 'expsmooth' was built under R version 4.4.2
##
library(ggplot2)
library(readxl)
library(gridExtra)
usnetelec
usgdp
mcopper
enplanements
data("usnetelec")
data("usgdp")
data("mcopper")
data("enplanements")
# Check for missing values
if (anyNA(usnetelec) | anyNA(usgdp) | anyNA(mcopper) | anyNA(enplanements)) {
stop("Error: One or more datasets contain missing values. Handle them before transformation.")
}
# Ensure all values are positive (Box-Cox transformation requires positive values)
if (any(usnetelec <= 0) | any(usgdp <= 0) | any(mcopper <= 0) | any(enplanements <= 0)) {
usnetelec <- usnetelec + 1
usgdp <- usgdp + 1
mcopper <- mcopper + 1
enplanements <- enplanements + 1
}
# Create a function to apply Box-Cox transformation and plot
plot_boxcox <- function(data, name) {
lambda <- BoxCox.lambda(data)
transformed_data <- BoxCox(data, lambda)
df <- data.frame(Time = time(data), Value = transformed_data)
ggplot(df, aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle(paste("Box-Cox Transformation:", name, "\nLambda =", round(lambda, 2))) +
theme_minimal() +
xlab("Time") +
ylab("Transformed Value")
}
# Generate plots for each dataset
p1 <- plot_boxcox(usnetelec, "usnetelec")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p2 <- plot_boxcox(usgdp, "usgdp")
p3 <- plot_boxcox(mcopper, "mcopper")
p4 <- plot_boxcox(enplanements, "enplanements")
# Display plots
grid.arrange(p1, p2, p3, p4, ncol = 2)
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
### Observations
# Load dataset
data("cangas")
# Box-Cox transformation
lambda <- BoxCox.lambda(cangas)
transformed_data <- BoxCox(cangas, lambda)
# Create data frames for plotting
df_original <- data.frame(Time = time(cangas), Value = as.numeric(cangas))
df_transformed <- data.frame(Time = time(cangas), Value = as.numeric(transformed_data))
# Plot original data
p1 <- ggplot(df_original, aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle("Original Data: cangas") +
theme_minimal() +
xlab("Time") +
ylab("Value")
# Plot Box-Cox transformed data
p2 <- ggplot(df_transformed, aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle(paste("Box-Cox Transformed: cangas \nLambda =", round(lambda, 2))) +
theme_minimal() +
xlab("Time") +
ylab("Transformed Value")
# Display plots side by side
grid.arrange(p1, p2, ncol = 2)
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
The original cangas data shows a clear upward trend with increasing variance. After applying the Box-Cox transformation (λ = 0.58), the variance is slightly stabilized, but the overall trend and fluctuations still remain. The transformation doesn’t fully remove the increasing spread of values, especially in the later years, meaning it doesn’t completely fix the variance issue. The Box-Cox transformation works best when variance increases proportionally with the mean, but in this case, the cangas data has both trend and seasonality that aren’t just due to variance changes. The transformation helps a little but doesn’t fully make the series stationary. A better approach might be differencing the data to remove the trend before applying transformations.
# Define file path (ensure correct escaping of backslashes)
file_path <- "C:/Users/Admin/Downloads/retail.xlsx"
# Read the dataset, skipping the first row
retaildata <- read_excel(file_path, skip = 1)
# Define the column name (update this if the column name is different)
col_name <- "A3349337W"
# Check if column exists
if (!col_name %in% colnames(retaildata)) {
stop("Error: Column name not found in the dataset. Check the correct column name.")
}
# Convert the data to a time series object
myts <- ts(retaildata[[col_name]], frequency = 12, start = c(1982, 4))
# Plot the time series without custom colors
ggplot(data = data.frame(Time = time(myts), Value = as.numeric(myts)), aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle("Retail Time Series Data") +
xlab("Year") +
ylab("Value") +
theme_minimal()
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
ggseasonplot(myts)
ggsubseriesplot(myts)
gglagplot(myts)
ggAcf(myts)
(lambda <- BoxCox.lambda(myts))
## [1] 0.9165544
autoplot(BoxCox(myts,lambda))
### Observations - The Box-Cox lambda value is
0.92, which is close to 1, meaning the data is
already close to being normally distributed.
- The original time series shows a strong upward trend and
seasonal variations, suggesting that variance increases
over time.
- The Box-Cox transformation slightly stabilizes the variance but does
not significantly change the overall trend or pattern.
- Since λ is close to 1, applying a transformation like
log or square root wouldn’t make a drastic
difference.
- A log transformation (log(x)
) or power
transformation (x^0.92
) could still help, but
differencing might be more effective in handling the trend.
dole
, usdeaths
, bricksq
.# Load datasets
data("dole")
data("usdeaths")
data("bricksq")
# Create function for time series plots
plot_series <- function(series, name) {
df <- data.frame(Time = time(series), Value = as.numeric(series))
ggplot(df, aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle(paste("Time Series Plot of", name)) +
theme_minimal() +
xlab("Time") +
ylab("Value")
}
# Plot original data
p1 <- plot_series(dole, "Dole")
p2 <- plot_series(usdeaths, "US Deaths")
p3 <- plot_series(bricksq, "Bricksq")
# Arrange plots in a grid
grid.arrange(p1, p2, p3, ncol = 1)
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
# Load datasets
data("dole")
data("usdeaths")
data("bricksq")
# Compute optimal lambda for each dataset
(lambda_dole <- BoxCox.lambda(dole))
## [1] 0.3290922
(lambda_usdeaths <- BoxCox.lambda(usdeaths))
## [1] -0.03363775
(lambda_bricksq <- BoxCox.lambda(bricksq))
## [1] 0.2548929
# Apply Box-Cox transformation
dole_transformed <- BoxCox(dole, lambda_dole)
usdeaths_transformed <- BoxCox(usdeaths, lambda_usdeaths)
bricksq_transformed <- BoxCox(bricksq, lambda_bricksq)
# Function to plot transformed data
plot_series <- function(series, name) {
df <- data.frame(Time = time(series), Value = as.numeric(series))
ggplot(df, aes(x = Time, y = Value)) +
geom_line(size = 1) +
ggtitle(paste("Box-Cox Transformed:", name)) +
theme_minimal() +
xlab("Time") +
ylab("Transformed Value")
}
# Plot transformed data
p4 <- plot_series(dole_transformed, "Dole (Box-Cox)")
p5 <- plot_series(usdeaths_transformed, "US Deaths (Box-Cox)")
p6 <- plot_series(bricksq_transformed, "Bricksq (Box-Cox)")
# Arrange transformed plots in a grid
grid.arrange(p4, p5, p6, ncol = 1)
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## Don't know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
### Observations
Overall: The Box-Cox transformation helps Dole and Bricksq by stabilizing variance. However, for US Deaths, it has little impact since the data was already relatively stable.
# Load Australian beer production dataset
data("ausbeer")
# Select data from 1992 onwards
beer <- window(ausbeer, start=1992)
# Apply a seasonal naïve forecast
fc <- snaive(beer)
# Plot the forecast
autoplot(fc) +
ggtitle("Seasonal Naïve Forecast of Australian Beer Production") +
xlab("Year") +
ylab("Beer Production") +
theme_minimal()
# Extract and plot residuals
res <- residuals(fc)
autoplot(res) +
ggtitle("Residuals of the Seasonal Naïve Forecast") +
xlab("Year") +
ylab("Residual Value") +
theme_minimal()
# Perform diagnostic checks
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 32.269, df = 8, p-value = 8.336e-05
##
## Model df: 0. Total lags used: 8
False.
- While normality is desirable, the most important criterion is that
residuals should be uncorrelated and have constant variance
(homoscedasticity). - If residuals are correlated, it suggests
that there is patterned information left in the data
that the model did not capture. - Many forecasting models still perform
well even if the residuals are not perfectly normal.
False.
- Small residuals on the training data do not guarantee good
forecasts. - A model may have small residuals because it
overfits the training data but performs poorly on new
data. - The ability to generalize is more important than just minimizing
residuals on past observations.
False.
- MAPE (Mean Absolute Percentage Error) is widely used
but has limitations, especially when dealing with
zero or near-zero values. - Other metrics like
RMSE (Root Mean Squared Error), MAE (Mean Absolute Error), and
SMAPE (Symmetric MAPE) may be more appropriate depending on the
context.
False.
- A more complex model is not always better and may
lead to overfitting. - Instead, one should: - Check for
simpler alternative models. - Tune
hyperparameters. - Use regularization
techniques. - Consider more relevant features rather
than increasing complexity.
False.
- Forecast accuracy is important, but other factors also
matter, such as: - Interpretability: A simpler
model is often preferred in real-world applications. -
Computational cost: Complex models can be difficult to
implement in production. - Robustness: Some models
might work well in one test set but fail in a different scenario. - A
balance between accuracy, interpretability, and
generalizability is essential.
# Load the dataset
file_path <- "C:/Users/Admin/Downloads/retail.xlsx"
retaildata <- read_excel(file_path, skip = 1)
# Define column name (update if necessary)
col_name <- "A3349337W"
# Validate column exists
if (!col_name %in% colnames(retaildata)) {
stop("Error: Column not found in dataset. Check column name.")
}
# Convert to time series
myts <- ts(retaildata[[col_name]], frequency = 12, start = c(1982, 4))
# Split into training (up to Dec 2010) and test (from Jan 2011 onwards)
myts.train <- window(myts, end=c(2010,12))
myts.test <- window(myts, start=2011)
# Plot train and test sets
autoplot(myts) +
autolayer(myts.train, series="Training") +
autolayer(myts.test, series="Test") +
ggtitle("Retail Time Series: Train vs. Test Split") +
xlab("Year") +
ylab("Sales") +
theme_minimal() +
guides(colour=guide_legend(title="Data Split"))
# Fit a seasonal naïve model
fc <- snaive(myts.train)
# Plot the forecast
autoplot(fc) +
ggtitle("Seasonal Naïve Forecast for Retail Sales") +
xlab("Year") +
ylab("Sales") +
theme_minimal()
# Compute accuracy against the test set
accuracy(fc, myts.test)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 9.460661 26.30758 21.23363 4.655690 12.762886 1.0000000 0.8070166
## Test set 17.212500 21.26067 17.39583 4.748234 4.807728 0.8192584 0.4843871
## Theil's U
## Training set NA
## Test set 0.6934111
# Check residuals for normality and autocorrelation
checkresiduals(fc)
##
## Ljung-Box test
##
## data: Residuals from Seasonal naive method
## Q* = 856.11, df = 24, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 24
visnights
contains quarterly visitor nights (in
millions) from 1998 to 2016 for twenty regions of Australia.# Load dataset
data("visnights")
# Define training sets
train1 <- window(visnights[, "QLDMetro"], end = c(2015, 4)) # Omitting last 1 year
train2 <- window(visnights[, "QLDMetro"], end = c(2014, 4)) # Omitting last 2 years
train3 <- window(visnights[, "QLDMetro"], end = c(2013, 4)) # Omitting last 3 years
# Define corresponding test sets
test1 <- window(visnights[, "QLDMetro"], start = c(2016, 1)) # 1-year test
test2 <- window(visnights[, "QLDMetro"], start = c(2015, 1)) # 2-year test
test3 <- window(visnights[, "QLDMetro"], start = c(2014, 1)) # 3-year test
# Fit seasonal naïve models
fc1 <- snaive(train1, h=4)
fc2 <- snaive(train2, h=4)
fc3 <- snaive(train3, h=4)
# Plot forecasts
p1 <- autoplot(fc1) + ggtitle("Forecast from Train1") + theme_minimal()
p2 <- autoplot(fc2) + ggtitle("Forecast from Train2") + theme_minimal()
p3 <- autoplot(fc3) + ggtitle("Forecast from Train3") + theme_minimal()
# Arrange plots in a grid
grid.arrange(p1, p2, p3, ncol = 1)
# Compute accuracy metrics for each forecast
acc1 <- accuracy(fc1, test1)
acc2 <- accuracy(fc2, test2)
acc3 <- accuracy(fc3, test3)
# Create a data frame to compare accuracy metrics
accuracy_df <- data.frame(
Training_Set = c("Train1 (Omitting 1 year)", "Train2 (Omitting 2 years)", "Train3 (Omitting 3 years)"),
MAPE = c(acc1["Test set", "MAPE"], acc2["Test set", "MAPE"], acc3["Test set", "MAPE"]),
RMSE = c(acc1["Test set", "RMSE"], acc2["Test set", "RMSE"], acc3["Test set", "RMSE"]),
MAE = c(acc1["Test set", "MAE"], acc2["Test set", "MAE"], acc3["Test set", "MAE"])
)
# Display the accuracy table
knitr::kable(accuracy_df, caption = "Comparison of Forecast Accuracy Metrics")
Training_Set | MAPE | RMSE | MAE |
---|---|---|---|
Train1 (Omitting 1 year) | 6.159821 | 0.9358727 | 0.7094002 |
Train2 (Omitting 2 years) | 3.057463 | 0.4117902 | 0.3133488 |
Train3 (Omitting 3 years) | 8.476977 | 1.0586583 | 0.8625501 |
# Visualization of MAPE values
ggplot(accuracy_df, aes(x = Training_Set, y = MAPE, fill = Training_Set)) +
geom_bar(stat = "identity", color = "black") +
ggtitle("MAPE Comparison Across Training Sets") +
xlab("Training Set") +
ylab("MAPE (%)") +
theme_minimal() +
theme(legend.position = "none")