Task 3 | Regression vs Time Series

Ekonometrika

In this practicum case study, we will analyze sales data from a retail store and develop forecasting models using both regression and time series approaches. The goal is to predict future sales revenue accurately, allowing the store to optimize inventory management and resource allocation.

Dataset

The dataset is described as historical sales data collected over the past two years; the simulated sample generated below covers 100 daily observations starting on 2022-01-01. It includes information on:

  • Date: Date of Sale
  • Promotional_Spending: Amount of Promotional Spending
  • Price: Product Price
  • Weather_Conditions: Weather Condition
  • Sales_Revenue: Total Sales Revenue
# Load required libraries
library(tidyverse)
library(lubridate)

# Set seed for reproducibility
set.seed(123)

# Number of daily observations to simulate
n <- 100

# Simulate the date range.
# The dates are derived from `n` (via length.out) rather than from a
# hard-coded end date, so `dates` always has exactly `n` elements and cannot
# drift out of step with the other simulated columns when `n` is changed.
# For n = 100 this spans 2022-01-01 to 2022-04-10.
start_date <- ymd("2022-01-01")
dates <- seq(start_date, by = "day", length.out = n)
end_date <- dates[n]

# Simulate predictor variables.
# The order of the random draws (runif, rnorm, sample, then the noise rnorm
# further down) determines the values produced under the seed above, so it
# must not be rearranged.
promotional_spending <- runif(n, min = 1000, max = 5000)
price <- rnorm(n, mean = 50, sd = 10)
weather_conditions <- sample(
  c("sunny", "cloudy", "rainy"),
  size = n,
  replace = TRUE
)

# Simulate sales revenue as baseline + linear trend + sine cycle + noise.
# Note that none of the predictors simulated above enter this formula.
day_index <- seq_len(n)
sales_trend <- 0.1 * day_index
# The sine term completes one cycle every 365 / 7 (about 52) days.
seasonal_pattern <- sin(day_index * 2 * pi / 365 * 7) * 100
sales_noise <- rnorm(n, mean = 0, sd = 100)
sales_revenue <- 1000 + sales_trend + seasonal_pattern + sales_noise

# Create dataframe
simulated_data <- tibble(
  Date = dates,
  Promotional_Spending = promotional_spending,
  Price = price,
  Weather_Conditions = weather_conditions,
  Sales_Revenue = sales_revenue
)

# Display the first few rows of the dataset
head(simulated_data)

Develop a Regression Model

Develop a regression model to understand the relationship between sales revenue and various predictors such as promotional spending, pricing, and external factors.

# Load required library
library(tidyverse)

# Fit linear regression model: Sales_Revenue explained by promotional
# spending, price and weather.
# Weather_Conditions is a character column, so lm() expands it into dummy
# variables; in the summary output it appears as the "rainy" and "sunny"
# terms, with "cloudy" as the reference level.
model <- lm(Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions, data = simulated_data)

# Summarize the model: coefficient estimates with standard errors and
# p-values, residual standard error, R-squared and the overall F-test.
summary(model)
## 
## Call:
## lm(formula = Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions, 
##     data = simulated_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340.29  -71.58   11.21   82.72  315.61 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             992.92772   81.16696  12.233   <2e-16 ***
## Promotional_Spending      0.01922    0.01197   1.605    0.112    
## Price                    -0.64144    1.42687  -0.450    0.654    
## Weather_Conditionsrainy -22.74556   36.17645  -0.629    0.531    
## Weather_Conditionssunny -23.58060   32.57721  -0.724    0.471    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 134.7 on 95 degrees of freedom
## Multiple R-squared:  0.03842,    Adjusted R-squared:  -0.002062 
## F-statistic: 0.9491 on 4 and 95 DF,  p-value: 0.4393

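Catatan: ringkasan model di atas menunjukkan bahwa tidak ada prediktor yang signifikan pada taraf 5% (p-value uji F = 0,4393; R-squared = 0,038). Selanjutnya kita memuat pustaka yang diperlukan dan membuat plot diagnostik residual untuk model regresi tersebut.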

# Load required libraries
library(ggplot2)
library(ggfortify)
library(gridExtra)
library(cowplot)

# Keep the regression model's in-sample predictions alongside the data
simulated_data$Predicted_Sales <- predict(model)

# Residuals of the regression model, computed once and reused by every panel
model_residuals <- resid(model)

# Panel 1: residuals against fitted values, with a dashed zero reference line
residual_plot <- ggplot(
  simulated_data,
  aes(x = Predicted_Sales, y = model_residuals)
) +
  geom_point(color = "#77B0AA") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#8B322C") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted"
  ) +
  theme_minimal()

# Panel 2: histogram of the residuals (bins of width 100)
residual_distribution <- ggplot(simulated_data, aes(x = model_residuals)) +
  geom_histogram(binwidth = 100, fill = "#436850", color = "#12372A") +
  labs(
    x = "Residuals",
    y = "Frequency",
    title = "Distribution of Residuals"
  ) +
  theme_minimal()

# Panel 3: QQ plot of the residuals with its reference line
qq_plot <- ggplot(simulated_data, aes(sample = model_residuals)) +
  stat_qq(color = "#CAA6A6") +
  stat_qq_line(color = "#9B4444") +
  labs(title = "QQ Plot of Residuals") +
  theme_minimal()

# Arrange the three panels on a two-row grid
plot_grid(residual_plot, residual_distribution, qq_plot, nrow = 2)

Build a Time Series

Build a time series model to capture the temporal patterns and trends in sales revenue, accounting for seasonality and other time-related effects.

# Load required libraries
library(forecast)
library(gridExtra)

# Convert Date column to Date format if not already
simulated_data$Date <- as.Date(simulated_data$Date)

# Represent sales revenue as a time series with a period of 7 observations
# (assumed weekly seasonality for daily data). The series carries no calendar
# dates: its time index counts 7-day periods.
sales_ts <- ts(simulated_data$Sales_Revenue, frequency = 7)

# Fit ARIMA model (model orders are selected automatically)
arima_model <- auto.arima(sales_ts)

# Actual vs. fitted plot.
# autoplot(forecast(...)) on its own draws only the observed series and the
# forecast, so the in-sample fitted values are added as an explicit layer to
# make the plot match its title. The x axis is labelled in weeks because the
# time index of `sales_ts` counts 7-day periods rather than dates.
actual_vs_fitted <- autoplot(
  forecast(arima_model),
  main = "Actual vs Fitted",
  xlab = "Time (weeks)",
  ylab = "Sales Revenue"
) +
  autolayer(fitted(arima_model), series = "Fitted")

# Residuals plot.
# The residuals are converted from a ts object to a plain numeric vector so
# ggplot can pick a scale without guessing, and the index is built with
# seq_along(), which is also safe for an empty vector.
arima_residuals <- as.numeric(residuals(arima_model))
residuals_df <- data.frame(
  index = seq_along(arima_residuals),
  residual = arima_residuals
)
residuals_plot <- ggplot(residuals_df, aes(x = index, y = residual)) +
  geom_point(color = "#496989") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#561C24") +
  labs(title = "Residuals", x = "Index", y = "Residuals")

# Combine plots
plot_grid(actual_vs_fitted, residuals_plot, nrow = 2)

# Autocorrelation function of the sales series
acf(sales_ts)

Dilihat dari plot ACF-nya, datanya tidak bersifat musiman, tetapi bersifat siklus.

Evaluate and Compare

Evaluate and compare the performance of both models in forecasting future sales revenue.

# Evaluate both models on a hold-out set.
#
# The last `horizon` observations are kept aside as test data, both models are
# re-fitted on the remaining (training) observations, and their forecasts for
# the test period are compared with the values actually observed there.
# Comparing a forecast of *future* periods with the full historical series
# would misalign the two vectors (30 forecasts recycled against 100
# observations), and a regression "forecast" has to come from the regression
# model itself rather than from repeating the last observed value.
horizon <- 30
n_obs <- nrow(simulated_data)
stopifnot(n_obs > horizon)

train_data <- simulated_data[seq_len(n_obs - horizon), ]
test_data <- simulated_data[seq(n_obs - horizon + 1, n_obs), ]
actual_sales <- test_data$Sales_Revenue

# Compute MSE, MAE and MAPE (in percent) for equally long vectors of actual
# and predicted values; returns a named numeric vector.
forecast_errors <- function(actual, predicted) {
  stopifnot(length(actual) == length(predicted))
  error <- predicted - actual
  c(
    mse = mean(error^2),
    mae = mean(abs(error)),
    mape = mean(abs(error / actual)) * 100
  )
}

# ARIMA: fit on the training period only, then forecast the test period
arima_holdout <- auto.arima(ts(train_data$Sales_Revenue, frequency = 7))
arima_forecast <- forecast(arima_holdout, h = horizon)
arima_metrics <- forecast_errors(
  actual_sales,
  as.numeric(arima_forecast$mean)
)
arima_mse <- arima_metrics[["mse"]]
arima_mae <- arima_metrics[["mae"]]
arima_mape <- arima_metrics[["mape"]]

# Linear regression: fit on the training period only, then predict the test
# period from the test-period predictor values (available here because the
# predictors are part of the simulated data).
linear_reg_holdout <- lm(
  Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions,
  data = train_data
)
linear_reg_pred <- unname(predict(linear_reg_holdout, newdata = test_data))
linear_reg_metrics <- forecast_errors(actual_sales, linear_reg_pred)
linear_reg_mse <- linear_reg_metrics[["mse"]]
linear_reg_mae <- linear_reg_metrics[["mae"]]
linear_reg_mape <- linear_reg_metrics[["mape"]]

# Print evaluation metrics.
# The numeric values are not reproduced here: re-run (re-knit) the document
# to obtain the metrics of this hold-out evaluation.
cat("Evaluation metrics for ARIMA model:\n")
cat("MSE:", arima_mse, "\n")
cat("MAE:", arima_mae, "\n")
cat("MAPE:", arima_mape, "%\n\n")
cat("Evaluation metrics for Linear Regression model:\n")
cat("MSE:", linear_reg_mse, "\n")
cat("MAE:", linear_reg_mae, "\n")
cat("MAPE:", linear_reg_mape, "%\n")

Note

- Forecast: Kode tersebut menghasilkan peramalan untuk kedua model. Untuk ARIMA, peramalan dibuat dengan fungsi forecast().
- Menghitung Metrik Evaluasi:
  - Untuk model ARIMA:
    - Mean Squared Error (MSE)
    - Mean Absolute Error (MAE)
    - Mean Absolute Percentage Error (MAPE)
  - Untuk model regresi linear:
    - MSE
    - MAE
    - MAPE

  Metrik-metrik ini dihitung dengan membandingkan nilai-nilai yang diramalkan dengan data simulasi aktual (simulated_data$Sales_Revenue).
- Mencetak Metrik Evaluasi: Akhirnya, kode tersebut mencetak metrik evaluasi untuk kedua model menggunakan fungsi cat().