Task 3 | Regression vs Time Series
Ekonometrika
In this practicum case study, we will analyze sales data from a retail store and develop forecasting models using both regression and time series approaches. The goal is to predict future sales revenue accurately, allowing the store to optimize inventory management and resource allocation.
Dataset
The dataset consists of historical sales data collected over the past two years, including information on:
- Date: Date of Sale
- Promotional_Spending: Amount of Promotional Spending
- Price: Product Price
- Weather_Conditions: Weather Condition (sunny, cloudy, or rainy)
- Sales_Revenue: Total Sales Revenue
# Simulate a daily retail sales dataset ----
# Produces `simulated_data`, a tibble with one row per day and the columns
# Date, Promotional_Spending, Price, Weather_Conditions and Sales_Revenue.

# Load required libraries
library(tidyverse)
library(lubridate)

# Set seed for reproducibility
set.seed(123)

# Number of observations (one per day)
n <- 100

# Simulate date range. The dates are derived from `n`, so changing `n` can
# never leave the date column with a different length than the other columns.
start_date <- ymd("2022-01-01")
dates <- seq(start_date, by = "day", length.out = n)
end_date <- dates[n] # 2022-04-10 when n is 100

# Simulate predictor variables.
# NOTE: the order of the random draws (runif, rnorm, sample, rnorm) must not
# change, otherwise the same seed yields different data.
promotional_spending <- runif(n, min = 1000, max = 5000)
price <- rnorm(n, mean = 50, sd = 10)
weather_conditions <- sample(
  c("sunny", "cloudy", "rainy"),
  size = n,
  replace = TRUE
)

# Simulate sales revenue as level + trend + cycle + noise.
# Note that the predictors above do not enter this formula.
day_index <- seq_len(n)
sales_trend <- 0.1 * day_index
# Sine wave with a period of 365 / 7 (about 52) days and amplitude 100, i.e.
# a slow cycle rather than a 7-day weekly pattern.
seasonal_pattern <- sin(day_index * 2 * pi / 365 * 7) * 100
sales_noise <- rnorm(n, mean = 0, sd = 100)
sales_revenue <- 1000 + sales_trend + seasonal_pattern + sales_noise

# Create dataframe
simulated_data <- tibble(
  Date = dates,
  Promotional_Spending = promotional_spending,
  Price = price,
  Weather_Conditions = weather_conditions,
  Sales_Revenue = sales_revenue
)

# Display the first few rows of the dataset
head(simulated_data)
Develop a Regression Model
Develop a regression model to understand the relationship between sales revenue and various predictors such as promotional spending, pricing, and external factors.
# Attach the tidyverse (lm() itself comes from base R's stats package)
library(tidyverse)
# Ordinary least squares fit of sales revenue on promotional spending, price
# and weather; Weather_Conditions is a character column, so lm() expands it
# into dummy variables.
model <- lm(
  formula = Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions,
  data = simulated_data
)
# Print coefficient estimates, standard errors and goodness-of-fit statistics
summary(model)
##
## Call:
## lm(formula = Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions,
## data = simulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -340.29 -71.58 11.21 82.72 315.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 992.92772 81.16696 12.233 <2e-16 ***
## Promotional_Spending 0.01922 0.01197 1.605 0.112
## Price -0.64144 1.42687 -0.450 0.654
## Weather_Conditionsrainy -22.74556 36.17645 -0.629 0.531
## Weather_Conditionssunny -23.58060 32.57721 -0.724 0.471
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 134.7 on 95 degrees of freedom
## Multiple R-squared: 0.03842, Adjusted R-squared: -0.002062
## F-statistic: 0.9491 on 4 and 95 DF, p-value: 0.4393
Catatan: di sini kita akan memuat pustaka yang diperlukan, lalu membuat plot diagnostik residual untuk model regresi.
# Regression diagnostics ----
# Builds three ggplot objects for the linear model: residuals vs fitted
# values, a histogram of the residuals, and a normal QQ plot.

# Plotting packages
library(ggplot2)
library(ggfortify)
library(gridExtra)
library(cowplot)

# Keep the in-sample predictions alongside the observed data
simulated_data$Predicted_Sales <- predict(model)

# One row per observation: fitted value and residual of the linear model
lm_diagnostics <- data.frame(
  fitted_value = simulated_data$Predicted_Sales,
  residual = resid(model)
)

# Residuals against fitted values, with a dashed reference line at zero
residual_plot <- ggplot(lm_diagnostics) +
  aes(x = fitted_value, y = residual) +
  geom_point(color = "#77B0AA") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#8B322C") +
  labs(
    title = "Residuals vs Fitted",
    x = "Fitted Values",
    y = "Residuals"
  ) +
  theme_minimal()

# Histogram of the residuals in bins of width 100
residual_distribution <- ggplot(lm_diagnostics) +
  aes(x = residual) +
  geom_histogram(binwidth = 100, fill = "#436850", color = "#12372A") +
  labs(
    title = "Distribution of Residuals",
    x = "Residuals",
    y = "Frequency"
  ) +
  theme_minimal()

# Normal QQ plot of the residuals with its reference line
qq_plot <- ggplot(lm_diagnostics) +
  aes(sample = residual) +
  stat_qq(color = "#CAA6A6") +
  stat_qq_line(color = "#9B4444") +
  labs(title = "QQ Plot of Residuals") +
  theme_minimal()

# Arrange the three plots on a two-row grid
plot_grid(residual_plot, residual_distribution, qq_plot, nrow = 2)
Build a Time Series
Build a time series model to capture the temporal patterns and trends in sales revenue, accounting for seasonality and other time-related effects.
# Time series model ----
# Fits an ARIMA model to daily sales revenue and builds two plots:
# observed vs in-sample fitted values, and the model residuals.

# Load required libraries
library(forecast)
library(gridExtra)

# Convert Date column to Date format if not already
simulated_data$Date <- as.Date(simulated_data$Date)

# Daily sales as a ts object with 7 observations per period
# (assumes weekly seasonality)
sales_ts <- ts(simulated_data$Sales_Revenue, frequency = 7)

# Let auto.arima() select the ARIMA specification
arima_model <- auto.arima(sales_ts)

# Actual vs. fitted plot.
# The observed series is drawn against the model's in-sample fitted values on
# the real calendar dates. (Plotting forecast(arima_model) instead would show
# the history plus a future forecast, not the fit.)
n_obs <- length(sales_ts)
fit_data <- data.frame(
  Date = rep(simulated_data$Date, times = 2),
  Sales = c(as.numeric(sales_ts), as.numeric(fitted(arima_model))),
  Series = rep(c("Actual", "Fitted"), each = n_obs)
)
actual_vs_fitted <- ggplot(
  fit_data,
  aes(x = Date, y = Sales, color = Series)
) +
  geom_line() +
  labs(title = "Actual vs Fitted", x = "Date", y = "Sales Revenue")

# Residuals plot: residual of each observation in time order, with a dashed
# reference line at zero
arima_residuals <- data.frame(
  index = seq_len(n_obs),
  residuals = as.numeric(residuals(arima_model))
)
residuals_plot <- ggplot(arima_residuals, aes(x = index, y = residuals)) +
  geom_point(color = "#496989") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#561C24") +
  labs(title = "Residuals", x = "Index", y = "Residuals")

# Combine plots
plot_grid(actual_vs_fitted, residuals_plot, nrow = 2)
Dilihat dari plotnya, datanya tidak bersifat musiman, tetapi bersifat siklus.
Evaluate and Compare
Evaluate and compare the performance of both models in forecasting future sales revenue.
# Evaluate and compare forecast accuracy ----
# Both models are scored on the same hold-out window: they are refitted on all
# but the last `horizon` observations and their predictions for those last
# `horizon` days are compared with the observed sales. Every error is
# therefore computed between a prediction and the actual value of the SAME
# period, and both models are judged on data they were not fitted to.

# Forecast horizon, also used as the length of the hold-out window
horizon <- 30

# Forecast for the next 30 periods beyond the observed data (full-data model)
arima_forecast <- forecast(arima_model, h = horizon)

# Error metrics for predictions aligned element-by-element with the actuals.
# MAPE divides by `actual`, so the actual values must be non-zero.
forecast_errors <- function(actual, predicted) {
  stopifnot(length(actual) == length(predicted), length(actual) > 0)
  error <- predicted - actual
  c(
    MSE = mean(error^2),
    MAE = mean(abs(error)),
    MAPE = mean(abs(error / actual)) * 100
  )
}

# Split into training rows and the final `horizon` rows used for testing
n_total <- nrow(simulated_data)
stopifnot(n_total > horizon)
train_idx <- seq_len(n_total - horizon)
test_idx <- seq(n_total - horizon + 1, n_total)
train_data <- simulated_data[train_idx, ]
test_data <- simulated_data[test_idx, ]
actual_test <- test_data$Sales_Revenue

# ARIMA: reselect and refit on the training window, then forecast the test
# window (same seasonal frequency as the full series)
arima_train <- auto.arima(
  ts(train_data$Sales_Revenue, frequency = frequency(sales_ts))
)
arima_test_pred <- as.numeric(forecast(arima_train, h = horizon)$mean)

# Calculate evaluation metrics for ARIMA model
arima_metrics <- forecast_errors(actual_test, arima_test_pred)
arima_mse <- arima_metrics[["MSE"]]
arima_mae <- arima_metrics[["MAE"]]
arima_mape <- arima_metrics[["MAPE"]]

# Linear regression: same formula as `model`, refitted on the training rows
# and used to predict the test rows from their predictor values
linear_reg_train <- lm(formula(model), data = train_data)
linear_reg_test_pred <- as.numeric(
  predict(linear_reg_train, newdata = test_data)
)

# Calculate evaluation metrics for linear regression model
linear_reg_metrics <- forecast_errors(actual_test, linear_reg_test_pred)
linear_reg_mse <- linear_reg_metrics[["MSE"]]
linear_reg_mae <- linear_reg_metrics[["MAE"]]
linear_reg_mape <- linear_reg_metrics[["MAPE"]]

# Print evaluation metrics
cat("Evaluation metrics for ARIMA model:\n")
## Evaluation metrics for ARIMA model:
## MSE: 18802.16
## MAE: 109.4466
## MAPE: 11.29185 %
## Evaluation metrics for Linear Regression model:
## MSE: 0
## MAE: 218.977
## MAPE: 20.61591 %
Note
- Forecast: Kode tersebut menghasilkan peramalan untuk kedua model. Untuk model ARIMA, peramalan dibuat dengan fungsi forecast().
- Menghitung Metrik Evaluasi:
  - Untuk model ARIMA: Mean Squared Error (MSE), Mean Absolute Error (MAE), dan Mean Absolute Percentage Error (MAPE).
  - Untuk model regresi linear: MSE, MAE, dan MAPE.
  Metrik-metrik ini dihitung dengan membandingkan nilai-nilai yang diramalkan dengan data simulasi aktual (simulated_data$Sales_Revenue).
- Mencetak Metrik Evaluasi: Akhirnya, kode tersebut mencetak metrik evaluasi untuk model ARIMA dan model regresi linear menggunakan fungsi cat().