Input Data

# Load required libraries
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)

# Set seed for reproducibility
set.seed(123)

# Number of observations
n <- 100

# Simulate date range
start_date <- ymd("2022-01-01")
end_date <- ymd("2022-04-10")
dates <- seq(start_date, end_date, by = "day")

# Simulate predictor variables
promotional_spending <- runif(n, min = 1000, max = 5000)
price <- rnorm(n, mean = 50, sd = 10)
weather_conditions <- sample(c("sunny", "cloudy", "rainy"), size = n, replace = TRUE)

# Simulate sales revenue
sales_trend <- 0.1 * seq(1, n)
seasonal_pattern <- sin(seq(1, n) * 2 * pi / 365 * 7) * 100
sales_noise <- rnorm(n, mean = 0, sd = 100)
sales_revenue <- 1000 + sales_trend + seasonal_pattern + sales_noise

# Create dataframe
simulated_data <- tibble(
  Date = dates,
  Promotional_Spending = promotional_spending,
  Price = price,
  Weather_Conditions = weather_conditions,
  Sales_Revenue = sales_revenue
)

# Display the first few rows of the dataset
head(simulated_data)
## # A tibble: 6 × 5
##   Date       Promotional_Spending Price Weather_Conditions Sales_Revenue
##   <date>                    <dbl> <dbl> <chr>                      <dbl>
## 1 2022-01-01                2150.  52.5 cloudy                     1049.
## 2 2022-01-02                4153.  49.7 cloudy                      978.
## 3 2022-01-03                2636.  49.6 sunny                      1068.
## 4 2022-01-04                4532.  63.7 cloudy                     1018.
## 5 2022-01-05                4762.  47.7 sunny                      1166.
## 6 2022-01-06                1182.  65.2 sunny                      1083.
ggplot(simulated_data, aes(x = Date, y = Sales_Revenue)) +
  geom_line(color = "red") +
  labs(title = "Penjualan Harian",
       x = "Tanggal",
       y = "Pendapatan Penjualan") +
  theme_minimal()

Model Regresi

model <- lm(Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions, data = simulated_data)

summary(model)
## 
## Call:
## lm(formula = Sales_Revenue ~ Promotional_Spending + Price + Weather_Conditions, 
##     data = simulated_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -340.29  -71.58   11.21   82.72  315.61 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             992.92772   81.16696  12.233   <2e-16 ***
## Promotional_Spending      0.01922    0.01197   1.605    0.112    
## Price                    -0.64144    1.42687  -0.450    0.654    
## Weather_Conditionsrainy -22.74556   36.17645  -0.629    0.531    
## Weather_Conditionssunny -23.58060   32.57721  -0.724    0.471    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 134.7 on 95 degrees of freedom
## Multiple R-squared:  0.03842,    Adjusted R-squared:  -0.002062 
## F-statistic: 0.9491 on 4 and 95 DF,  p-value: 0.4393

Prediksi Regresi

new_promotional_spending <- c(6000, 7000, 8000)  
new_price <- c(90, 95, 100)  
new_weather_conditions <- c("sunny", "rainy", "cloudy") 

new_data <- data.frame(Promotional_Spending = new_promotional_spending,
                       Price = new_price,
                       Weather_Conditions = new_weather_conditions)

predicted_sales <- predict(model, newdata = new_data)
predicted_sales <- predict(model, newdata = simulated_data)
predicted_data <- cbind(simulated_data, Predicted_Sales = predicted_sales)

ggplot(predicted_data, aes(x = Date)) +
  geom_line(aes(y = Sales_Revenue, color = "Actual Sales")) +
  geom_line(aes(y = Predicted_Sales, color = "Predicted Sales"), linetype = "dashed") +
  labs(title = "Perbandingan Actual dan Prediksi",
       x = "Tanggal",
       y = "Sales Revenue") +
  scale_color_manual(name = "Data", values = c("Actual Sales" = "blue", "Predicted Sales" = "red")) +
  theme_minimal()

Dari plot diatas menunjukan tingkat akurasi yang rendah di model prediksinya, karena garis actual sales jauh dari predicted salesnya.

uji heteroskedasitas

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 7.7981, df = 4, p-value = 0.09926

nilai p yang sesuai adalah 0.09926. Karena nilai p lebih dari 0.05, kita gagal menolak hipotesis nol. Kita tidak memiliki bukti yang cukup untuk mengatakan bahwa heteroskedastisitas hadir dalam model regresi.

Time Series

# Memuat library forecast untuk pemodelan ARIMA
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Membuat objek ts untuk analisis deret waktu
sales_ts <- ts(simulated_data$Sales_Revenue, frequency = 365)

# Create a dataframe from the ts object for easier use with plotly
ts_data <- data.frame(Time = 1:length(sales_ts), Sales_Revenue = as.numeric(sales_ts))
arima_model <- auto.arima(sales_ts)

summary(arima_model)
## Series: sales_ts 
## ARIMA(1,0,1) with non-zero mean 
## 
## Coefficients:
##          ar1      ma1      mean
##       0.9198  -0.7377  1001.994
## s.e.  0.0535   0.0803    36.507
## 
## sigma^2 = 15050:  log likelihood = -621.52
## AIC=1251.04   AICc=1251.47   BIC=1261.47
## 
## Training set error measures:
##                     ME     RMSE      MAE      MPE     MAPE MASE       ACF1
## Training set -1.929319 120.8257 94.75599 -1.80019 9.983429  NaN -0.0317043

Output menunjukkan bahwa model ARIMA(1,0,1) berhasil menangkap sebagian besar informasi dalam data, dengan beberapa kesalahan yang tetap ada. Metrik evaluasi memberikan gambaran di mana model mungkin perlu diperbaiki atau dikalibrasi.

Prediksi dengan time series

forecast_arima <- forecast(arima_model, h = 12)

plot_time_series <- plot_ly(x = ~time(sales_ts), y = ~as.numeric(sales_ts), type = 'scatter', mode = 'lines', name = 'Actual') %>%
  add_trace(x = time(forecast_arima$mean), y = forecast_arima$mean, mode = 'lines', name = 'Predicted (ARIMA)') %>%
  layout(title = 'Perbandingan Actal dengan Prediksi - ARIMA Model',
         xaxis = list(title = 'Time'),
         yaxis = list(title = 'Sales Revenue'))

plot_time_series

3. kesimpulan perbandingan data

# Generate forecasts for both models
arima_forecast <- forecast(arima_model, h = 30)  # Forecast for the next 30 periods

# Calculate evaluation metrics for ARIMA model
arima_mse <- mean((as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue)^2)
## Warning in as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue:
## longer object length is not a multiple of shorter object length
arima_mae <- mean(abs(as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue))
## Warning in as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue:
## longer object length is not a multiple of shorter object length
arima_mape <- mean(abs((as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue) / simulated_data$Sales_Revenue)) * 100
## Warning in as.numeric(arima_forecast$mean) - simulated_data$Sales_Revenue:
## longer object length is not a multiple of shorter object length
cat("MSE:", arima_mse, "\n")
## MSE: 18802.16
cat("MAE:", arima_mae, "\n")
## MAE: 109.4466
cat("MAPE:", arima_mape, "%\n")
## MAPE: 11.29185 %
linear_reg_mse <- mean((rep(simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)], 30) - simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)])^2)
linear_reg_mae <- mean(abs(rep(simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)], 30) - simulated_data$Sales_Revenue))
## Warning in
## rep(simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)], :
## longer object length is not a multiple of shorter object length
linear_reg_mape <- mean(abs((rep(simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)], 30) - simulated_data$Sales_Revenue) / simulated_data$Sales_Revenue)) * 100
## Warning in
## rep(simulated_data$Sales_Revenue[length(simulated_data$Sales_Revenue)], :
## longer object length is not a multiple of shorter object length
cat("MSE:", linear_reg_mse, "\n")
## MSE: 0
cat("MAE:", linear_reg_mae, "\n")
## MAE: 218.977
cat("MAPE:", linear_reg_mape, "%\n")
## MAPE: 20.61591 %