set.seed(42)
n <- 300
df <- tibble(
customer_id = paste0("C", 1:n),
gender = sample(c("Male", "Female", "Other"), n, replace = TRUE, prob = c(0.45, 0.5, 0.05)),
age = round(rnorm(n, mean = 35, sd = 10)),
region = sample(c("North", "South", "East", "West"), n, replace = TRUE),
purchase_amount = round(rnorm(n, mean = 200, sd = 50), 2),
satisfaction_score = round(rnorm(n, mean = 7, sd = 1.5), 1),
loyalty_program = sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.6, 0.4)),
delivery_mode = sample(c("Standard", "Express", "In-Store"), n, replace = TRUE),
return_rate = round(runif(n, 0, 0.3), 2),
recommended = sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.7, 0.3))
)
summary(df)
## customer_id gender age region
## Length:300 Length:300 Min. : 8.00 Length:300
## Class :character Class :character 1st Qu.:28.00 Class :character
## Mode :character Mode :character Median :35.00 Mode :character
## Mean :34.82
## 3rd Qu.:41.00
## Max. :60.00
## purchase_amount satisfaction_score loyalty_program delivery_mode
## Min. : 49.1 Min. : 1.900 Length:300 Length:300
## 1st Qu.:166.2 1st Qu.: 5.875 Class :character Class :character
## Median :198.7 Median : 7.100 Mode :character Mode :character
## Mean :196.8 Mean : 6.997
## 3rd Qu.:231.5 3rd Qu.: 8.000
## Max. :360.6 Max. :12.200
## return_rate recommended
## Min. :0.0000 Length:300
## 1st Qu.:0.0900 Class :character
## Median :0.1500 Mode :character
## Mean :0.1546
## 3rd Qu.:0.2300
## Max. :0.3000
df %>% tabyl(loyalty_program, recommended)
## loyalty_program No Yes
## No 43 80
## Yes 63 114
t.test(df$satisfaction_score, conf.level = 0.95)
##
## One Sample t-test
##
## data: df$satisfaction_score
## t = 76.087, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 6.815703 7.177630
## sample estimates:
## mean of x
## 6.996667
t.test(df$purchase_amount, conf.level = 0.95)
##
## One Sample t-test
##
## data: df$purchase_amount
## t = 70.335, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 191.2521 202.2623
## sample estimates:
## mean of x
## 196.7572
t.test(purchase_amount ~ loyalty_program, data = df)
##
## Welch Two Sample t-test
##
## data: purchase_amount by loyalty_program
## t = -1.6037, df = 234.18, p-value = 0.1101
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## -20.870046 2.140001
## sample estimates:
## mean in group No mean in group Yes
## 191.2319 200.5969
table_check <- table(df$delivery_mode, df$recommended)
chisq.test(table_check)
##
## Pearson's Chi-squared test
##
## data: table_check
## X-squared = 1.414, df = 2, p-value = 0.4931
CrossTable(df$delivery_mode, df$recommended, chisq = TRUE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 300
##
##
## | df$recommended
## df$delivery_mode | No | Yes | Row Total |
## -----------------|-----------|-----------|-----------|
## Express | 35 | 66 | 101 |
## | 0.013 | 0.007 | |
## | 0.347 | 0.653 | 0.337 |
## | 0.330 | 0.340 | |
## | 0.117 | 0.220 | |
## -----------------|-----------|-----------|-----------|
## In-Store | 31 | 67 | 98 |
## | 0.380 | 0.208 | |
## | 0.316 | 0.684 | 0.327 |
## | 0.292 | 0.345 | |
## | 0.103 | 0.223 | |
## -----------------|-----------|-----------|-----------|
## Standard | 40 | 61 | 101 |
## | 0.521 | 0.285 | |
## | 0.396 | 0.604 | 0.337 |
## | 0.377 | 0.314 | |
## | 0.133 | 0.203 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 106 | 194 | 300 |
## | 0.353 | 0.647 | |
## -----------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 1.414013 d.f. = 2 p = 0.4931183
##
##
##
anova_model <- aov(satisfaction_score ~ region, data = df)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 0.9 0.2947 0.115 0.951
## Residuals 296 757.6 2.5595
TukeyHSD(anova_model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = satisfaction_score ~ region, data = df)
##
## $region
## diff lwr upr p adj
## North-East -0.12640647 -0.7998883 0.5470754 0.9624034
## South-East -0.10183382 -0.7616245 0.5579569 0.9784923
## West-East -0.01470850 -0.6906671 0.6612501 0.9999361
## South-North 0.02457265 -0.6509646 0.7001099 0.9997021
## West-North 0.11169797 -0.5796390 0.8030350 0.9754681
## West-South 0.08712532 -0.5908812 0.7651318 0.9873584
lm_model <- lm(purchase_amount ~ age + satisfaction_score + return_rate + loyalty_program, data = df)
summary(lm_model)
##
## Call:
## lm(formula = purchase_amount ~ age + satisfaction_score + return_rate +
## loyalty_program, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -139.551 -32.124 -2.005 33.581 152.395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180.8824 17.3174 10.445 <2e-16 ***
## age 0.1971 0.2966 0.665 0.507
## satisfaction_score 1.2322 1.7598 0.700 0.484
## return_rate -31.9904 32.6776 -0.979 0.328
## loyalty_programYes 9.0459 5.6893 1.590 0.113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.4 on 295 degrees of freedom
## Multiple R-squared: 0.01534, Adjusted R-squared: 0.001987
## F-statistic: 1.149 on 4 and 295 DF, p-value: 0.3337
set.seed(2025)
months_full <- seq(as.Date("2024-02-01"), by = "month", length.out = 12)
df_ts <- tibble(
month = months_full,
month_num = 1:12,
total_revenue = round(
100000 + 5000 * sin(2 * pi * 1:12 / 12) + rnorm(12, 0, 5000)
)
)
ggplot(df_ts, aes(x = month, y = total_revenue)) +
geom_line(color = "#1f77b4", size = 1.2) +
geom_point(size = 3, color = "#ff7f0e") +
labs(title = "Simulated Monthly Revenue",
x = "Month", y = "Revenue (USD)") +
theme_minimal()
## ETS Forecast:Monthly Revenue
ts_revenue <- ts(df_ts$total_revenue, start = c(2024, 2), frequency = 12)
ets_model <- ets(ts_revenue)
summary(ets_model)
## ETS(A,N,N)
##
## Call:
## ets(y = ts_revenue)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 105603.7212
##
## sigma: 4230.846
##
## AIC AICc BIC
## 234.0348 237.0348 235.4895
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -1198.311 3862.216 3263.459 -1.294131 3.27909 NaN -0.007515003
forecast_ets <- forecast(ets_model, h = 3)
autoplot(forecast_ets) +
labs(title = "ETS Forecast: Next 3 Months",
x = "Month", y = "Revenue") +
theme_minimal()
forecast_df <- data.frame(
Month = as.yearmon(time(forecast_ets$mean)),
Forecast = round(forecast_ets$mean, 2),
Lo_80 = round(forecast_ets$lower[,1], 2),
Hi_80 = round(forecast_ets$upper[,1], 2),
Lo_95 = round(forecast_ets$lower[,2], 2),
Hi_95 = round(forecast_ets$upper[,2], 2)
)
kable(forecast_df, caption = "ETS Revenue Forecast: Confidence & Prediction Intervals")
| Month | Forecast | Lo_80 | Hi_80 | Lo_95 | Hi_95 |
|---|---|---|---|---|---|
| Feb 2025 | 91225.43 | 85803.38 | 96647.48 | 82933.12 | 99517.74 |
| Mar 2025 | 91225.43 | 83557.88 | 98892.98 | 79498.93 | 102951.93 |
| Apr 2025 | 91225.43 | 81834.80 | 100616.07 | 76863.69 | 105587.17 |
# Now run your correlation block
df_marketing <- df %>%
select(purchase_amount, satisfaction_score, return_rate, loyalty_program, recommended) %>%
mutate(
loyalty_program = ifelse(loyalty_program == "Yes", 1, 0),
recommended = ifelse(recommended == "Yes", 1, 0)
)
cor(df_marketing)
## purchase_amount satisfaction_score return_rate
## purchase_amount 1.00000000 0.04327292 -0.057796706
## satisfaction_score 0.04327292 1.00000000 0.019885714
## return_rate -0.05779671 0.01988571 1.000000000
## loyalty_program 0.09522137 0.04556405 -0.014265707
## recommended -0.04470831 -0.07917488 0.008242507
## loyalty_program recommended
## purchase_amount 0.09522137 -0.044708307
## satisfaction_score 0.04556405 -0.079174876
## return_rate -0.01426571 0.008242507
## loyalty_program 1.00000000 -0.006522080
## recommended -0.00652208 1.000000000
# Create encoded promotion variable
df <- df %>%
mutate(
promo_yes = ifelse(recommended == "Yes", 1, 0),
region = factor(region)
)
# Linear model with interaction term
lm_interaction <- lm(purchase_amount ~ promo_yes * region + satisfaction_score + age + return_rate, data = df)
summary(lm_interaction)
##
## Call:
## lm(formula = purchase_amount ~ promo_yes * region + satisfaction_score +
## age + return_rate, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -137.760 -33.757 1.012 31.755 161.522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.1436 19.4362 9.320 <2e-16 ***
## promo_yes -2.4355 11.2482 -0.217 0.829
## regionNorth 10.2774 12.5065 0.822 0.412
## regionSouth 13.8672 13.1243 1.057 0.292
## regionWest 4.0647 14.0128 0.290 0.772
## satisfaction_score 1.4040 1.7809 0.788 0.431
## age 0.2165 0.3014 0.718 0.473
## return_rate -38.6517 33.2259 -1.163 0.246
## promo_yes:regionNorth -1.6690 16.2473 -0.103 0.918
## promo_yes:regionSouth -13.9441 16.3489 -0.853 0.394
## promo_yes:regionWest 8.6233 17.1127 0.504 0.615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.74 on 289 degrees of freedom
## Multiple R-squared: 0.02178, Adjusted R-squared: -0.01207
## F-statistic: 0.6434 on 10 and 289 DF, p-value: 0.7761
# Step 1: Simulate monthly revenue and promo spend
set.seed(2025)
months_full <- seq(as.Date("2024-02-01"), by = "month", length.out = 12)
df_ts <- tibble(
month = months_full,
month_num = 1:12,
total_revenue = round(
100000 + 5000 * sin(2 * pi * 1:12 / 12) + rnorm(12, 0, 5000)
),
promo_spend = round(runif(12, 2000, 8000))
)
# Step 2: Create lagged promo variable
df_ts <- df_ts %>%
mutate(promo_lag1 = lag(promo_spend, 1))
# Step 3: Create time series objects
ts_revenue <- ts(df_ts$total_revenue[2:12], start = c(2024, 3), frequency = 12)
ts_promo <- ts(df_ts$promo_lag1[2:12], start = c(2024, 3), frequency = 12)
# Step 4: Fit time series regression model
ts_model <- tslm(ts_revenue ~ ts_promo)
summary(ts_model)
##
## Call:
## tslm(formula = ts_revenue ~ ts_promo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10113.2 -4099.5 -725.3 5617.3 8445.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93537.565 6532.336 14.319 1.69e-07 ***
## ts_promo 1.306 1.237 1.056 0.318
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6275 on 9 degrees of freedom
## Multiple R-squared: 0.1103, Adjusted R-squared: 0.01146
## F-statistic: 1.116 on 1 and 9 DF, p-value: 0.3183
# Step 5: Create future promo scenarios (10% drift)
last_promo <- tail(df_ts$promo_spend, 1)
new_promo <- data.frame(ts_promo = last_promo * c(0.9, 1.0, 1.1))
# Step 6: Forecast revenue
forecast_ts <- forecast(ts_model, newdata = new_promo)
# Step 7: Plot forecast
autoplot(forecast_ts) +
labs(title = "Revenue Forecast with Lagged Promotion Effect",
x = "Month", y = "Projected Revenue") +
theme_minimal()