Customer Behavior & Forecasting Analysis

1 Simulate Customer Data
2 Exploratory Data Analysis(EDA)
3 Confidence intervals
4 Hypothesis Testing: loyalty impact on spend
5 Chi-Square Test:Delivery vs Recommendation
6 ANOVA:Satisfaction by Region
7 Linear Regression:Purchase Drivers
8 Forecasting:Simulate 12-month Revenue
9 Correlation with Marketing Variables
10 Promotion * Region Interaction model
11 19. Time Series Regression with Lagged Promotion Effect

1 Simulate Customer Data

set.seed(42)
n <- 300
df <- tibble(
  customer_id = paste0("C", 1:n),
  gender = sample(c("Male", "Female", "Other"), n, replace = TRUE, prob = c(0.45, 0.5, 0.05)),
  age = round(rnorm(n, mean = 35, sd = 10)),
  region = sample(c("North", "South", "East", "West"), n, replace = TRUE),
  purchase_amount = round(rnorm(n, mean = 200, sd = 50), 2),
  satisfaction_score = round(rnorm(n, mean = 7, sd = 1.5), 1),
  loyalty_program = sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.6, 0.4)),
  delivery_mode = sample(c("Standard", "Express", "In-Store"), n, replace = TRUE),
  return_rate = round(runif(n, 0, 0.3), 2),
  recommended = sample(c("Yes", "No"), n, replace = TRUE, prob = c(0.7, 0.3))
)

2 Exploratory Data Analysis(EDA)

summary(df)

##  customer_id           gender               age           region         
##  Length:300         Length:300         Min.   : 8.00   Length:300        
##  Class :character   Class :character   1st Qu.:28.00   Class :character  
##  Mode  :character   Mode  :character   Median :35.00   Mode  :character  
##                                        Mean   :34.82                     
##                                        3rd Qu.:41.00                     
##                                        Max.   :60.00                     
##  purchase_amount satisfaction_score loyalty_program    delivery_mode     
##  Min.   : 49.1   Min.   : 1.900     Length:300         Length:300        
##  1st Qu.:166.2   1st Qu.: 5.875     Class :character   Class :character  
##  Median :198.7   Median : 7.100     Mode  :character   Mode  :character  
##  Mean   :196.8   Mean   : 6.997                                          
##  3rd Qu.:231.5   3rd Qu.: 8.000                                          
##  Max.   :360.6   Max.   :12.200                                          
##   return_rate     recommended       
##  Min.   :0.0000   Length:300        
##  1st Qu.:0.0900   Class :character  
##  Median :0.1500   Mode  :character  
##  Mean   :0.1546                     
##  3rd Qu.:0.2300                     
##  Max.   :0.3000

df %>% tabyl(loyalty_program, recommended)

##  loyalty_program No Yes
##               No 43  80
##              Yes 63 114

3 Confidence intervals

t.test(df$satisfaction_score, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  df$satisfaction_score
## t = 76.087, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  6.815703 7.177630
## sample estimates:
## mean of x 
##  6.996667

t.test(df$purchase_amount, conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  df$purchase_amount
## t = 70.335, df = 299, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  191.2521 202.2623
## sample estimates:
## mean of x 
##  196.7572

4 Hypothesis Testing: loyalty impact on spend

t.test(purchase_amount ~ loyalty_program, data = df)

## 
##  Welch Two Sample t-test
## 
## data:  purchase_amount by loyalty_program
## t = -1.6037, df = 234.18, p-value = 0.1101
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
##  -20.870046   2.140001
## sample estimates:
##  mean in group No mean in group Yes 
##          191.2319          200.5969

5 Chi-Square Test:Delivery vs Recommendation

table_check <- table(df$delivery_mode, df$recommended)
chisq.test(table_check)

## 
##  Pearson's Chi-squared test
## 
## data:  table_check
## X-squared = 1.414, df = 2, p-value = 0.4931

CrossTable(df$delivery_mode, df$recommended, chisq = TRUE)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  300 
## 
##  
##                  | df$recommended 
## df$delivery_mode |        No |       Yes | Row Total | 
## -----------------|-----------|-----------|-----------|
##          Express |        35 |        66 |       101 | 
##                  |     0.013 |     0.007 |           | 
##                  |     0.347 |     0.653 |     0.337 | 
##                  |     0.330 |     0.340 |           | 
##                  |     0.117 |     0.220 |           | 
## -----------------|-----------|-----------|-----------|
##         In-Store |        31 |        67 |        98 | 
##                  |     0.380 |     0.208 |           | 
##                  |     0.316 |     0.684 |     0.327 | 
##                  |     0.292 |     0.345 |           | 
##                  |     0.103 |     0.223 |           | 
## -----------------|-----------|-----------|-----------|
##         Standard |        40 |        61 |       101 | 
##                  |     0.521 |     0.285 |           | 
##                  |     0.396 |     0.604 |     0.337 | 
##                  |     0.377 |     0.314 |           | 
##                  |     0.133 |     0.203 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |       106 |       194 |       300 | 
##                  |     0.353 |     0.647 |           | 
## -----------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1.414013     d.f. =  2     p =  0.4931183 
## 
## 
##

6 ANOVA:Satisfaction by Region

anova_model <- aov(satisfaction_score ~ region, data = df)
summary(anova_model)

##              Df Sum Sq Mean Sq F value Pr(>F)
## region        3    0.9  0.2947   0.115  0.951
## Residuals   296  757.6  2.5595

TukeyHSD(anova_model)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = satisfaction_score ~ region, data = df)
## 
## $region
##                    diff        lwr       upr     p adj
## North-East  -0.12640647 -0.7998883 0.5470754 0.9624034
## South-East  -0.10183382 -0.7616245 0.5579569 0.9784923
## West-East   -0.01470850 -0.6906671 0.6612501 0.9999361
## South-North  0.02457265 -0.6509646 0.7001099 0.9997021
## West-North   0.11169797 -0.5796390 0.8030350 0.9754681
## West-South   0.08712532 -0.5908812 0.7651318 0.9873584

7 Linear Regression:Purchase Drivers

lm_model <- lm(purchase_amount ~ age + satisfaction_score + return_rate + loyalty_program, data = df)
summary(lm_model)

## 
## Call:
## lm(formula = purchase_amount ~ age + satisfaction_score + return_rate + 
##     loyalty_program, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -139.551  -32.124   -2.005   33.581  152.395 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        180.8824    17.3174  10.445   <2e-16 ***
## age                  0.1971     0.2966   0.665    0.507    
## satisfaction_score   1.2322     1.7598   0.700    0.484    
## return_rate        -31.9904    32.6776  -0.979    0.328    
## loyalty_programYes   9.0459     5.6893   1.590    0.113    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.4 on 295 degrees of freedom
## Multiple R-squared:  0.01534,    Adjusted R-squared:  0.001987 
## F-statistic: 1.149 on 4 and 295 DF,  p-value: 0.3337

8 Forecasting:Simulate 12-month Revenue

set.seed(2025)
months_full <- seq(as.Date("2024-02-01"), by = "month", length.out = 12)
df_ts <- tibble(
  month = months_full,
  month_num = 1:12,
  total_revenue = round(
    100000 + 5000 * sin(2 * pi * 1:12 / 12) + rnorm(12, 0, 5000)
  )
)

ggplot(df_ts, aes(x = month, y = total_revenue)) +
  geom_line(color = "#1f77b4", size = 1.2) +
  geom_point(size = 3, color = "#ff7f0e") +
  labs(title = "Simulated Monthly Revenue",
       x = "Month", y = "Revenue (USD)") +
  theme_minimal()

## ETS Forecast:Monthly Revenue

ts_revenue <- ts(df_ts$total_revenue, start = c(2024, 2), frequency = 12)
ets_model <- ets(ts_revenue)
summary(ets_model)

## ETS(A,N,N) 
## 
## Call:
##  ets(y = ts_revenue) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 105603.7212 
## 
##   sigma:  4230.846
## 
##      AIC     AICc      BIC 
## 234.0348 237.0348 235.4895 
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE    MAPE MASE         ACF1
## Training set -1198.311 3862.216 3263.459 -1.294131 3.27909  NaN -0.007515003

forecast_ets <- forecast(ets_model, h = 3)
autoplot(forecast_ets) +
  labs(title = "ETS Forecast: Next 3 Months",
       x = "Month", y = "Revenue") +
  theme_minimal()

forecast_df <- data.frame(
  Month = as.yearmon(time(forecast_ets$mean)),
  Forecast = round(forecast_ets$mean, 2),
  Lo_80 = round(forecast_ets$lower[,1], 2),
  Hi_80 = round(forecast_ets$upper[,1], 2),
  Lo_95 = round(forecast_ets$lower[,2], 2),
  Hi_95 = round(forecast_ets$upper[,2], 2)
)
kable(forecast_df, caption = "ETS Revenue Forecast: Confidence & Prediction Intervals")

ETS Revenue Forecast: Confidence & Prediction Intervals
Month	Forecast	Lo_80	Hi_80	Lo_95	Hi_95
Feb 2025	91225.43	85803.38	96647.48	82933.12	99517.74
Mar 2025	91225.43	83557.88	98892.98	79498.93	102951.93
Apr 2025	91225.43	81834.80	100616.07	76863.69	105587.17

9 Correlation with Marketing Variables

# Now run your correlation block
df_marketing <- df %>%
  select(purchase_amount, satisfaction_score, return_rate, loyalty_program, recommended) %>%
  mutate(
    loyalty_program = ifelse(loyalty_program == "Yes", 1, 0),
    recommended = ifelse(recommended == "Yes", 1, 0)
  )

cor(df_marketing)

##                    purchase_amount satisfaction_score  return_rate
## purchase_amount         1.00000000         0.04327292 -0.057796706
## satisfaction_score      0.04327292         1.00000000  0.019885714
## return_rate            -0.05779671         0.01988571  1.000000000
## loyalty_program         0.09522137         0.04556405 -0.014265707
## recommended            -0.04470831        -0.07917488  0.008242507
##                    loyalty_program  recommended
## purchase_amount         0.09522137 -0.044708307
## satisfaction_score      0.04556405 -0.079174876
## return_rate            -0.01426571  0.008242507
## loyalty_program         1.00000000 -0.006522080
## recommended            -0.00652208  1.000000000

10 Promotion * Region Interaction model

# Create encoded promotion variable
df <- df %>%
  mutate(
    promo_yes = ifelse(recommended == "Yes", 1, 0),
    region = factor(region)
  )

# Linear model with interaction term
lm_interaction <- lm(purchase_amount ~ promo_yes * region + satisfaction_score + age + return_rate, data = df)
summary(lm_interaction)

## 
## Call:
## lm(formula = purchase_amount ~ promo_yes * region + satisfaction_score + 
##     age + return_rate, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -137.760  -33.757    1.012   31.755  161.522 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           181.1436    19.4362   9.320   <2e-16 ***
## promo_yes              -2.4355    11.2482  -0.217    0.829    
## regionNorth            10.2774    12.5065   0.822    0.412    
## regionSouth            13.8672    13.1243   1.057    0.292    
## regionWest              4.0647    14.0128   0.290    0.772    
## satisfaction_score      1.4040     1.7809   0.788    0.431    
## age                     0.2165     0.3014   0.718    0.473    
## return_rate           -38.6517    33.2259  -1.163    0.246    
## promo_yes:regionNorth  -1.6690    16.2473  -0.103    0.918    
## promo_yes:regionSouth -13.9441    16.3489  -0.853    0.394    
## promo_yes:regionWest    8.6233    17.1127   0.504    0.615    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.74 on 289 degrees of freedom
## Multiple R-squared:  0.02178,    Adjusted R-squared:  -0.01207 
## F-statistic: 0.6434 on 10 and 289 DF,  p-value: 0.7761

11 19. Time Series Regression with Lagged Promotion Effect

# Step 1: Simulate monthly revenue and promo spend
set.seed(2025)
months_full <- seq(as.Date("2024-02-01"), by = "month", length.out = 12)
df_ts <- tibble(
  month = months_full,
  month_num = 1:12,
  total_revenue = round(
    100000 + 5000 * sin(2 * pi * 1:12 / 12) + rnorm(12, 0, 5000)
  ),
  promo_spend = round(runif(12, 2000, 8000))
)

# Step 2: Create lagged promo variable
df_ts <- df_ts %>%
  mutate(promo_lag1 = lag(promo_spend, 1))

# Step 3: Create time series objects
ts_revenue <- ts(df_ts$total_revenue[2:12], start = c(2024, 3), frequency = 12)
ts_promo <- ts(df_ts$promo_lag1[2:12], start = c(2024, 3), frequency = 12)

# Step 4: Fit time series regression model
ts_model <- tslm(ts_revenue ~ ts_promo)
summary(ts_model)

## 
## Call:
## tslm(formula = ts_revenue ~ ts_promo)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10113.2  -4099.5   -725.3   5617.3   8445.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 93537.565   6532.336  14.319 1.69e-07 ***
## ts_promo        1.306      1.237   1.056    0.318    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6275 on 9 degrees of freedom
## Multiple R-squared:  0.1103, Adjusted R-squared:  0.01146 
## F-statistic: 1.116 on 1 and 9 DF,  p-value: 0.3183

# Step 5: Create future promo scenarios (10% drift)
last_promo <- tail(df_ts$promo_spend, 1)
new_promo <- data.frame(ts_promo = last_promo * c(0.9, 1.0, 1.1))

# Step 6: Forecast revenue
forecast_ts <- forecast(ts_model, newdata = new_promo)

# Step 7: Plot forecast
autoplot(forecast_ts) +
  labs(title = "Revenue Forecast with Lagged Promotion Effect",
       x = "Month", y = "Projected Revenue") +
  theme_minimal()

Customer Behavior & Forecasting Analysis

Fred Festus Bukenya

2025-07-17

1 Simulate Customer Data

2 Exploratory Data Analysis(EDA)

3 Confidence intervals

4 Hypothesis Testing: loyalty impact on spend

5 Chi-Square Test:Delivery vs Recommendation

6 ANOVA:Satisfaction by Region

7 Linear Regression:Purchase Drivers

8 Forecasting:Simulate 12-month Revenue

9 Correlation with Marketing Variables

10 Promotion * Region Interaction model

11 19. Time Series Regression with Lagged Promotion Effect