pkgs <- c(
"tidyverse",
"lubridate",
"scales",
"corrplot",
"ggridges",
"patchwork",
"caret",
"randomForest",
"xgboost",
"Metrics",
"car",
"lmtest",
"knitr"
)
invisible(lapply(pkgs, function(p) {
if (!requireNamespace(p, quietly = TRUE)) install.packages(p, repos = "https://cloud.r-project.org")
library(p, character.only = TRUE)
}))
## Warning: package 'dplyr' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
## Warning: package 'ggridges' was built under R version 4.4.3
## Warning: package 'patchwork' was built under R version 4.4.3
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
## Warning: package 'xgboost' was built under R version 4.4.3
## Warning: package 'Metrics' was built under R version 4.4.3
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
## Warning: package 'car' was built under R version 4.4.2
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: package 'knitr' was built under R version 4.4.3
raw <- read.csv("Walmart.csv", stringsAsFactors = FALSE) # load dataset without auto-factors
df <- raw %>%
mutate(
Date = dmy(Date), # convert to proper date format
Year = year(Date), # extract year
Month = month(Date, label = TRUE, abbr = TRUE), # month name (short)
Month_num = month(Date), # numeric month (1–12)
Week = isoweek(Date), # ISO week number
Quarter = quarter(Date), # quarter of year (1–4)
Holiday_Flag = factor(Holiday_Flag, levels = c(0, 1),
labels = c("Non-Holiday", "Holiday")), # convert to readable categories
Store = factor(Store) # treat store as categorical variable
)
glimpse(df)
## Rows: 6,435
## Columns: 13
## $ Store <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Date <date> 2010-02-05, 2010-02-12, 2010-02-19, 2010-02-26, 2010-03-…
## $ Weekly_Sales <dbl> 1643691, 1641957, 1611968, 1409728, 1554807, 1439542, 147…
## $ Holiday_Flag <fct> Non-Holiday, Holiday, Non-Holiday, Non-Holiday, Non-Holid…
## $ Temperature <dbl> 42.31, 38.51, 39.93, 46.63, 46.50, 57.79, 54.58, 51.45, 6…
## $ Fuel_Price <dbl> 2.572, 2.548, 2.514, 2.561, 2.625, 2.667, 2.720, 2.732, 2…
## $ CPI <dbl> 211.0964, 211.2422, 211.2891, 211.3196, 211.3501, 211.380…
## $ Unemployment <dbl> 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 8.106, 7…
## $ Year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 201…
## $ Month <ord> Feb, Feb, Feb, Feb, Mar, Mar, Mar, Mar, Apr, Apr, Apr, Ap…
## $ Month_num <dbl> 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, …
## $ Week <dbl> 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20…
## $ Quarter <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
summary(df)
## Store Date Weekly_Sales Holiday_Flag
## 1 : 143 Min. :2010-02-05 Min. : 209986 Non-Holiday:5985
## 2 : 143 1st Qu.:2010-10-08 1st Qu.: 553350 Holiday : 450
## 3 : 143 Median :2011-06-17 Median : 960746
## 4 : 143 Mean :2011-06-17 Mean :1046965
## 5 : 143 3rd Qu.:2012-02-24 3rd Qu.:1420159
## 6 : 143 Max. :2012-10-26 Max. :3818686
## (Other):5577
## Temperature Fuel_Price CPI Unemployment
## Min. : -2.06 Min. :2.472 Min. :126.1 Min. : 3.879
## 1st Qu.: 47.46 1st Qu.:2.933 1st Qu.:131.7 1st Qu.: 6.891
## Median : 62.67 Median :3.445 Median :182.6 Median : 7.874
## Mean : 60.66 Mean :3.359 Mean :171.6 Mean : 7.999
## 3rd Qu.: 74.94 3rd Qu.:3.735 3rd Qu.:212.7 3rd Qu.: 8.622
## Max. :100.14 Max. :4.468 Max. :227.2 Max. :14.313
##
## Year Month Month_num Week Quarter
## Min. :2010 Apr : 630 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2010 Jul : 630 1st Qu.: 4.000 1st Qu.:14.00 1st Qu.:2.000
## Median :2011 Mar : 585 Median : 6.000 Median :26.00 Median :2.000
## Mean :2011 Jun : 585 Mean : 6.448 Mean :25.82 Mean :2.483
## 3rd Qu.:2012 Aug : 585 3rd Qu.: 9.000 3rd Qu.:38.00 3rd Qu.:3.000
## Max. :2012 Sep : 585 Max. :12.000 Max. :52.00 Max. :4.000
## (Other):2835
BLUE <- "#0071CE"
YELLOW <- "#FFC220"
RED <- "#E04B2A"
theme_walmart <- theme_minimal(base_size = 13) +
theme(
plot.title = element_text(face = "bold", size = 15, colour = BLUE),
plot.subtitle = element_text(colour = "grey45", size = 11),
panel.grid.minor = element_blank(),
axis.title = element_text(colour = "grey30"),
legend.position = "bottom",
plot.background = element_rect(fill = "#FAFAFA", colour = NA),
panel.background = element_rect(fill = "#FAFAFA", colour = NA)
)
sales_y <- scale_y_continuous(labels = label_dollar(suffix = "M"))
p_dist <- ggplot(df, aes(Weekly_Sales / 1e6)) +
geom_histogram(aes(y = after_stat(density)), bins = 60,
fill = BLUE, colour = "white", linewidth = 0.2) +
geom_density(colour = YELLOW, linewidth = 1) +
scale_x_continuous(labels = label_dollar(suffix = "M")) +
labs(title = "Distribution of Weekly Sales",
subtitle = "Right-skewed; large stores pull the tail",
x = "Weekly Sales (USD Millions)", y = "Density") +
theme_walmart
print(p_dist)
Most stores fall between $500K–$1.5M/week with a few large stores pushing the tail toward $3.8M.
time_sales <- df %>%
group_by(Date) %>%
summarise(Total_Sales = sum(Weekly_Sales) / 1e6, .groups = "drop")
p_time <- ggplot(time_sales, aes(Date, Total_Sales)) +
geom_line(colour = BLUE, linewidth = 0.7) +
geom_smooth(method = "loess", span = 0.2, colour = YELLOW, se = FALSE) +
sales_y +
labs(title = "Total Weekly Sales Over Time (All 45 Stores)",
subtitle = "Loess trend in yellow — clear seasonality & holiday spikes",
x = NULL, y = "Total Sales (USD M)") +
theme_walmart
print(p_time)
## `geom_smooth()` using formula = 'y ~ x'
Sales spike every November–December and stay relatively flat the rest of the year.
p_holiday <- ggplot(df, aes(Holiday_Flag, Weekly_Sales / 1e6, fill = Holiday_Flag)) +
geom_boxplot(outlier.colour = RED, outlier.size = 0.7, width = 0.5) +
scale_fill_manual(values = c("Non-Holiday" = BLUE, "Holiday" = YELLOW)) +
sales_y +
labs(title = "Weekly Sales: Holiday vs Non-Holiday",
x = NULL, y = "Weekly Sales (USD M)", fill = NULL) +
theme_walmart + theme(legend.position = "none")
print(p_holiday)
Holiday weeks generally produce slightly higher sales than non-holiday weeks. The difference is not dramatic, but holiday periods appear to reduce the likelihood of particularly poor sales weeks.
p_month <- ggplot(df, aes(Weekly_Sales / 1e6, Month, fill = after_stat(x))) +
geom_density_ridges_gradient(scale = 2.5, rel_min_height = 0.01,
colour = "white", linewidth = 0.3) +
scale_fill_gradient(low = "#cce4f6", high = BLUE) +
scale_x_continuous(labels = label_dollar(suffix = "M")) +
labs(title = "Sales Distribution by Month",
subtitle = "November & December drive the highest peaks",
x = "Weekly Sales (USD M)", y = NULL) +
theme_walmart + theme(legend.position = "none")
print(p_month)
## Picking joint bandwidth of 0.144
November and December shift clearly to the right while January drops back down as the weakest month.
top10 <- df %>%
group_by(Store) %>%
summarise(Avg_Sales = mean(Weekly_Sales) / 1e6, .groups = "drop") %>%
slice_max(Avg_Sales, n = 10)
p_top10 <- ggplot(top10, aes(reorder(Store, Avg_Sales), Avg_Sales)) +
geom_col(fill = BLUE, width = 0.7) +
geom_text(aes(label = dollar(Avg_Sales, suffix = "M", accuracy = 0.01)),
hjust = -0.1, size = 3.5, colour = "grey20") +
coord_flip() +
scale_y_continuous(labels = label_dollar(suffix = "M"),
expand = expansion(mult = c(0, 0.15))) +
labs(title = "Top 10 Stores by Average Weekly Sales",
x = "Store", y = "Avg Weekly Sales (USD M)") +
theme_walmart
print(p_top10)
Store 20 leads at $2.1M/week, nearly double the overall average, with a wide gap between the top stores and the rest.
scatter_vs_sales <- function(xvar, xlabel) {
ggplot(df, aes(.data[[xvar]], Weekly_Sales / 1e6)) +
geom_point(alpha = 0.2, colour = BLUE, size = 0.8) +
geom_smooth(method = "lm", colour = RED, se = TRUE) +
sales_y +
labs(title = paste(xlabel, "vs Sales"), x = xlabel, y = "Weekly Sales (M)") +
theme_walmart
}
p_scatter_temp <- ggplot(df, aes(Temperature, Weekly_Sales / 1e6, colour = Holiday_Flag)) +
geom_point(alpha = 0.3, size = 0.8) +
geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
scale_colour_manual(values = c("Non-Holiday" = BLUE, "Holiday" = RED)) +
sales_y +
labs(title = "Temperature vs Sales", x = "°F", y = "Weekly Sales (M)", colour = NULL) +
theme_walmart + theme(legend.position = "top")
p_scatter_fuel <- scatter_vs_sales("Fuel_Price", "Fuel Price (USD)")
p_scatter_unemp <- scatter_vs_sales("Unemployment", "Unemployment Rate (%)")
p_scatter_cpi <- scatter_vs_sales("CPI", "Consumer Price Index")
print(p_scatter_temp)
## `geom_smooth()` using formula = 'y ~ x'
print(p_scatter_fuel)
## `geom_smooth()` using formula = 'y ~ x'
print(p_scatter_unemp)
## `geom_smooth()` using formula = 'y ~ x'
print(p_scatter_cpi)
## `geom_smooth()` using formula = 'y ~ x'
Unemployment and CPI show a slight downward slope. Fuel Price is completely flat with no relationship to sales.
cor_mat <- df %>%
select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment, Month_num, Week, Year) %>%
rename(Sales = Weekly_Sales, Temp = Temperature, Fuel = Fuel_Price, Month = Month_num) %>%
cor()
corrplot(
cor_mat,
method = "color",
type = "upper",
addCoef.col = "black"
)
No Economic variable has a strong relationship with sales. The strongest is Unemployment at -0.11, but this is still weak. Fuel Price is almost zero at 0.01, meaning it has no real relationship with sales. The high Fuel Price vs Year correlation (0.78) just reflects that fuel prices increased steadily from 2010 to 2012, not that fuel is driving sales.
summary(df %>% select(Weekly_Sales, Temperature, Fuel_Price, CPI, Unemployment))
## Weekly_Sales Temperature Fuel_Price CPI
## Min. : 209986 Min. : -2.06 Min. :2.472 Min. :126.1
## 1st Qu.: 553350 1st Qu.: 47.46 1st Qu.:2.933 1st Qu.:131.7
## Median : 960746 Median : 62.67 Median :3.445 Median :182.6
## Mean :1046965 Mean : 60.66 Mean :3.359 Mean :171.6
## 3rd Qu.:1420159 3rd Qu.: 74.94 3rd Qu.:3.735 3rd Qu.:212.7
## Max. :3818686 Max. :100.14 Max. :4.468 Max. :227.2
## Unemployment
## Min. : 3.879
## 1st Qu.: 6.891
## Median : 7.874
## Mean : 7.999
## 3rd Qu.: 8.622
## Max. :14.313
Weekly Sales: Very wide range ($210K–$3.82M). Mean ($1.05M) is higher than median ($961K), showing a right-skew from a few very high-performing stores.
Temperature: Ranges from -2°F to 100°F, covering different climates and seasons. Average is about 61°F.
Fuel Price: Ranges from $2.47 to $4.47, averaging around $3.36, reflecting changing economic conditions over time.
CPI: Ranges from 126 to 227, showing inflation differences across time and regions that may affect spending.
Unemployment: Ranges from 3.88% to 14.31%, capturing both strong and weak economic periods that can influence retail sales.
t.test(Weekly_Sales ~ Holiday_Flag, data = df)
##
## Welch Two Sample t-test
##
## data: Weekly_Sales by Holiday_Flag
## t = -2.6801, df = 504, p-value = 0.007602
## alternative hypothesis: true difference in means between group Non-Holiday and group Holiday is not equal to 0
## 95 percent confidence interval:
## -141473.17 -21789.85
## sample estimates:
## mean in group Non-Holiday mean in group Holiday
## 1041256 1122888
t-test result: t = -2.68, p = 0.0076 shows a statistically significant difference in mean Weekly Sales between holiday and non-holiday weeks
Mean sales: Non-holiday ≈ $1.04M vs Holiday ≈ $1.12M → holiday weeks have higher sales
Confidence interval: consistently negative for (Non-holiday − Holiday), confirming higher holiday sales
summary(aov(Weekly_Sales ~ factor(Quarter), data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(Quarter) 3 1.289e+13 4.297e+12 13.57 8.01e-09 ***
## Residuals 6431 2.036e+15 3.167e+11
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test shows that Quarter has a significant effect on Weekly Sales (F = 13.57, p = 8.01e-09), meaning sales are different in at least one quarter compared to the others. Most of the variation is still in the residuals, but the result still shows clear seasonal differences in sales across quarters.
sapply(c("Temperature", "Fuel_Price", "CPI", "Unemployment"), function(v) {
ct <- cor.test(df$Weekly_Sales, df[[v]])
c(r = round(ct$estimate, 4), p = round(ct$p.value, 4))
})
## Temperature Fuel_Price CPI Unemployment
## r.cor -0.0638 0.0095 -0.0726 -0.1062
## p 0.0000 0.4478 0.0000 0.0000
Temperature (r = -0.0638, p < 0.001), CPI (r = -0.0726, p < 0.001), and Unemployment (r = -0.1062, p < 0.001) all show statistically significant but very weak negative correlations with Weekly Sales, meaning higher values in these variables are slightly associated with lower sales. Fuel Price has no significant relationship with Weekly Sales (r = 0.0095, p = 0.4478), indicating no meaningful linear effect.
store1_ts <- df %>%
filter(Store == 1) %>%
arrange(Date) %>%
pull(Weekly_Sales)
ts_obj <- ts(store1_ts, frequency = 52, start = c(2010, 6))
decomp <- stl(ts_obj, s.window = "periodic")
plot(decomp, main = "STL Decomposition — Store 1 Weekly Sales")
The STL decomposition for Store 1 confirms a high degree of seasonality that aligns with the ANOVA findings. The seasonal panel reveals distinct, recurring peaks—most notably during the end-of-year holidays—which explains the significant quarterly effect. However, the remainder panel shows frequent, high-magnitude fluctuations, validating the observation that while seasonality is statistically significant, a large portion of weekly sales variance remains driven by non-seasonal factors.
The Holiday Spikes: Point out those two massive spikes around 2011.0 and 2012.0. Those are likely Black Friday/Christmas. This is exactly what is driving your “Quarterly” significance.
The Growth Trend: Notice the Trend line is a smooth upward slope. You can mention that Store 1 isn’t just seasonal; its baseline performance is actually improving over time.
The Gray Bars (Scale Check): Looking at the gray bars on the right of each panel. The bar for the Seasonal panel is much smaller than the one for the Data panel. This means that while seasonality is “real” (significant), it only accounts for a fraction of the total movement in sales. This perfectly supports your earlier sentence about “residual variation being much larger.”
df <- df %>%
mutate(
Is_Holiday = as.integer(Holiday_Flag == "Holiday"),
Is_Q4 = as.integer(Quarter == 4),
Is_Nov_Dec = as.integer(Month_num %in% c(11, 12)),
Log_Sales = log(Weekly_Sales))
train <- df %>% filter(Year <= 2011)
test <- df %>% filter(Year == 2012)
test <- test %>% filter(Store %in% unique(train$Store))
cat(sprintf("Train: %d rows | Test: %d rows\n", nrow(train), nrow(test)))
## Train: 4500 rows | Test: 1935 rows
FEATURES <- c("Store", "Is_Holiday", "Is_Q4", "Is_Nov_Dec",
"Temperature", "Fuel_Price", "CPI", "Unemployment",
"Month_num", "Week", "Year")
formula_lm <- as.formula(paste("Log_Sales ~", paste(FEATURES, collapse = " + ")))
evaluate_model <- function(name, actuals_log, preds_log) {
actuals <- exp(actuals_log)
preds <- exp(preds_log)
ss_res <- sum((actuals - preds)^2)
ss_tot <- sum((actuals - mean(actuals))^2)
list(
Model = name,
RMSE_USD = round(sqrt(mean((actuals - preds)^2))),
MAE_USD = round(mean(abs(actuals - preds))),
R2 = round(1 - ss_res / ss_tot, 4)
)
}
lm_model <- lm(formula_lm, data = train)
summary(lm_model)
##
## Call:
## lm(formula = formula_lm, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69279 -0.06252 -0.00600 0.04998 0.67116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.667e+02 2.342e+01 7.118 1.27e-12 ***
## Store2 2.345e-01 1.659e-02 14.135 < 2e-16 ***
## Store3 -1.419e+00 1.832e-02 -77.459 < 2e-16 ***
## Store4 1.452e+00 1.853e-01 7.837 5.71e-15 ***
## Store5 -1.634e+00 1.859e-02 -87.890 < 2e-16 ***
## Store6 -2.569e-02 1.794e-02 -1.432 0.152194
## Store7 -6.888e-01 5.062e-02 -13.607 < 2e-16 ***
## Store8 -6.103e-01 2.098e-02 -29.090 < 2e-16 ***
## Store9 -1.137e+00 2.082e-02 -54.623 < 2e-16 ***
## Store10 1.418e+00 1.869e-01 7.587 3.96e-14 ***
## Store11 -1.870e-01 1.832e-02 -10.207 < 2e-16 ***
## Store12 8.825e-01 1.938e-01 4.554 5.41e-06 ***
## Store13 1.438e+00 1.856e-01 7.746 1.16e-14 ***
## Store14 7.373e-01 6.605e-02 11.163 < 2e-16 ***
## Store15 2.198e-01 1.729e-01 1.271 0.203740
## Store16 -8.096e-01 4.941e-02 -16.385 < 2e-16 ***
## Store17 6.008e-01 1.852e-01 3.245 0.001184 **
## Store18 7.857e-01 1.738e-01 4.522 6.29e-06 ***
## Store19 1.057e+00 1.729e-01 6.113 1.06e-09 ***
## Store20 4.028e-01 2.246e-02 17.934 < 2e-16 ***
## Store21 -6.831e-01 1.659e-02 -41.176 < 2e-16 ***
## Store22 6.622e-01 1.647e-01 4.020 5.92e-05 ***
## Store23 9.373e-01 1.718e-01 5.455 5.17e-08 ***
## Store24 9.895e-01 1.732e-01 5.714 1.18e-08 ***
## Store25 -6.837e-01 2.257e-02 -30.297 < 2e-16 ***
## Store26 6.845e-01 1.729e-01 3.960 7.62e-05 ***
## Store27 1.210e+00 1.646e-01 7.352 2.31e-13 ***
## Store28 1.166e+00 1.938e-01 6.015 1.94e-09 ***
## Store29 1.098e-01 1.748e-01 0.628 0.530191
## Store30 -1.245e+00 1.659e-02 -75.022 < 2e-16 ***
## Store31 -9.008e-02 1.659e-02 -5.430 5.92e-08 ***
## Store32 4.588e-02 5.047e-02 0.909 0.363367
## Store33 -5.887e-01 1.872e-01 -3.145 0.001673 **
## Store34 7.707e-01 1.879e-01 4.101 4.19e-05 ***
## Store35 5.783e-01 1.652e-01 3.501 0.000469 ***
## Store36 -1.323e+00 1.708e-02 -77.464 < 2e-16 ***
## Store37 -1.065e+00 1.707e-02 -62.377 < 2e-16 ***
## Store38 -1.276e-01 1.938e-01 -0.658 0.510370
## Store39 -6.575e-02 1.707e-02 -3.853 0.000118 ***
## Store40 5.765e-01 1.718e-01 3.355 0.000799 ***
## Store41 7.365e-02 4.929e-02 1.494 0.135155
## Store42 1.621e-01 1.869e-01 0.867 0.385843
## Store43 -7.201e-01 2.914e-02 -24.715 < 2e-16 ***
## Store44 -4.866e-01 1.855e-01 -2.623 0.008733 **
## Store45 -2.432e-01 6.605e-02 -3.682 0.000234 ***
## Is_Holiday 2.149e-02 6.919e-03 3.106 0.001911 **
## Is_Q4 -2.510e-02 9.754e-03 -2.574 0.010099 *
## Is_Nov_Dec 1.768e-01 9.254e-03 19.107 < 2e-16 ***
## Temperature 4.997e-04 2.317e-04 2.156 0.031109 *
## Fuel_Price 1.581e-02 1.017e-02 1.554 0.120185
## CPI 1.380e-02 2.159e-03 6.393 1.80e-10 ***
## Unemployment -2.213e-02 6.331e-03 -3.496 0.000478 ***
## Month_num 3.550e-02 6.210e-03 5.716 1.16e-08 ***
## Week -9.145e-03 1.409e-03 -6.490 9.51e-11 ***
## Year -7.726e-02 1.179e-02 -6.550 6.39e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.117 on 4445 degrees of freedom
## Multiple R-squared: 0.9618, Adjusted R-squared: 0.9614
## F-statistic: 2074 on 54 and 4445 DF, p-value: < 2.2e-16
The model explains 96% of the variation in sales, mostly driven by the store dummy variables and seasonal flags.
knitr::kable(vif(lm_model))
| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| Store | 2.632820e+05 | 44 | 1.152380 |
| Is_Holiday | 1.159305e+00 | 1 | 1.076710 |
| Is_Q4 | 6.169750e+00 | 1 | 2.483898 |
| Is_Nov_Dec | 4.158742e+00 | 1 | 2.039299 |
| Temperature | 6.334309e+00 | 1 | 2.516805 |
| Fuel_Price | 6.552010e+00 | 1 | 2.559689 |
| CPI | 2.290512e+03 | 1 | 47.859299 |
| Unemployment | 4.653416e+01 | 1 | 6.821595 |
| Month_num | 1.399247e+02 | 1 | 11.828979 |
| Week | 1.373356e+02 | 1 | 11.719026 |
| Year | 1.142254e+01 | 1 | 3.379726 |
Although several variables showed high VIF values, indicating multicollinearity, this was expected due to overlapping time-based variables and store dummy variables. Since the primary goal of the model was prediction rather than coefficient interpretation, and the model demonstrated strong performance, the variables were retained instead of removed.
bptest(lm_model)
##
## studentized Breusch-Pagan test
##
## data: lm_model
## BP = 1172.8, df = 54, p-value < 2.2e-16
The Breusch–Pagan test indicates heteroscedasticity (p < 2.2e-16), meaning error variance is not constant across observations, bigger stores tend to have larger prediction errors.
dwtest(lm_model)
##
## Durbin-Watson test
##
## data: lm_model
## DW = 1.4481, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
The Durbin–Watson test (DW = 1.44, p < 2.2e-16) tells some autocorrelation present, nearby weeks tend to have correlated errors.
lm_preds <- predict(lm_model, newdata = test)
res_lm <- evaluate_model("Linear Regression", test$Log_Sales, lm_preds)
plot(lm_model, col = BLUE, pch = 16, cex = 0.5)
The first plot shows that the model is doing a decent job because the points are mostly spread randomly around the line, although there are a few outliers. The second plot shows that the residuals are not normally distributed, since the points move far away from the straight line, which is the main problem with the model. The third plot suggests that the spread of errors is mostly consistent, so the variance assumption is mostly fine. The fourth plot shows that there are some unusual data points, but none are strong enough to completely affect or ruin the model.
step_model <- step(lm_model, direction = "backward", trace = 0)
# Stepwise Model Summary
summary(step_model)
##
## Call:
## lm(formula = Log_Sales ~ Store + Is_Holiday + Is_Q4 + Is_Nov_Dec +
## Temperature + Fuel_Price + CPI + Unemployment + Month_num +
## Week + Year, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69279 -0.06252 -0.00600 0.04998 0.67116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.667e+02 2.342e+01 7.118 1.27e-12 ***
## Store2 2.345e-01 1.659e-02 14.135 < 2e-16 ***
## Store3 -1.419e+00 1.832e-02 -77.459 < 2e-16 ***
## Store4 1.452e+00 1.853e-01 7.837 5.71e-15 ***
## Store5 -1.634e+00 1.859e-02 -87.890 < 2e-16 ***
## Store6 -2.569e-02 1.794e-02 -1.432 0.152194
## Store7 -6.888e-01 5.062e-02 -13.607 < 2e-16 ***
## Store8 -6.103e-01 2.098e-02 -29.090 < 2e-16 ***
## Store9 -1.137e+00 2.082e-02 -54.623 < 2e-16 ***
## Store10 1.418e+00 1.869e-01 7.587 3.96e-14 ***
## Store11 -1.870e-01 1.832e-02 -10.207 < 2e-16 ***
## Store12 8.825e-01 1.938e-01 4.554 5.41e-06 ***
## Store13 1.438e+00 1.856e-01 7.746 1.16e-14 ***
## Store14 7.373e-01 6.605e-02 11.163 < 2e-16 ***
## Store15 2.198e-01 1.729e-01 1.271 0.203740
## Store16 -8.096e-01 4.941e-02 -16.385 < 2e-16 ***
## Store17 6.008e-01 1.852e-01 3.245 0.001184 **
## Store18 7.857e-01 1.738e-01 4.522 6.29e-06 ***
## Store19 1.057e+00 1.729e-01 6.113 1.06e-09 ***
## Store20 4.028e-01 2.246e-02 17.934 < 2e-16 ***
## Store21 -6.831e-01 1.659e-02 -41.176 < 2e-16 ***
## Store22 6.622e-01 1.647e-01 4.020 5.92e-05 ***
## Store23 9.373e-01 1.718e-01 5.455 5.17e-08 ***
## Store24 9.895e-01 1.732e-01 5.714 1.18e-08 ***
## Store25 -6.837e-01 2.257e-02 -30.297 < 2e-16 ***
## Store26 6.845e-01 1.729e-01 3.960 7.62e-05 ***
## Store27 1.210e+00 1.646e-01 7.352 2.31e-13 ***
## Store28 1.166e+00 1.938e-01 6.015 1.94e-09 ***
## Store29 1.098e-01 1.748e-01 0.628 0.530191
## Store30 -1.245e+00 1.659e-02 -75.022 < 2e-16 ***
## Store31 -9.008e-02 1.659e-02 -5.430 5.92e-08 ***
## Store32 4.588e-02 5.047e-02 0.909 0.363367
## Store33 -5.887e-01 1.872e-01 -3.145 0.001673 **
## Store34 7.707e-01 1.879e-01 4.101 4.19e-05 ***
## Store35 5.783e-01 1.652e-01 3.501 0.000469 ***
## Store36 -1.323e+00 1.708e-02 -77.464 < 2e-16 ***
## Store37 -1.065e+00 1.707e-02 -62.377 < 2e-16 ***
## Store38 -1.276e-01 1.938e-01 -0.658 0.510370
## Store39 -6.575e-02 1.707e-02 -3.853 0.000118 ***
## Store40 5.765e-01 1.718e-01 3.355 0.000799 ***
## Store41 7.365e-02 4.929e-02 1.494 0.135155
## Store42 1.621e-01 1.869e-01 0.867 0.385843
## Store43 -7.201e-01 2.914e-02 -24.715 < 2e-16 ***
## Store44 -4.866e-01 1.855e-01 -2.623 0.008733 **
## Store45 -2.432e-01 6.605e-02 -3.682 0.000234 ***
## Is_Holiday 2.149e-02 6.919e-03 3.106 0.001911 **
## Is_Q4 -2.510e-02 9.754e-03 -2.574 0.010099 *
## Is_Nov_Dec 1.768e-01 9.254e-03 19.107 < 2e-16 ***
## Temperature 4.997e-04 2.317e-04 2.156 0.031109 *
## Fuel_Price 1.581e-02 1.017e-02 1.554 0.120185
## CPI 1.380e-02 2.159e-03 6.393 1.80e-10 ***
## Unemployment -2.213e-02 6.331e-03 -3.496 0.000478 ***
## Month_num 3.550e-02 6.210e-03 5.716 1.16e-08 ***
## Week -9.145e-03 1.409e-03 -6.490 9.51e-11 ***
## Year -7.726e-02 1.179e-02 -6.550 6.39e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.117 on 4445 degrees of freedom
## Multiple R-squared: 0.9618, Adjusted R-squared: 0.9614
## F-statistic: 2074 on 54 and 4445 DF, p-value: < 2.2e-16
kept <- names(coef(step_model))
dropped <- setdiff(names(coef(lm_model)), kept)
cat("Features dropped by stepwise:", paste(dropped, collapse = ", "), "\n")
## Features dropped by stepwise:
step_preds <- predict(step_model, newdata = test)
res_step <- evaluate_model("Stepwise LM", test$Log_Sales, step_preds)
The stepwise regression model retained all predictors, resulting in the same model as the original linear regression. This suggests that each variable contributed useful predictive information to the model. The model achieved an adjusted R² of 0.9614, indicating excellent predictive performance, with approximately 96% of the variation in log weekly sales explained by the predictors.
set.seed(42)
rf_model <- randomForest(
formula_lm,
data = train,
ntree = 300,
mtry = 4,
importance = TRUE,
nodesize = 5
)
rf_model
##
## Call:
## randomForest(formula = formula_lm, data = train, ntree = 300, mtry = 4, importance = TRUE, nodesize = 5)
## Type of random forest: regression
## Number of trees: 300
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.005695193
## % Var explained: 98.39
The Random Forest model did an excellent job predicting the results because it had very low errors and was able to explain about 98% of the changes in the data.
rf_preds <- predict(rf_model, newdata = test)
res_rf <- evaluate_model("Random Forest", test$Log_Sales, rf_preds)
imp_df <- importance(rf_model) %>%
as.data.frame() %>%
rownames_to_column("Feature") %>%
arrange(desc(`%IncMSE`)) %>%
head(10)
p_imp <- ggplot(imp_df, aes(reorder(Feature, `%IncMSE`), `%IncMSE`)) +
geom_col(fill = BLUE, width = 0.7) +
geom_text(aes(label = round(`%IncMSE`, 1)), hjust = -0.2, size = 3.5, colour = "grey20") +
coord_flip() +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(title = "Random Forest — Top 10 Feature Importance",
x = NULL, y = "% Increase in MSE") +
theme_walmart
print(p_imp)
Store dominates every other variable by a wide margin, making it the strongest predictor of Weekly Sales. Week comes in second, showing that seasonality still plays a major role. Unemployment and CPI rank surprisingly high, suggesting that economic conditions matter more to the model than the simple correlations implied. Meanwhile, Fuel Price, Temperature, Holiday status, and other calendar variables contribute useful information but have a much smaller impact on predictions.
X_train <- model.matrix(formula_lm, data = train)[, -1]
X_test <- model.matrix(formula_lm, data = test)[, -1]
y_train <- train$Log_Sales
y_test <- test$Log_Sales
set.seed(42)
xgb_model <- xgb.train(
params = list(objective = "reg:squarederror", eta = 0.05,
max_depth = 5, subsample = 0.8),
data = xgb.DMatrix(X_train, label = y_train),
nrounds = 300,
watchlist = list(test = xgb.DMatrix(X_test, label = y_test)),
early_stopping_rounds = 30,
print_every_n = 50,
verbose = 0
)
## Warning in throw_err_or_depr_msg("Parameter '", match_old, "' has been renamed
## to '", : Parameter 'watchlist' has been renamed to 'evals'. This warning will
## become an error in a future version.
xgb_preds <- predict(xgb_model, xgb.DMatrix(X_test))
res_xgb <- evaluate_model("XGBoost", y_test, xgb_preds)
# Actual vs Predicted — XGBoost
avp_df <- tibble(
Actual = exp(y_test) / 1e6,
Predicted = exp(xgb_preds) / 1e6
)
p_avp <- ggplot(avp_df, aes(Actual, Predicted)) +
geom_point(alpha = 0.3, colour = BLUE, size = 1) +
geom_abline(slope = 1, intercept = 0, colour = RED, linewidth = 1) +
scale_x_continuous(labels = label_dollar(suffix = "M")) +
sales_y +
labs(title = "XGBoost — Actual vs Predicted",
x = "Actual (USD M)", y = "Predicted (USD M)") +
theme_walmart
print(p_avp)
# Residuals plot
resid_df <- tibble(
Fitted = exp(xgb_preds) / 1e6,
Residual = (exp(y_test) - exp(xgb_preds)) / 1e6
)
p_resid <- ggplot(resid_df, aes(Fitted, Residual)) +
geom_point(alpha = 0.3, colour = BLUE, size = 0.9) +
geom_hline(yintercept = 0, colour = RED, linewidth = 1) +
geom_smooth(method = "loess", se = FALSE, colour = YELLOW) +
scale_x_continuous(labels = label_dollar(suffix = "M")) +
sales_y +
labs(title = "XGBoost — Residuals vs Fitted",
x = "Fitted (USD M)", y = "Residuals (USD M)") +
theme_walmart
print(p_resid)
## `geom_smooth()` using formula = 'y ~ x'
Actual vs Predicted
Residuals vs Fitted
# Result of all Model
results <- bind_rows(res_lm, res_step, res_rf, res_xgb)
knitr::kable(results, digits = 4)
| Model | RMSE_USD | MAE_USD | R2 |
|---|---|---|---|
| Linear Regression | 107837 | 76142 | 0.9596 |
| Stepwise LM | 107837 | 76142 | 0.9596 |
| Random Forest | 163287 | 108039 | 0.9074 |
| XGBoost | 155633 | 103786 | 0.9159 |
The linear regression and stepwise models tied for the best overall performance. This is because the store dummy variables captured each store’s baseline sales so well that the tree models couldn’t really improve on it.
Linear Regression & Stepwise LM: RMSE = $107K, MAE = $76K, R² = 0.9596 — best performing models, identical results since stepwise kept all features
XGBoost: RMSE = $155K, MAE = $103K, R² = 0.9159 — best tree model, slightly better than Random Forest because it fixes errors step by step
Random Forest: RMSE = $163K, MAE = $108K, R² = 0.9074 — a bit weaker than XGBoost but still explains over 90% of the variation
All four models performed well with R² above 90%, showing that weekly sales are very predictable when you know the store and the time of year. The linear regression did best, which makes sense because it already learned each store’s normal sales pattern from the training data and applied it to 2012. XGBoost did better than Random Forest because it fixes mistakes step by step instead of averaging many trees. The feature importance plot shows the main driver is clearly the store, with week of the year next, while economic factors matter much less. Overall, the results show that just knowing the store and season is enough to predict sales pretty well.