Introduction

This report delineates the application of supervised and unsupervised machine learning methodologies to analyze datasets for predictive and exploratory purposes, respectively. Utilizing a sales dataset from Walmart, the report embarks on regression and classification analyses under the supervised learning paradigm. Concurrently, an online retail dataset facilitates the exploration of clustering analysis through unsupervised learning, addressing distinct challenges inherent to each approach.

Data Wrangling and Cleaning for Supervised Learning (More on the Python Script -> ‘3189 Coursework Store Sales.ipynb’)

# Import All Libraries
library(tidyverse)
library(dplyr)
library(lubridate)
library(caret)
library(e1071)
library(randomForest)
library(glmnet)
library(rpart)
library(rpart.plot)
library(class)
library(ggcorrplot)
library(ggplot2)
library(reshape2)
library(scales)
library(plotly)
library(factoextra)
library(ggfortify)
library(cluster)
library(Metrics)
library(klaR)
library(dendextend)
library(patchwork)
library(smotefamily)

SUPERVISED LEARNING: Walmart Sales**

Data Loading & Initial Checks

Loading the Walmart Dataset

# Load Walmart dataset
walmart_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Walmart.csv")

Structure and missing data check

# Structure and missing data check
str(walmart_df)

## 'data.frame':    6435 obs. of  8 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...

cat("Total missing:", sum(is.na(walmart_df)), "\n")

## Total missing: 0

cat("Duplicates:", sum(duplicated(walmart_df)), "\n")

## Duplicates: 0

Feature Engineering

# Convert date
walmart_df$Date <- as.Date(walmart_df$Date, format="%d-%m-%Y")

# Create 'Week' from Date for modeling
walmart_df$Week <- lubridate::week(walmart_df$Date)

# Create numeric date for correlation
walmart_df$Date_numeric <- as.numeric(walmart_df$Date)

# Convert 'Holiday_Flag' to factor
walmart_df$Holiday_Flag <- as.factor(walmart_df$Holiday_Flag)

# Convert 'Store' to factor
walmart_df$Store <- as.factor(walmart_df$Store)

str(walmart_df)

## 'data.frame':    6435 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-12" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Date_numeric: num  14645 14652 14659 14666 14673 ...

cat("Total missing:", sum(is.na(walmart_df)), "\n")

## Total missing: 0

cat("Duplicates:", sum(duplicated(walmart_df)), "\n")

## Duplicates: 0

Exploratory Data Analysis

library(dplyr)

walmart_df_reg1 <- walmart_df

# Target Encoding Stores
walmart_df_reg1 <- walmart_df_reg1 %>%
  arrange(Store, Week) %>%  
  group_by(Store) %>%
  mutate(
    Stores = cummean(lag(Weekly_Sales))  
  ) %>%
  ungroup()


walmart_df_reg1$Stores[is.na(walmart_df_reg1$Stores)] <- mean(walmart_df$Weekly_Sales, na.rm = TRUE)

# Target Encoding Holiday_Sales
walmart_df_reg1 <- walmart_df_reg1 %>%
  arrange(Holiday_Flag, Week) %>%
  group_by(Holiday_Flag) %>%
  mutate(
    Holiday_Sales = cummean(lag(Weekly_Sales))
  ) %>%
  ungroup()

walmart_df_reg1$Holiday_Sales[is.na(walmart_df_reg1$Holiday_Sales)] <- mean(walmart_df$Weekly_Sales, na.rm = TRUE)

str(walmart_df_reg1)

## tibble [6,435 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Store        : Factor w/ 45 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Date         : Date[1:6435], format: "2011-01-07" "2012-01-06" ...
##  $ Weekly_Sales : num [1:6435] 1444732 1550370 1758051 1799520 378241 ...
##  $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Temperature  : num [1:6435] 48.3 49 44.7 46.8 53.4 ...
##  $ Fuel_Price   : num [1:6435] 2.98 3.16 2.98 3.16 2.98 ...
##  $ CPI          : num [1:6435] 211 220 211 219 215 ...
##  $ Unemployment : num [1:6435] 7.74 7.35 8.03 7.06 7.55 ...
##  $ Week         : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date_numeric : num [1:6435] 14981 15345 14981 15345 14981 ...
##  $ Stores       : num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
##  $ Holiday_Sales: num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...

Correlation & Basic Summaries

library("corrplot")
numeric_columns <- walmart_df_reg1[c("Weekly_Sales", "Temperature", "Fuel_Price", "CPI", "Unemployment", "Date_numeric", "Week", "Stores")]
cor_matrix <- cor(numeric_columns, use = "complete.obs")

## Warning in cor(numeric_columns, use = "complete.obs"): the standard deviation
## is zero

ggcorrplot(
  cor_matrix, type = "lower", lab = TRUE, lab_size = 3, tl.srt = 45,
  title = "Correlation plot for Walmart Dataset", ggtheme=theme_minimal()
)

#### Correlation Boxplot for feature iportance

library(DescTools)  
library(psych)      

# Exclude Weekly_Sales
all_vars <- setdiff(names(walmart_df), "Weekly_Sales")

corr_df <- data.frame(Variable=character(), Correlation=numeric(), Type=character(), stringsAsFactors=FALSE)


for (var in all_vars) {
  if (is.numeric(walmart_df[[var]])) {
    corr_val <- cor(walmart_df[[var]], walmart_df$Weekly_Sales, use="complete.obs")
    corr_df <- rbind(corr_df, data.frame(Variable=var, Correlation=corr_val, Type="Numeric"))
  } else if (is.factor(walmart_df[[var]]) || is.character(walmart_df[[var]])) {
    model <- aov(Weekly_Sales ~ as.factor(walmart_df[[var]]), data=walmart_df)
    eta_sq <- EtaSq(model, type=1)[1, "eta.sq"]
    corr_df <- rbind(corr_df, data.frame(Variable=var, Correlation=eta_sq, Type="Categorical"))
  }
}

ggplot(corr_df, aes(x=reorder(Variable, abs(Correlation)), y=abs(Correlation), fill=Type)) +
  geom_bar(stat="identity") +
  coord_flip() +
  labs(title="Correlation Strength to Weekly_Sales",
       x="Variable", y="Correlation / Effect Size") +
  theme_minimal()

Boxplots & Distribution

1) Single Boxplot: Weekly Sales

ggplot(walmart_df, aes(y = Weekly_Sales)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  coord_flip() +
  labs(title = "Distribution of Weekly Sales", y = "Weekly Sales")

median_sales <- median(walmart_df$Weekly_Sales)
cat("Median Weekly Sales:", median_sales, "\n")

## Median Weekly Sales: 960746

q1_sales <- quantile(walmart_df$Weekly_Sales, 0.25)
cat("Q1 Weekly Sales:", q1_sales, "\n")

## Q1 Weekly Sales: 553350.1

q3_sales <- quantile(walmart_df$Weekly_Sales, 0.75)
cat("Q3 Weekly Sales:", q3_sales, "\n")

## Q3 Weekly Sales: 1420159

2) Multiple Boxplot for Distribution Across Stores

mean_sales <- walmart_df %>%
  group_by(Store) %>%
  summarise(Mean_Weekly_Sales = mean(Weekly_Sales)) %>%
  arrange(Mean_Weekly_Sales)

walmart_df <- left_join(walmart_df, mean_sales, by = "Store")
walmart_df$Store <- factor(walmart_df$Store, levels = mean_sales$Store)

ggplot(walmart_df, aes(x = Store, y = Weekly_Sales)) +
  geom_boxplot(aes(fill = Mean_Weekly_Sales)) +
  scale_fill_gradient(low = "#ff00d4", high = "#00ddff") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Mean of Weekly Sales Across Different Stores", 
       x = "Store Number", y = "Weekly Sales") +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

3) Holiday vs Non-Holiday

walmart_df$Holiday_Flag <- factor(walmart_df$Holiday_Flag,
                                  levels = c(0, 1),
                                  labels = c("Non-Holidays", "Holidays"))

ggplot(walmart_df, aes(x = Holiday_Flag, y = Weekly_Sales, fill = Holiday_Flag)) +
  geom_boxplot() +
  scale_y_continuous(labels = comma) +
  theme_minimal() +
  labs(
    title = "Comparison of Weekly Sales (Holiday vs. Non-Holiday)",
    x = "", y = "Weekly Sales"
  ) +
  scale_fill_manual(values = c("#00a1ff", "#00ff8f")) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

holiday_summary <- walmart_df %>%
  group_by(Holiday_Flag) %>%
  summarise(Median_Sales = median(Weekly_Sales, na.rm = TRUE))
cat("Holiday vs Non-Holiday Median Sales:\n")

## Holiday vs Non-Holiday Median Sales:

print(holiday_summary)

## # A tibble: 2 × 2
##   Holiday_Flag Median_Sales
##   <fct>               <dbl>
## 1 Non-Holidays      956211.
## 2 Holidays         1018538.

Average Weekly Sales by Date (Line Plot)

avg_sales_by_date <- walmart_df %>%
  group_by(Date) %>%
  summarise(Weekly_Sales=mean(Weekly_Sales))

avg_sales_by_date$Color <- ifelse(avg_sales_by_date$Weekly_Sales>1200000,"green","red")

ggplot(avg_sales_by_date, aes(x=Date, y=Weekly_Sales)) +
  geom_line(color="black") +
  geom_point(aes(color=Color), size=2) +
  scale_x_date(date_breaks="3 months", date_labels="%d-%m-%Y") +
  scale_y_continuous(labels=comma) +
  scale_color_identity() +
  labs(title="Average Weekly Sales by Date",
       x="Date", y="Average Weekly Sales") +
  theme_minimal() +
  theme(axis.text.x=element_text(angle=60, hjust=1),
        plot.title=element_text(hjust=0.5),
        legend.position="none")

summary(avg_sales_by_date$Weekly_Sales)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  879997  997346 1027642 1046965 1062045 1798476

REGRESSION ANALYSIS

Feature Selection for Linear, Ridge and Lasso Regression (Exhaustive Feature Selection)

library(caret)
library(dplyr)

# Feature Selection
rfe_columns <- c("Stores", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Sales")

train_x_reg1 <- walmart_df_reg1[, rfe_columns]
train_y_reg1 <- walmart_df_reg1$Weekly_Sales

# Ensure numeric columns
train_x_reg1 <- train_x_reg1 %>%
  mutate(across(everything(), ~ as.numeric(as.character(.))))


# Feature Selection for Linear Regression
rfe_ctrl_reg <- rfeControl(functions = lmFuncs, method = "cv", number = 5)

# Find best features
set.seed(2025)
rfe_results_reg <- rfe(
  x = train_x_reg1,
  y = train_y_reg1,
  sizes = 1:ncol(train_x_reg1),
  rfeControl = rfe_ctrl_reg
)

cat("\n=== RFE RESULTS (Regression - Target Encoding) ===\n")

## 
## === RFE RESULTS (Regression - Target Encoding) ===

print(rfe_results_reg)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 561058  0.01131 468405  10956   0.001519  5653         
##          2 561194  0.01084 468478  10993   0.001509  5624         
##          3 559714  0.01631 467748  10688   0.004151  5654         
##          4 557739  0.02362 467609  10303   0.008967  5497         
##          5 555735  0.03072 467280  10844   0.008195  6129        *
##          6 555735  0.03072 467280  10844   0.008195  6129         
##          7 555735  0.03072 467280  10844   0.008195  6129         
## 
## The top 5 variables (out of 5):
##    Unemployment, Fuel_Price, Week, Temperature, CPI

Linear Regression Diagnostic Plot

walmart_df_reg1$Week <- week(walmart_df_reg1$Date)
lm_model <- lm(Weekly_Sales ~ Fuel_Price + Unemployment + Week + Stores + Temperature + CPI,
               data = walmart_df_reg1)

par(mfrow = c(2, 2))
plot(lm_model)

summary_model <- summary(lm_model)
rmse_lm <- sqrt(mean(summary_model$residuals^2))

# Check Data Structure
str(walmart_df_reg1)

## tibble [6,435 × 12] (S3: tbl_df/tbl/data.frame)
##  $ Store        : Factor w/ 45 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
##  $ Date         : Date[1:6435], format: "2011-01-07" "2012-01-06" ...
##  $ Weekly_Sales : num [1:6435] 1444732 1550370 1758051 1799520 378241 ...
##  $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Temperature  : num [1:6435] 48.3 49 44.7 46.8 53.4 ...
##  $ Fuel_Price   : num [1:6435] 2.98 3.16 2.98 3.16 2.98 ...
##  $ CPI          : num [1:6435] 211 220 211 219 215 ...
##  $ Unemployment : num [1:6435] 7.74 7.35 8.03 7.06 7.55 ...
##  $ Week         : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date_numeric : num [1:6435] 14981 15345 14981 15345 14981 ...
##  $ Stores       : num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
##  $ Holiday_Sales: num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...

Linear Regression Train Test Split

set.seed(2025)
walmart_df_reg1 <- dplyr::select(walmart_df_reg1, -Date_numeric, -Holiday_Flag, -Store,-Holiday_Sales, -Date)
trainIndex1 <- createDataPartition(walmart_df_reg1$Weekly_Sales, p = 0.8, list = FALSE)
trainData1 <- walmart_df_reg1[trainIndex1, ]
testData1 <- walmart_df_reg1[-trainIndex1, ]

cat("\nTraining Data Size:", nrow(trainData1), "\n")

## 
## Training Data Size: 5151

cat("Testing  Data Size:", nrow(testData1), "\n")

## Testing  Data Size: 1284

# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)

1) Linear Regression Evaluation

# Linear Regression (method="lm")
lm_cv <- train(
  Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week + Temperature + CPI,
  data = trainData1,
  method = "lm",
  trControl = cv_ctrl_reg
)

predictions <- predict(lm_cv, newdata = testData1)

evaluation_metrics <- postResample(predictions, testData1$Weekly_Sales)
print(evaluation_metrics)

##         RMSE     Rsquared          MAE 
## 5.679807e+05 3.500834e-02 4.732756e+05

Linear Regression Coefficient

cat("\nCoefficients of final linear model:\n")

## 
## Coefficients of final linear model:

print(summary(lm_cv$finalModel)$coefficients)

##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)  1577489.941 90597.7840 17.4120146 5.063833e-66
## Fuel_Price      2443.011 17538.4724  0.1392944 8.892229e-01
## Unemployment  -39107.274  4430.9319 -8.8259705 1.463885e-18
## Week            3597.800   563.4975  6.3847661 1.867449e-10
## Temperature    -1403.960   452.7820 -3.1007424 1.940802e-03
## CPI            -1372.998   216.9979 -6.3272446 2.707189e-10

2.1) Ridge Regression

# Ridge Regression (Use same variables as LR)
ridge_cv <- train(
  Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week +  Temperature + CPI,
  data = trainData1,
  method = "glmnet",
  trControl = cv_ctrl_reg,
  tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 0.1, by = 0.01)),
  preProcess = c("center", "scale")
)

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

cat("\n2) Ridge Regression:\n")

## 
## 2) Ridge Regression:

print(ridge_cv)

## glmnet 
## 
## 5151 samples
##    6 predictor
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4122, 4121, 4121, 4120, 4120 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared    MAE     
##   0.001   553011.4  0.02864283  465624.9
##   0.011   553011.4  0.02864283  465624.9
##   0.021   553011.4  0.02864283  465624.9
##   0.031   553011.4  0.02864283  465624.9
##   0.041   553011.4  0.02864283  465624.9
##   0.051   553011.4  0.02864283  465624.9
##   0.061   553011.4  0.02864283  465624.9
##   0.071   553011.4  0.02864283  465624.9
##   0.081   553011.4  0.02864283  465624.9
##   0.091   553011.4  0.02864283  465624.9
## 
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.091.

cat("\nBest Tuning Parameters for Ridge (alpha = 0):\n")

## 
## Best Tuning Parameters for Ridge (alpha = 0):

print(ridge_cv$bestTune)

##    alpha lambda
## 10     0  0.091

# Prediction
ridge_predictions <- predict(ridge_cv, newdata = testData1)

# Evaluation
ridge_eval <- postResample(ridge_predictions, testData1$Weekly_Sales)
cat("\nRidge Model Evaluation:\n")

## 
## Ridge Model Evaluation:

print(ridge_eval)

##         RMSE     Rsquared          MAE 
## 5.680049e+05 3.500146e-02 4.732875e+05

2.2) Lasso Regression

# Lasso Regression (Use same variables as LR)
lasso_cv <- train(
  Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week +  Temperature + CPI,
  data = trainData1,
  method = "glmnet",
  trControl = cv_ctrl_reg,
  tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)),
  preProcess = c("center", "scale")
)

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores

cat("\n3) Lasso Regression:\n")

## 
## 3) Lasso Regression:

print(lasso_cv)

## glmnet 
## 
## 5151 samples
##    6 predictor
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4121, 4123, 4120, 4120, 4120 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared    MAE     
##   0.001   552731.1  0.02904947  465469.7
##   0.011   552731.1  0.02904947  465469.7
##   0.021   552731.1  0.02904947  465469.7
##   0.031   552731.1  0.02904947  465469.7
##   0.041   552731.1  0.02904947  465469.7
##   0.051   552731.1  0.02904947  465469.7
##   0.061   552731.1  0.02904947  465469.7
##   0.071   552731.1  0.02904947  465469.7
##   0.081   552731.1  0.02904947  465469.7
##   0.091   552731.1  0.02904947  465469.7
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.091.

cat("\nBest Tuning Parameters for Lasso (alpha = 1):\n")

## 
## Best Tuning Parameters for Lasso (alpha = 1):

print(lasso_cv$bestTune)

##    alpha lambda
## 10     1  0.091

# prediction
lasso_predictions <- predict(lasso_cv, newdata = testData1)

# Evaluation
lasso_eval <- postResample(lasso_predictions, testData1$Weekly_Sales)
cat("\nLasso Model Evaluation:\n")

## 
## Lasso Model Evaluation:

print(lasso_eval)

##         RMSE     Rsquared          MAE 
## 5.679927e+05 3.499983e-02 4.732677e+05

Feature Selection for Decision Tree (Exhaustive Feature Selection)

# FEATURE SELECTION for REGRESSION via RFE
candidate_features_reg <- c("Store","Temperature","CPI","Unemployment","Week","Fuel_Price","Holiday_Flag")
walmart_df_reg2 <- walmart_df

# Split Train Test
train_x_reg2 <- walmart_df_reg2[, candidate_features_reg]
train_y_reg2 <- walmart_df_reg2$Weekly_Sales

# RFE control
rfe_ctrl_reg_tree <- rfeControl(functions = treebagFuncs, method = "cv", number = 5)

# Run RFE
set.seed(2025)
rfe_results_reg_tree <- rfe(
  x = train_x_reg2,
  y = train_y_reg2,
  sizes = 1:length(candidate_features_reg),
  rfeControl = rfe_ctrl_reg_tree
)

cat("\n=== RFE RESULTS (Decision Tree Regression) ===\n")

## 
## === RFE RESULTS (Decision Tree Regression) ===

print(rfe_results_reg_tree)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 184500   0.8930 119062  11597   0.011085  3812         
##          2 175669   0.9029 116596  10934   0.011358  4764         
##          3 172510   0.9066 115819   7697   0.007349  3644         
##          4 172333   0.9067 115523   8211   0.007577  4727         
##          5 171927   0.9072 115734   7880   0.007959  4632        *
##          6 172417   0.9067 115491   7889   0.007380  4170         
##          7 173305   0.9057 116135   7815   0.007314  4432         
## 
## The top 5 variables (out of 5):
##    Store, Week, CPI, Unemployment, Temperature

# Evaluate
best_features_reg_tree <- predictors(rfe_results_reg_tree)
cat("\nChosen subset for Decision Tree Regression:\n")

## 
## Chosen subset for Decision Tree Regression:

print(best_features_reg_tree)

## [1] "Store"        "Week"         "CPI"          "Unemployment" "Temperature"

Spliting Train test

set.seed(2025)
walmart_df_reg2 <- dplyr::select(walmart_df_reg2, -Date_numeric, -Fuel_Price, -Holiday_Flag)
trainIndex2 <- createDataPartition(walmart_df_reg2$Weekly_Sales, p = 0.8, list = FALSE)
trainData2 <- walmart_df_reg2[trainIndex2, ]
testData2 <- walmart_df_reg2[-trainIndex2, ]

cat("\nTraining Data Size:", nrow(trainData2), "\n")

## 
## Training Data Size: 5151

cat("Testing  Data Size:", nrow(testData2), "\n")

## Testing  Data Size: 1284

# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)

3) Decision Tree Evaluation

# Decision Tree (method="rpart")
dt_grid <- expand.grid(cp=seq(0.0001, 0.01, by=0.002))
dt_cv <- train(
  Weekly_Sales ~ Store + Week + CPI + Unemployment + Temperature,
  data = walmart_df_reg2,
  method = "rpart",
  trControl = cv_ctrl_reg,
  tuneGrid = dt_grid
)
cat("\n4) Decision Tree:\n")

## 
## 4) Decision Tree:

print(dt_cv)

## CART 
## 
## 6435 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 5148, 5148, 5148, 5149, 5147 
## Resampling results across tuning parameters:
## 
##   cp      RMSE      Rsquared   MAE      
##   0.0001  150733.0  0.9286377   84998.36
##   0.0021  179021.6  0.8992878  109588.06
##   0.0041  189146.2  0.8874999  119394.29
##   0.0061  205798.2  0.8667599  137033.07
##   0.0081  220335.6  0.8475742  153190.28
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 1e-04.

cat("\nBest Tuning (cp):\n")

## 
## Best Tuning (cp):

print(dt_cv$bestTune)

##      cp
## 1 1e-04

# Prediction
dt_predictions <- predict(dt_cv, newdata = testData2)

# Evaluation
dt_eval <- postResample(dt_predictions, testData2$Weekly_Sales)
cat("\nDecision Tree Model Evaluation:\n")

## 
## Decision Tree Model Evaluation:

print(dt_eval)

##         RMSE     Rsquared          MAE 
## 1.207413e+05 9.524806e-01 7.070176e+04

Decision Tree Visualization

# Visualizing Decision Tree
rpart.plot(dt_cv$finalModel, main="Final Decision Tree (Regression)", cex = 0.3, cex.main = 1.5)

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

Feature Selection for Random Forest Regression (Exhaustive Feature Selection)

# FEATURE SELECTION for REGRESSION via RFE
candidate_features_reg <- c("Store","Temperature","CPI","Unemployment","Week","Fuel_Price","Holiday_Flag")
walmart_df_reg3 <- walmart_df

train_x_reg3 <- walmart_df_reg3[, candidate_features_reg]
train_y_reg3 <- walmart_df_reg3$Weekly_Sales

# RFE control for Random Forest Regression
rfe_ctrl_reg_rf <- rfeControl(
  functions = rfFuncs,
  method    = "cv",
  number    = 5
)

# Run RFE
set.seed(2025)
rfe_results_reg_rf <- rfe(
  x = train_x_reg3,
  y = train_y_reg3,
  sizes = 1:length(candidate_features_reg),
  rfeControl = rfe_ctrl_reg_rf
)

cat("\n=== RFE RESULTS (Random Forest Regression) ===\n")

## 
## === RFE RESULTS (Random Forest Regression) ===

print(rfe_results_reg_rf)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 162861   0.9167 91052  12066   0.010473  3066         
##          2 128814   0.9485 73960   7502   0.005411  2539         
##          3 119729   0.9569 67184   9935   0.006769  1450         
##          4 126218   0.9523 70091  10928   0.007253  2095         
##          5 131999   0.9516 77531  10658   0.006881  2696         
##          6 119756   0.9553 64427  10726   0.007576  2125         
##          7 111063   0.9627 63449  10119   0.006257  2091        *
## 
## The top 5 variables (out of 7):
##    Store, Week, CPI, Unemployment, Temperature

Train Test Split

set.seed(2025)
walmart_df_reg3 <- dplyr::select(walmart_df_reg3, -Date_numeric, -Fuel_Price, -Holiday_Flag)
trainIndex3 <- createDataPartition(walmart_df_reg3$Weekly_Sales, p = 0.8, list = FALSE)
trainData3 <- walmart_df_reg3[trainIndex3, ]
testData3 <- walmart_df_reg3[-trainIndex3, ]

cat("\nTraining Data Size:", nrow(trainData3), "\n")

## 
## Training Data Size: 5151

cat("Testing  Data Size:", nrow(testData3), "\n")

## Testing  Data Size: 1284

# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)

Random Forest Model Evaluation

# Random Forest (method="rf")
rf_grid <- expand.grid(mtry=c(2,3,4))
rf_cv <- train(
  Weekly_Sales ~ Store + Week+ CPI+ Unemployment+ Temperature,
  data = trainData3,
  method = "rf",
  trControl = cv_ctrl_reg,
  tuneGrid = rf_grid,
  ntree = 500
)
cat("\n5) Random Forest:\n")

## 
## 5) Random Forest:

print(rf_cv)

## Random Forest 
## 
## 5151 samples
##    5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4120, 4121, 4121, 4121, 4121 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     357477.0  0.8766710  292696.2
##   3     277247.7  0.8798755  219199.6
##   4     228989.8  0.8897462  169283.5
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.

cat("\nBest Tuning (mtry):\n")

## 
## Best Tuning (mtry):

print(rf_cv$bestTune)

##   mtry
## 3    4

cat("\nRandom Forest Variable Importance:\n")

## 
## Random Forest Variable Importance:

print(varImp(rf_cv))

## rf variable importance
## 
##   only 20 most important variables shown (out of 48)
## 
##              Overall
## Store20       100.00
## Store14        89.49
## Store4         75.47
## Store13        74.67
## Store10        60.06
## Store2         56.48
## Store27        40.23
## Store44        38.02
## Unemployment   37.53
## Store5         33.65
## CPI            32.18
## Store36        28.58
## Store3         21.74
## Store6         20.28
## Store30        19.94
## Store38        18.06
## Store1         17.53
## Store16        14.88
## Store37        14.39
## Week           13.12

# Prediction
rf_predictions <- predict(rf_cv, newdata = testData3)
rf_eval <- postResample(rf_predictions, testData3$Weekly_Sales)
cat("\nRandom Forest Model Evaluation:\n")

## 
## Random Forest Model Evaluation:

print(rf_eval)

##         RMSE     Rsquared          MAE 
## 2.219946e+05 8.895877e-01 1.663702e+05

Compare 5 Regression Models

# Create a list of your models and evaluation results
models <- list(
  Linear        = list(eval = evaluation_metrics),
  Ridge         = list(eval = ridge_eval),
  Lasso         = list(eval = lasso_eval),
  DecisionTree  = list(eval = dt_eval),
  RandomForest  = list(eval = rf_eval)
)

results_testing <- data.frame(Model = character(),
                              RMSE = numeric(),
                              R2 = numeric(),
                              stringsAsFactors = FALSE)

# Loop over each model and extract performance metrics
for (model_name in names(models)) {
  metrics <- models[[model_name]]$eval
  
  # Append the results to the data frame
  results_testing <- rbind(results_testing,
                           data.frame(Model = model_name,
                                      RMSE = metrics["RMSE"],
                                      R2   = metrics["Rsquared"]))
}

rownames(results_testing) <- NULL
cat("\n\n=== FINAL REGRESSION PERFORMANCE ON TESTING DATA (5 Models) ===\n")

## 
## 
## === FINAL REGRESSION PERFORMANCE ON TESTING DATA (5 Models) ===

print(results_testing, row.names = FALSE)

##         Model     RMSE         R2
##        Linear 567980.7 0.03500834
##         Ridge 568004.9 0.03500146
##         Lasso 567992.7 0.03499983
##  DecisionTree 120741.3 0.95248058
##  RandomForest 221994.6 0.88958767

# Classification

# Reload Walmart dataset
walmart_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Walmart.csv")

Structure and missing data check

# Structure and missing data check
str(walmart_df)

## 'data.frame':    6435 obs. of  8 variables:
##  $ Store       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : chr  "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...

cat("Total missing:", sum(is.na(walmart_df)), "\n")

## Total missing: 0

cat("Duplicates:", sum(duplicated(walmart_df)), "\n")

## Duplicates: 0

Feature Engineering

# Convert date
walmart_df$Date <- as.Date(walmart_df$Date, format="%d-%m-%Y")

# Create 'Month' from Date for modeling
walmart_df$Week <- lubridate::week(walmart_df$Date)

# Create numeric date for correlation
walmart_df$Date_numeric <- as.numeric(walmart_df$Date)

# Convert 'Holiday_Flag' to factor
walmart_df$Holiday_Flag <- as.factor(walmart_df$Holiday_Flag)

# Convert 'Store' to factor
walmart_df$Store <- as.factor(walmart_df$Store)

str(walmart_df)

## 'data.frame':    6435 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-12" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Date_numeric: num  14645 14652 14659 14666 14673 ...

cat("Total missing:", sum(is.na(walmart_df)), "\n")

## Total missing: 0

cat("Duplicates:", sum(duplicated(walmart_df)), "\n")

## Duplicates: 0

Create Binary Variables (High vs Low Sales)

# Create a binary target: "High" vs "Low" if Weekly_Sales is above the mean
walmart_df_cl <- walmart_df

walmart_df_cl$Sales_Class <- ifelse(walmart_df_cl$Weekly_Sales > mean(walmart_df_cl$Weekly_Sales), "High", "Low")
walmart_df_cl$Sales_Class <- as.factor(walmart_df_cl$Sales_Class)

cat("\nClass Distribution:\n")

## 
## Class Distribution:

print(table(walmart_df_cl$Sales_Class))

## 
## High  Low 
## 2876 3559

print(prop.table(table(walmart_df_cl$Sales_Class)))

## 
##      High       Low 
## 0.4469308 0.5530692

str(walmart_df_cl)

## 'data.frame':    6435 obs. of  11 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-12" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Date_numeric: num  14645 14652 14659 14666 14673 ...
##  $ Sales_Class : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...

# Drop unnecessary variable
walmart_df_cl <- dplyr::select(walmart_df_cl, -Date_numeric, -Date)
str(walmart_df_cl)

## 'data.frame':    6435 obs. of  9 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Sales_Class : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...

1) Logistic Regression Feature Selection

library(dplyr)

# Salin data
walmart_df_cls <- walmart_df_cl

# Pastikan Sales_Class bertipe faktor dengan urutan yang benar
walmart_df_cls$Sales_Class <- factor(walmart_df_cls$Sales_Class, levels = c("Low", "High"))

# Siapkan kolom kosong
walmart_df_cls$Store_WOE <- NA
walmart_df_cls$Holiday_WOE <- NA

# Fungsi untuk menghitung WoE dari data sebelumnya
calculate_woe_timeaware <- function(data, cat_var, target_var, positive_class = "High", negative_class = "Low", smoothing = 1e-6) {
  total_positive <- sum(data[[target_var]] == positive_class, na.rm = TRUE)
  total_negative <- sum(data[[target_var]] == negative_class, na.rm = TRUE)
  
  woe_table <- data %>%
    group_by(across(all_of(cat_var))) %>%
    summarise(
      count_positive = sum(!!sym(target_var) == positive_class, na.rm = TRUE),
      count_negative = sum(!!sym(target_var) == negative_class, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      perc_positive = count_positive / total_positive,
      perc_negative = count_negative / total_negative,
      WOE = log((perc_positive + smoothing) / (perc_negative + smoothing))
    ) %>%
    dplyr::select(all_of(cat_var), WOE)
  
  return(woe_table)
}

# Ambil daftar minggu unik (diurutkan agar time-aware)
weeks <- sort(unique(walmart_df_cls$Week))

# Loop untuk setiap minggu → hitung WoE hingga minggu sebelumnya (Week < t)
for (w in weeks) {
  # Data sebelumnya saja
  prior_data <- walmart_df_cls %>% filter(Week < w)
  current_rows <- which(walmart_df_cls$Week == w)
  
  if (nrow(prior_data) > 0) {
    # WoE Store
    woe_store <- calculate_woe_timeaware(prior_data, "Store", "Sales_Class")
    # Mapping ke Store_WOE untuk minggu w
    walmart_df_cls$Store_WOE[current_rows] <- walmart_df_cls$Store[current_rows] %>%
      as.character() %>%
      sapply(function(x) {
        value <- woe_store$WOE[woe_store$Store == x]
        if (length(value) == 0) return(0) else return(value)
      })
    
    # WoE Holiday_Flag
    woe_holiday <- calculate_woe_timeaware(prior_data, "Holiday_Flag", "Sales_Class")
    walmart_df_cls$Holiday_WOE[current_rows] <- walmart_df_cls$Holiday_Flag[current_rows] %>%
      as.character() %>%
      sapply(function(x) {
        value <- woe_holiday$WOE[woe_holiday$Holiday_Flag == x]
        if (length(value) == 0) return(0) else return(value)
      })
  } else {
    # Untuk minggu pertama, default WoE = 0
    walmart_df_cls$Store_WOE[current_rows] <- 0
    walmart_df_cls$Holiday_WOE[current_rows] <- 0
  }
}

# Opsional: drop Holiday_Flag jika sudah digantikan
walmart_df_cls <- walmart_df_cls %>% dplyr::select(-Holiday_Flag)

str(walmart_df_cls)

## 'data.frame':    6435 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Store_WOE   : num  10.9 10.9 10.9 10.9 10.9 ...
##  $ Holiday_WOE : num  0 0.2223 -0.0395 -0.0315 -0.0255 ...

Train Test Split

# TRAIN/TEST SPLIT (80/20)
set.seed(2025)
trainIndex <- createDataPartition(walmart_df_cls$Sales_Class, p=0.8, list=FALSE)
trainData_cls <- walmart_df_cls[trainIndex, ]
testData_cls  <- walmart_df_cls[-trainIndex, ]

cat("\nTraining Size:", nrow(trainData_cls), "\n")

## 
## Training Size: 5149

cat("Testing  Size:", nrow(testData_cls), "\n")

## Testing  Size: 1286

str(testData_cls)

## 'data.frame':    1286 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Weekly_Sales: num  1545419 1391256 1494252 1422712 1449143 ...
##  $ Temperature : num  65.9 64.8 74.8 84.3 85.2 ...
##  $ Fuel_Price  : num  2.77 2.79 2.85 2.65 2.62 ...
##  $ CPI         : num  211 210 210 211 212 ...
##  $ Unemployment: num  7.81 7.81 7.81 7.81 7.79 ...
##  $ Week        : num  15 17 20 26 35 50 2 13 15 31 ...
##  $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Store_WOE   : num  10.9 10.9 10.9 10.8 10.8 ...
##  $ Holiday_WOE : num  -0.01453 -0.01205 -0.00962 -0.00579 -0.00371 ...

# RFE on TRAIN SET for Classification

candidate_features_cls <- c("Store_WOE", "Holiday_WOE", "Temperature", 
                            "CPI", "Unemployment", "Week", "Fuel_Price")

# Prepare X and Y from the training set
train_x_rfe <- trainData_cls[, candidate_features_cls]
train_y_rfe <- trainData_cls$Sales_Class

# RFE control
rfe_ctrl_cls_logreg <- rfeControl(
  functions=lrFuncs,  
  method="cv",
  number=5
)

# RFE
set.seed(2025)
rfe_results_cls_logreg <- rfe(
  x=train_x_rfe,
  y=train_y_rfe,
  sizes=1:length(candidate_features_cls),
  rfeControl=rfe_ctrl_cls_logreg
)

cat("\n=== RFE RESULTS (Logistic - WOE) ===\n")

## 
## === RFE RESULTS (Logistic - WOE) ===

print(rfe_results_cls_logreg)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          1   0.9446 0.8875   0.005617 0.01148         
##          2   0.9441 0.8864   0.008018 0.01621         
##          3   0.9448 0.8880   0.007644 0.01553        *
##          4   0.9431 0.8844   0.007402 0.01497         
##          5   0.9435 0.8852   0.006804 0.01376         
##          6   0.9437 0.8856   0.006926 0.01399         
##          7   0.9439 0.8859   0.007803 0.01576         
## 
## The top 3 variables (out of 3):
##    Store_WOE, Temperature, Week

best_features_cls <- predictors(rfe_results_cls_logreg)
cat("\nChosen subset for classification:\n")

## 
## Chosen subset for classification:

print(best_features_cls)

## [1] "Store_WOE"   "Temperature" "Week"

# CROSS-VALIDATION
cls_formula <- as.formula(
  paste("Sales_Class ~", paste(best_features_cls, collapse=" + "))
)

# 5-fold CV with SMOTE
cv_ctrl_cls <- trainControl(
  method="cv",
  number=5,
  sampling="smote",
  savePredictions=TRUE
)

# FIT LOGISTIC MODEL ON TEST DATA
set.seed(2025)
cls_glm <- train(
  cls_formula,
  data=testData_cls,
  method="glm",
  family=binomial,
  trControl=cv_ctrl_cls
)
cat("\n[LOGISTIC] Results:\n")

## 
## [LOGISTIC] Results:

print(cls_glm)

## Generalized Linear Model 
## 
## 1286 samples
##    3 predictor
##    2 classes: 'Low', 'High' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1029, 1028, 1029, 1029, 1029 
## Addtional sampling using SMOTE
## 
## Resampling results:
## 
##   Accuracy   Kappa  
##   0.9408922  0.88035

cat("\nLogistic Coefficients:\n")

## 
## Logistic Coefficients:

print(summary(cls_glm$finalModel)$coefficients)

##                Estimate  Std. Error   z value     Pr(>|z|)
## (Intercept)  0.80978517 0.476087228  1.700918 8.895845e-02
## Store_WOE    0.72951792 0.068294288 10.681976 1.236125e-26
## Temperature -0.02013515 0.007659153 -2.628900 8.566147e-03
## Week         0.02439882 0.008936037  2.730385 6.326032e-03

str(testData_cls)

## 'data.frame':    1286 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Weekly_Sales: num  1545419 1391256 1494252 1422712 1449143 ...
##  $ Temperature : num  65.9 64.8 74.8 84.3 85.2 ...
##  $ Fuel_Price  : num  2.77 2.79 2.85 2.65 2.62 ...
##  $ CPI         : num  211 210 210 211 212 ...
##  $ Unemployment: num  7.81 7.81 7.81 7.81 7.79 ...
##  $ Week        : num  15 17 20 26 35 50 2 13 15 31 ...
##  $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Store_WOE   : num  10.9 10.9 10.9 10.8 10.8 ...
##  $ Holiday_WOE : num  -0.01453 -0.01205 -0.00962 -0.00579 -0.00371 ...

KNN

# KNN (Use same variables as Logistic Regression)
cls_knn <- train(
  cls_formula,
  data = trainData_cls,
  method = "knn",
  trControl = cv_ctrl_cls,
  tuneLength = 10,
  preProcess = c("center", "scale")
)
print(cls_knn)

## k-Nearest Neighbors 
## 
## 5149 samples
##    3 predictor
##    2 classes: 'Low', 'High' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4119, 4120, 4119, 4120, 4118 
## Addtional sampling using SMOTE prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9405725  0.8795553
##    7  0.9430973  0.8847135
##    9  0.9444566  0.8873687
##   11  0.9434840  0.8855164
##   13  0.9454286  0.8894411
##   15  0.9440686  0.8866666
##   17  0.9434866  0.8855424
##   19  0.9444575  0.8875184
##   21  0.9450395  0.8886844
##   23  0.9467885  0.8922857
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.

Support Vector Machine

# SVM (Radial)
cls_svm <- train(
  cls_formula,
  data = trainData_cls,
  method = "svmRadial",
  trControl = cv_ctrl_cls,
  tuneLength = 5,
  preProcess = c("center", "scale")
)
print(cls_svm)

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 5149 samples
##    3 predictor
##    2 classes: 'Low', 'High' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4119, 4119, 4119, 4120, 4119 
## Addtional sampling using SMOTE prior to pre-processing
## 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.9411549  0.8806528
##   0.50  0.9423195  0.8829626
##   1.00  0.9442620  0.8868530
##   2.00  0.9446498  0.8876714
##   4.00  0.9427079  0.8836876
## 
## Tuning parameter 'sigma' was held constant at a value of 0.556677
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.556677 and C = 2.

Decision Tree Feature Selection

library(caret)
# Candidate features
candidate_features_cls <- c("Store", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Flag")

# Use logistic regression specific RFE functions
rfe_ctrl_cls <- rfeControl(
  functions = treebagFuncs,    
  method    = "cv",
  number    = 5
)

# Define predictor matrix and target vector
train_x_cls <- walmart_df[, candidate_features_cls]
train_y_cls <- walmart_df_cl$Sales_Class

# Run RFE
set.seed(2025)
rfe_results_cls <- rfe(
  x = train_x_cls,
  y = train_y_cls,
  sizes = 1:length(candidate_features_cls),  
  rfeControl = rfe_ctrl_cls
)

cat("\n=== RFE RESULTS (Classification) ===\n")

## 
## === RFE RESULTS (Classification) ===

print(rfe_results_cls)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD  KappaSD Selected
##          1   0.9523 0.9030   0.007699 0.015738         
##          2   0.9520 0.9028   0.005638 0.011337         
##          3   0.9562 0.9113   0.005082 0.010241         
##          4   0.9605 0.9201   0.004540 0.009173         
##          5   0.9635 0.9261   0.002627 0.005308         
##          6   0.9627 0.9245   0.006380 0.012862         
##          7   0.9658 0.9307   0.004133 0.008356        *
## 
## The top 5 variables (out of 7):
##    Store, Week, CPI, Temperature, Unemployment

Train Test Split

# Train test
set.seed(2025)
walmart_df_cl1 <- dplyr::select(walmart_df_cl, -Fuel_Price, -Holiday_Flag, -Weekly_Sales)
trainIndex <- createDataPartition(walmart_df_cl1$Sales_Class, p = 0.8, list = FALSE)
trainData_cl <- walmart_df_cl1[trainIndex, ]
testData_cl <- walmart_df_cl1[-trainIndex, ]

cat("\nTraining Data Size:", nrow(trainData_cl), "\n")

## 
## Training Data Size: 5149

cat("Testing  Data Size:", nrow(testData_cl), "\n")

## Testing  Data Size: 1286

# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5, sampling ='smote')

# Setup Cross-Validation with SMOTE
cv_ctrl_cls <- trainControl(
  method        = "cv", 
  number        = 5,
  sampling      = "smote",  
  savePredictions = TRUE
)
str(walmart_df)

## 'data.frame':    6435 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-12" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Date_numeric: num  14645 14652 14659 14666 14673 ...

# Create a formula for classification
cls_formula <- as.formula("Sales_Class ~ Store + Week + CPI + Temperature + Unemployment")

Decision Tree

# (5) Decision Tree
cls_dt_grid <- expand.grid(cp = seq(0.001, 0.01, by = 0.002))
cls_dt <- train(
  cls_formula,
  data = trainData_cl,
  method = "rpart",
  trControl = cv_ctrl_cls,
  tuneGrid = cls_dt_grid
)
print(cls_dt)

## CART 
## 
## 5149 samples
##    5 predictor
##    2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4120, 4119, 4119, 4119, 4119 
## Addtional sampling using SMOTE
## 
## Resampling results across tuning parameters:
## 
##   cp     Accuracy   Kappa    
##   0.001  0.9463970  0.8911338
##   0.003  0.9296929  0.8562443
##   0.005  0.9300816  0.8571123
##   0.007  0.9300816  0.8571123
##   0.009  0.9256141  0.8475072
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001.

Visualise Classification Decision Tree

# Visualizing Decision Tree
rpart.plot(cls_dt$finalModel, main="Final Decision Tree (Classification)", cex = 0.5, cex.main = 1.5)

## Warning: labs do not fit even at cex 0.15, there may be some overplotting

Random Forest Feature Selection

# Candidate features
walmart_df_cl2 <- walmart_df_cl
candidate_features_cl2 <- c("Store", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Flag")

# Use logistic regression specific RFE functions
rfe_ctrl_cls <- rfeControl(
  functions = rfFuncs,    
  method    = "cv",
  number    = 5
)

# Define predictor matrix and target vector
train_x_cl2 <- walmart_df_cl2[, candidate_features_cl2]
train_x_cl2 <- train_x_cl2 %>% mutate_if(is.factor, as.numeric)

train_y_cl2 <- walmart_df_cl2$Sales_Class

# Run RFE
set.seed(2025)
rfe_results_cls <- rfe(
  x = train_x_cl2,
  y = train_y_cl2,
  sizes = 1:length(candidate_features_cl2),  
  rfeControl = rfe_ctrl_cls
)

cat("\n=== RFE RESULTS (Classification) ===\n")

## 
## === RFE RESULTS (Classification) ===

print(rfe_results_cls)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD  KappaSD Selected
##          1   0.9523 0.9030   0.007699 0.015738         
##          2   0.9579 0.9144   0.007662 0.015612         
##          3   0.9613 0.9213   0.007075 0.014422         
##          4   0.9587 0.9162   0.005737 0.011640         
##          5   0.9622 0.9235   0.004232 0.008579        *
##          6   0.9616 0.9223   0.004440 0.008934         
##          7   0.9616 0.9223   0.005194 0.010407         
## 
## The top 5 variables (out of 5):
##    Store, CPI, Unemployment, Fuel_Price, Temperature

### Train test Split`
set.seed(2025)
walmart_df_cl <- dplyr::select(walmart_df_cl2, -Weekly_Sales, -Week, -Holiday_Flag)
trainIndex <- createDataPartition(walmart_df_cl2$Sales_Class, p = 0.8, list = FALSE)
trainData_cls2 <- walmart_df_cl2[trainIndex, ]
testData_cls2 <- walmart_df_cl2[-trainIndex, ]

cat("\nTraining Data Size:", nrow(trainData_cls2), "\n")

## 
## Training Data Size: 5149

cat("Testing  Data Size:", nrow(testData_cls2), "\n")

## Testing  Data Size: 1286

# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5, sampling ='smote')

# Setup Cross-Validation with SMOTE
cv_ctrl_cls <- trainControl(
  method        = "cv", 
  number        = 5,
  sampling      = "smote",  
  savePredictions = TRUE
)
str(walmart_df)

## 'data.frame':    6435 obs. of  10 variables:
##  $ Store       : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Date        : Date, format: "2010-02-05" "2010-02-12" ...
##  $ Weekly_Sales: num  1643691 1641957 1611968 1409728 1554807 ...
##  $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
##  $ Temperature : num  42.3 38.5 39.9 46.6 46.5 ...
##  $ Fuel_Price  : num  2.57 2.55 2.51 2.56 2.62 ...
##  $ CPI         : num  211 211 211 211 211 ...
##  $ Unemployment: num  8.11 8.11 8.11 8.11 8.11 ...
##  $ Week        : num  6 7 8 9 10 11 12 13 14 15 ...
##  $ Date_numeric: num  14645 14652 14659 14666 14673 ...

# Create a formula for classification
cls_formula <- as.formula("Sales_Class ~ Store + CPI + Unemployment + Fuel_Price + Temperature")

Random Forest

# (3) Random Forest
cls_rf <- train(
  cls_formula,
  data = trainData_cls2,
  method = "rf",
  trControl = cv_ctrl_cls,
  tuneGrid = expand.grid(mtry = c(2, 3, 4)),
  ntree = 500
)
print(cls_rf)

## Random Forest 
## 
## 5149 samples
##    5 predictor
##    2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4120, 4119, 4119, 4119, 4119 
## Addtional sampling using SMOTE
## 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9263913  0.8490232
##   3     0.9256141  0.8472439
##   4     0.9283333  0.8530607
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.

cat("\nRandom Forest Variable Importance:\n")

## 
## Random Forest Variable Importance:

print(varImp(cls_rf))

## rf variable importance
## 
##   only 20 most important variables shown (out of 48)
## 
##              Overall
## Unemployment  100.00
## Store13        83.67
## Store6         79.46
## Store20        75.39
## Store27        74.79
## Store2         74.34
## Store24        74.05
## Store4         73.61
## Store31        70.75
## Store14        66.16
## Store39        65.60
## Store19        65.16
## Store10        64.57
## Store11        64.55
## Store23        64.51
## Store41        61.32
## Store32        60.61
## Store28        60.10
## CPI            56.11
## Store3         25.14

# EVALUATE ON TEST DATA

models_class <- list(
  Logistic=cls_glm,
  KNN=cls_knn,
  SVM=cls_svm,
  DecisionTree=cls_dt,
  RandomForest=cls_rf
)

final_cls_results <- data.frame(
  Model=character(),
  Accuracy=numeric(),
  Sensitivity=numeric(),
  Specificity=numeric(),
  Precision=numeric(),
  stringsAsFactors=FALSE
)

for(model_name in names(models_class)) {
  model_obj <- models_class[[model_name]]
  preds <- predict(model_obj, newdata=testData_cls)
  cm    <- confusionMatrix(preds, testData_cls$Sales_Class)
  
  final_cls_results <- rbind(
    final_cls_results,
    data.frame(
      Model       = model_name,
      Accuracy    = cm$overall["Accuracy"],
      Sensitivity = cm$byClass["Sensitivity"],
      Specificity = cm$byClass["Specificity"],
      Precision   = cm$byClass["Precision"]
    )
  )
}

## Warning in confusionMatrix.default(preds, testData_cls$Sales_Class): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(preds, testData_cls$Sales_Class): Levels are
## not in the same order for reference and data. Refactoring data to match.

cat("\n=== FINAL CLASSIFICATION RESULTS (TEST) ===\n")

## 
## === FINAL CLASSIFICATION RESULTS (TEST) ===

print(final_cls_results, row.names=FALSE)

##         Model  Accuracy Sensitivity Specificity Precision
##      Logistic 0.9424572   0.9507736   0.9321739 0.9454545
##           KNN 0.9401244   0.9493671   0.9286957 0.9427374
##           SVM 0.9455677   0.9690577   0.9165217 0.9348711
##  DecisionTree 0.9611198   0.9845288   0.9321739 0.9472260
##  RandomForest 0.9307932   0.9943741   0.8521739 0.8926768

3) UNSUPERVISED LEARNING: ONLINE RETAIL

## Load & Basic Checks + Transaction by Country
online_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Online Retail.csv")

Check Total Missing and Duplicates

cat("Dimension:", dim(online_df), "\n")

## Dimension: 541909 8

cat("Total Missing:", sum(is.na(online_df)), "\n")

## Total Missing: 135080

cat("Duplicates:", sum(duplicated(online_df)), "\n")

## Duplicates: 5268

Remove negative/zero quantity or unit price

online_df <- online_df[online_df$Quantity>0 & online_df$UnitPrice>0, ]
online_df <- na.omit(online_df)

cat("Final dimension after cleaning:", dim(online_df), "\n")

## Final dimension after cleaning: 397884 8

cat("Remaining Missing:", sum(is.na(online_df)), "\n")

## Remaining Missing: 0

Transaction Count by Country

transaction_data <- online_df %>%
  count(Country, name = "TransactionCount") %>%
  arrange(desc(TransactionCount))

head(transaction_data, 10)  # top 10 countries

p <- ggplot(transaction_data, aes(x=reorder(Country, -TransactionCount), y=TransactionCount)) +
  geom_bar(stat='identity', fill='steelblue') +
  theme_minimal() +
  labs(x='', y='Transaction Count', title='Transaction Count by Country') +
  scale_y_continuous(
    breaks = seq(50000, 400000, by=50000),
    labels = function(x) paste0(x/1000, "K")
  ) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 40, vjust = 0.7))
ggplotly(p)

# Revenue calculations
online_retail <- online_df %>%
  mutate(Revenue = Quantity * UnitPrice,
         InvoiceDate = dmy_hm(InvoiceDate))

monthly_revenue <- online_retail %>%
  mutate(YearMonth = floor_date(InvoiceDate, "month")) %>%
  group_by(YearMonth) %>%
  summarise(Monthly_Revenue = sum(Revenue))

head(monthly_revenue)

d_revenue <- online_retail %>%
  mutate(YearMonth = floor_date(InvoiceDate, "day")) %>%
  group_by(YearMonth) %>%
  summarise(Monthly_Revenue = sum(Revenue))

weekday_revenue <- online_retail %>%
  mutate(Weekday = wday(InvoiceDate)) %>%
  group_by(Weekday) %>%
  summarise(Daily_Revenue = sum(Revenue))

# Plot daily
p2 <- ggplot(d_revenue, aes(x=YearMonth, y=Monthly_Revenue)) +
  geom_line(color='lightblue') +
  labs(title='Revenue by Date', x='Date', y='Total Sales') +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(p2)

# We rename columns for clarity
# or just do it after creation
data0 <- online_df
data1 <- data0 %>%
  filter(!is.na(CustomerID), !is.na(Quantity), Quantity>0) %>%
  mutate(InvoiceDate = dmy_hm(InvoiceDate),
         Revenue = Quantity*UnitPrice)

snapshot_date <- as.Date("2011-12-09")

rfm_data <- data1 %>%
  group_by(CustomerID) %>%
  summarise(
    Recency   = round(as.numeric(difftime(snapshot_date, max(InvoiceDate), units="days")),2),
    Frequency = n_distinct(InvoiceNo),
    Monetary  = sum(Revenue)
  ) %>%
  ungroup()

# Rename columns to explicitly say RFM
colnames(rfm_data) <- c("CustomerID","Recency","Frequency","Monetary")
print(rfm_data)

## # A tibble: 4,338 × 4
##    CustomerID Recency Frequency Monetary
##         <int>   <dbl>     <int>    <dbl>
##  1      12346  325.           1   77184.
##  2      12347    1.34         7    4310 
##  3      12348   74.4          4    1797.
##  4      12349   17.6          1    1758.
##  5      12350  309.           1     334.
##  6      12352   35.4          8    2506.
##  7      12353  203.           1      89 
##  8      12354  231.           1    1079.
##  9      12355  213.           1     459.
## 10      12356   21.6          3    2811.
## # ℹ 4,328 more rows

# Boxplots combined into one figure
p_recency <- ggplot(rfm_data, aes(x=factor(1), y=Recency)) +
  geom_boxplot(fill="#5874DC") +
  coord_cartesian(ylim=c(0,300)) +
  labs(title="Recency", y="Recency", x="") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

p_frequency <- ggplot(rfm_data, aes(x=factor(1), y=Frequency)) +
  geom_boxplot(fill="#E06C78") +
  coord_cartesian(ylim=c(0,20)) +
  labs(title="Frequency", y="Frequency", x="") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

p_monetary <- ggplot(rfm_data, aes(x=factor(1), y=Monetary)) +
  geom_boxplot(fill="#6AAB9C") +
  coord_cartesian(ylim=c(0,3000)) +
  labs(title="Monetary", y="Monetary", x="") +
  theme_minimal() +
  theme(plot.title=element_text(hjust=0.5),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

p_combined <- p_recency + p_frequency + p_monetary + 
  plot_layout(guides = 'collect') & 
  theme(legend.position = 'top', legend.justification="right")

print(p_combined)

cat("Mean Recency:", mean(rfm_data$Recency, na.rm=TRUE), "\n")

## Mean Recency: 91.51462

cat("Mean Frequency:", mean(rfm_data$Frequency, na.rm=TRUE), "\n")

## Mean Frequency: 4.272015

cat("Mean Monetary:", mean(rfm_data$Monetary, na.rm=TRUE), "\n")

## Mean Monetary: 2054.266

cat("Median Recency:", median(rfm_data$Recency, na.rm=TRUE), "\n")

## Median Recency: 49.555

cat("Median Frequency:", median(rfm_data$Frequency, na.rm=TRUE), "\n")

## Median Frequency: 2

cat("Median Monetary:", median(rfm_data$Monetary, na.rm=TRUE), "\n")

## Median Monetary: 674.485

K-Means Clustering

rfm_data$Recency   <- as.numeric(rfm_data$Recency)
rfm_data$Frequency <- as.numeric(rfm_data$Frequency)
rfm_data$Monetary  <- as.numeric(rfm_data$Monetary)

rfm_scaled <- scale(rfm_data[, c("Recency","Frequency","Monetary")])

# PCA
pca_res <- prcomp(rfm_scaled, scale.=TRUE)
pca_summary <- summary(pca_res)

cat("Proportion of Variance (PC1,PC2):\n")

## Proportion of Variance (PC1,PC2):

print(pca_summary$importance[2,1:2])

##     PC1     PC2 
## 0.55507 0.30245

num_pcs <- which(cumsum(pca_summary$importance[2,])>=0.8)[1]
cat("Number of PCs used:", num_pcs, "\n")

## Number of PCs used: 2

# K-Means (k=4 as example)
set.seed(2025)
kmeans_result <- kmeans(pca_res$x[,1:2], centers=4)

# Elbow & Silhouette
fviz_nbclust(rfm_scaled, kmeans, method="wss")

fviz_nbclust(rfm_scaled, kmeans, method="silhouette")

# Visualization
cluster_data <- data.frame(pca_res$x[,1:2], cluster=factor(kmeans_result$cluster))

ggplot(cluster_data, aes(x=PC1, y=PC2, color=cluster)) +
  geom_point(size=3, alpha=0.6) +
  labs(title="K-Means Clustering on PCA (k=4)", color="Cluster") +
  theme_minimal()

# Summaries
rfm_data$Cluster <- kmeans_result$cluster
kmeans_summary <- rfm_data %>%
  group_by(Cluster) %>%
  summarise(
    Size = n(),
    Avg_Recency   = mean(Recency),
    Avg_Frequency = mean(Frequency),
    Avg_Monetary  = mean(Monetary)
  )
print(kmeans_summary)

## # A tibble: 4 × 5
##   Cluster  Size Avg_Recency Avg_Frequency Avg_Monetary
##     <int> <int>       <dbl>         <dbl>        <dbl>
## 1       1   171       14.6          23.7        14871.
## 2       2    12        6.50         86.8       130361.
## 3       3  3086       42.3           3.82        1391.
## 4       4  1069      247.            1.54         479.

create a distance matrix from the raw RFM

dist_matrix <- dist(rfm_data[, c("Recency","Frequency","Monetary")])

hc_ward     <- hclust(dist_matrix, method="ward.D2")
hc_complete <- hclust(dist_matrix, method="complete")
hc_single   <- hclust(dist_matrix, method="single")
hc_average  <- hclust(dist_matrix, method="average")

clusters_ward     <- cutree(hc_ward, k=4)
clusters_complete <- cutree(hc_complete, k=4)
clusters_single   <- cutree(hc_single, k=4)
clusters_average  <- cutree(hc_average, k=4)

sil_ward     <- silhouette(clusters_ward, dist_matrix)
sil_complete <- silhouette(clusters_complete, dist_matrix)
sil_single   <- silhouette(clusters_single, dist_matrix)
sil_average  <- silhouette(clusters_average, dist_matrix)

cat("\nAverage Silhouette Scores by method:\n")

## 
## Average Silhouette Scores by method:

cat("Ward's:", mean(sil_ward[,3]), "\n")

## Ward's: 0.9543151

cat("Complete:", mean(sil_complete[,3]), "\n")

## Complete: 0.9671201

cat("Single:", mean(sil_single[,3]), "\n")

## Single: 0.9800182

cat("Average:", mean(sil_average[,3]), "\n")

## Average: 0.963419

Visualize the Hierarchical Clustering

dist_euclidean <- dist(rfm_data[,c("Recency","Frequency","Monetary")], method="euclidean")
dist_manhattan <- dist(rfm_data[,c("Recency","Frequency","Monetary")], method="manhattan")

hc_euclidean <- hclust(dist_euclidean, method="ward.D2")
hc_manhattan <- hclust(dist_manhattan, method="ward.D2")

clusters_euclidean <- cutree(hc_euclidean, k=4)
clusters_manhattan <- cutree(hc_manhattan, k=4)

sil_euclidean <- silhouette(clusters_euclidean, dist_euclidean)
sil_manhattan <- silhouette(clusters_manhattan, dist_manhattan)

cat("Silhouette Score (Euclidean):", mean(sil_euclidean[,3]), "\n")

## Silhouette Score (Euclidean): 0.9543151

cat("Silhouette Score (Manhattan):", mean(sil_manhattan[,3]), "\n")

## Silhouette Score (Manhattan): 0.891857

# Plot final dendrogram using scaled data + ward
rfm_scaled <- scale(rfm_data[, c('Recency', 'Frequency', 'Monetary')])

hc <- hclust(dist(rfm_scaled, method = "euclidian"), method="ward.D2")
dend <- as.dendrogram(hc)
dend <- color_branches(dend, k=4)
dend <- hang.dendrogram(dend, hang=-1)
labels(dend) <- NULL

## Warning in `labels<-.dendrogram`(`*tmp*`, value = NULL): The lengths of the new
## labels is shorter than the number of leaves in the dendrogram - labels are
## recycled.

## Warning in rep(new_labels, length.out = leaves_length): 'x' is NULL so the
## result will be NULL

plot(dend, main="Dendrogram using Euclidian Distance & Ward Linkage, k=4")

clusters <- cutree(hc, k=4)
rfm_data$Cluster <- clusters

cluster_summary <- rfm_data %>%
  group_by(Cluster) %>%
  summarise(
    Size = n(),
    Average_Recency   = mean(Recency),
    Average_Frequency = mean(Frequency),
    Average_Monetary  = mean(Monetary)
  ) %>%
  arrange(desc(Average_Monetary))

print(cluster_summary)

## # A tibble: 4 × 5
##   Cluster  Size Average_Recency Average_Frequency Average_Monetary
##     <int> <int>           <dbl>             <dbl>            <dbl>
## 1       4    15            5.48             83.5           111916.
## 2       1   146           11.1              24.4            14514.
## 3       2  2909           36.8               4.03            1535.
## 4       3  1268          227.                1.58             511.

CONCLUSION & REFERENCES

All requested code has been consolidated into one script:

Walmart data used for Regression (5 models) + Classification (5 models), each with cross-validation, hyperparameter tuning, final performance tables.
Online Retail data used for unsupervised RFM analysis, K-Means, and Hierarchical Clustering.
Combined boxplots for RFM, transaction count bar chart, and hierarchical linkage comparisons.

References

Kuhn, M. (2008). Building Predictive Models in R Using the caret Package. Journal of Statistical Software, 28(5).
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning: With Applications in R. Springer.
Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical Learning: Data Mining, Inference, and Prediction. Springer.

ST3189 COURSEWORK PROJECT

SANTASILA BRYAN KUSNO

16/3/2025

Introduction

Data Wrangling and Cleaning for Supervised Learning (More on the Python Script -> ‘3189 Coursework Store Sales.ipynb’)

SUPERVISED LEARNING: Walmart Sales**

Data Loading & Initial Checks

Loading the Walmart Dataset

Structure and missing data check

Feature Engineering

Exploratory Data Analysis

Correlation & Basic Summaries

Boxplots & Distribution

1) Single Boxplot: Weekly Sales

2) Multiple Boxplot for Distribution Across Stores

3) Holiday vs Non-Holiday

Average Weekly Sales by Date (Line Plot)

REGRESSION ANALYSIS

Feature Selection for Linear, Ridge and Lasso Regression (Exhaustive Feature Selection)

Linear Regression Diagnostic Plot

Linear Regression Train Test Split

1) Linear Regression Evaluation

Linear Regression Coefficient

2.1) Ridge Regression

2.2) Lasso Regression

Feature Selection for Decision Tree (Exhaustive Feature Selection)

Spliting Train test

3) Decision Tree Evaluation

Decision Tree Visualization

Feature Selection for Random Forest Regression (Exhaustive Feature Selection)

Train Test Split

Random Forest Model Evaluation

Compare 5 Regression Models

Structure and missing data check

Feature Engineering

Create Binary Variables (High vs Low Sales)

1) Logistic Regression Feature Selection

Train Test Split

KNN

Support Vector Machine

Decision Tree Feature Selection

Train Test Split

Decision Tree

Visualise Classification Decision Tree

Random Forest Feature Selection

Random Forest

3) UNSUPERVISED LEARNING: ONLINE RETAIL

Check Total Missing and Duplicates

Remove negative/zero quantity or unit price

Transaction Count by Country

K-Means Clustering

create a distance matrix from the raw RFM

Visualize the Hierarchical Clustering

CONCLUSION & REFERENCES

References

THANK YOU