# Load all required libraries (tidyverse already attaches dplyr and ggplot2)
library(tidyverse)
library(dplyr)
library(lubridate)
library(caret)
library(e1071)
library(randomForest)
library(glmnet)
library(rpart)
library(rpart.plot)
library(class)
library(ggcorrplot)
library(ggplot2)
library(reshape2)
library(scales)
library(plotly)
library(factoextra)
library(ggfortify)
library(cluster)
library(Metrics)
library(klaR)
library(dendextend)
library(patchwork)
library(smotefamily)
# Load Walmart dataset
walmart_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Walmart.csv")
# Structure and missing data check
str(walmart_df)
## 'data.frame': 6435 obs. of 8 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: int 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
cat("Total missing:", sum(is.na(walmart_df)), "\n")
## Total missing: 0
cat("Duplicates:", sum(duplicated(walmart_df)), "\n")
## Duplicates: 0
# Convert date
walmart_df$Date <- as.Date(walmart_df$Date, format="%d-%m-%Y")
# Create 'Week' from Date for modeling
walmart_df$Week <- lubridate::week(walmart_df$Date)
# Create numeric date for correlation
walmart_df$Date_numeric <- as.numeric(walmart_df$Date)
# Convert 'Holiday_Flag' to factor
walmart_df$Holiday_Flag <- as.factor(walmart_df$Holiday_Flag)
# Convert 'Store' to factor
walmart_df$Store <- as.factor(walmart_df$Store)
str(walmart_df)
## 'data.frame': 6435 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Date_numeric: num 14645 14652 14659 14666 14673 ...
cat("Total missing:", sum(is.na(walmart_df)), "\n")
## Total missing: 0
cat("Duplicates:", sum(duplicated(walmart_df)), "\n")
## Duplicates: 0
library(dplyr)
walmart_df_reg1 <- walmart_df
# Target Encoding Stores
walmart_df_reg1 <- walmart_df_reg1 %>%
arrange(Store, Week) %>%
group_by(Store) %>%
mutate(
Stores = cummean(lag(Weekly_Sales))
) %>%
ungroup()
walmart_df_reg1$Stores[is.na(walmart_df_reg1$Stores)] <- mean(walmart_df$Weekly_Sales, na.rm = TRUE)
# Target Encoding Holiday_Sales
walmart_df_reg1 <- walmart_df_reg1 %>%
arrange(Holiday_Flag, Week) %>%
group_by(Holiday_Flag) %>%
mutate(
Holiday_Sales = cummean(lag(Weekly_Sales))
) %>%
ungroup()
walmart_df_reg1$Holiday_Sales[is.na(walmart_df_reg1$Holiday_Sales)] <- mean(walmart_df$Weekly_Sales, na.rm = TRUE)
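# NOTE: cummean(lag(Weekly_Sales)) carries each group's leading NA through the
# whole column (cummean is cumsum-based), so after the NA fill both Stores and
# Holiday_Sales collapse to the global mean -- hence the constant values in the
# str() output below and the "zero variances: Stores" warnings during training.
# A sketch of the intended leave-one-out expanding mean, kept in a separate demo
# object so the recorded results in this script are unchanged (the first row per
# store stays NA and would be filled with the global mean as above):
encoding_demo <- walmart_df_reg1 %>%
  arrange(Store, Week) %>%
  group_by(Store) %>%
  mutate(Stores = lag(cummean(Weekly_Sales))) %>%  # mean of strictly earlier rows
  ungroup()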
str(walmart_df_reg1)
## tibble [6,435 × 12] (S3: tbl_df/tbl/data.frame)
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Date : Date[1:6435], format: "2011-01-07" "2012-01-06" ...
## $ Weekly_Sales : num [1:6435] 1444732 1550370 1758051 1799520 378241 ...
## $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Temperature : num [1:6435] 48.3 49 44.7 46.8 53.4 ...
## $ Fuel_Price : num [1:6435] 2.98 3.16 2.98 3.16 2.98 ...
## $ CPI : num [1:6435] 211 220 211 219 215 ...
## $ Unemployment : num [1:6435] 7.74 7.35 8.03 7.06 7.55 ...
## $ Week : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
## $ Date_numeric : num [1:6435] 14981 15345 14981 15345 14981 ...
## $ Stores : num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
## $ Holiday_Sales: num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
library("corrplot")
numeric_columns <- walmart_df_reg1[c("Weekly_Sales", "Temperature", "Fuel_Price", "CPI", "Unemployment", "Date_numeric", "Week", "Stores")]
cor_matrix <- cor(numeric_columns, use = "complete.obs")
## Warning in cor(numeric_columns, use = "complete.obs"): the standard deviation
## is zero
ggcorrplot(
cor_matrix, type = "lower", lab = TRUE, lab_size = 3, tl.srt = 45,
title = "Correlation plot for Walmart Dataset", ggtheme=theme_minimal()
)
#### Correlation bar chart for feature importance
library(DescTools)
library(psych)
# Exclude Weekly_Sales
all_vars <- setdiff(names(walmart_df), "Weekly_Sales")
corr_df <- data.frame(Variable=character(), Correlation=numeric(), Type=character(), stringsAsFactors=FALSE)
for (var in all_vars) {
if (is.numeric(walmart_df[[var]])) {
corr_val <- cor(walmart_df[[var]], walmart_df$Weekly_Sales, use="complete.obs")
corr_df <- rbind(corr_df, data.frame(Variable=var, Correlation=corr_val, Type="Numeric"))
} else if (is.factor(walmart_df[[var]]) || is.character(walmart_df[[var]])) {
model <- aov(Weekly_Sales ~ as.factor(walmart_df[[var]]), data=walmart_df)
eta_sq <- EtaSq(model, type=1)[1, "eta.sq"]
corr_df <- rbind(corr_df, data.frame(Variable=var, Correlation=eta_sq, Type="Categorical"))
}
}
ggplot(corr_df, aes(x=reorder(Variable, abs(Correlation)), y=abs(Correlation), fill=Type)) +
geom_bar(stat="identity") +
coord_flip() +
labs(title="Correlation Strength to Weekly_Sales",
x="Variable", y="Correlation / Effect Size") +
theme_minimal()
ggplot(walmart_df, aes(y = Weekly_Sales)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
theme_minimal() +
coord_flip() +
labs(title = "Distribution of Weekly Sales", y = "Weekly Sales")
median_sales <- median(walmart_df$Weekly_Sales)
cat("Median Weekly Sales:", median_sales, "\n")
## Median Weekly Sales: 960746
q1_sales <- quantile(walmart_df$Weekly_Sales, 0.25)
cat("Q1 Weekly Sales:", q1_sales, "\n")
## Q1 Weekly Sales: 553350.1
q3_sales <- quantile(walmart_df$Weekly_Sales, 0.75)
cat("Q3 Weekly Sales:", q3_sales, "\n")
## Q3 Weekly Sales: 1420159
mean_sales <- walmart_df %>%
group_by(Store) %>%
summarise(Mean_Weekly_Sales = mean(Weekly_Sales)) %>%
arrange(Mean_Weekly_Sales)
walmart_df <- left_join(walmart_df, mean_sales, by = "Store")
walmart_df$Store <- factor(walmart_df$Store, levels = mean_sales$Store)
ggplot(walmart_df, aes(x = Store, y = Weekly_Sales)) +
geom_boxplot(aes(fill = Mean_Weekly_Sales)) +
scale_fill_gradient(low = "#ff00d4", high = "#00ddff") +
coord_flip() +
theme_minimal() +
labs(title = "Mean of Weekly Sales Across Different Stores",
x = "Store Number", y = "Weekly Sales") +
scale_y_continuous(labels = comma) +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
walmart_df$Holiday_Flag <- factor(walmart_df$Holiday_Flag,
levels = c(0, 1),
labels = c("Non-Holidays", "Holidays"))
ggplot(walmart_df, aes(x = Holiday_Flag, y = Weekly_Sales, fill = Holiday_Flag)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
theme_minimal() +
labs(
title = "Comparison of Weekly Sales (Holiday vs. Non-Holiday)",
x = "", y = "Weekly Sales"
) +
scale_fill_manual(values = c("#00a1ff", "#00ff8f")) +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
holiday_summary <- walmart_df %>%
group_by(Holiday_Flag) %>%
summarise(Median_Sales = median(Weekly_Sales, na.rm = TRUE))
cat("Holiday vs Non-Holiday Median Sales:\n")
## Holiday vs Non-Holiday Median Sales:
print(holiday_summary)
## # A tibble: 2 × 2
## Holiday_Flag Median_Sales
## <fct> <dbl>
## 1 Non-Holidays 956211.
## 2 Holidays 1018538.
avg_sales_by_date <- walmart_df %>%
group_by(Date) %>%
summarise(Weekly_Sales=mean(Weekly_Sales))
avg_sales_by_date$Color <- ifelse(avg_sales_by_date$Weekly_Sales>1200000,"green","red")
ggplot(avg_sales_by_date, aes(x=Date, y=Weekly_Sales)) +
geom_line(color="black") +
geom_point(aes(color=Color), size=2) +
scale_x_date(date_breaks="3 months", date_labels="%d-%m-%Y") +
scale_y_continuous(labels=comma) +
scale_color_identity() +
labs(title="Average Weekly Sales by Date",
x="Date", y="Average Weekly Sales") +
theme_minimal() +
theme(axis.text.x=element_text(angle=60, hjust=1),
plot.title=element_text(hjust=0.5),
legend.position="none")
summary(avg_sales_by_date$Weekly_Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 879997 997346 1027642 1046965 1062045 1798476
library(caret)
library(dplyr)
# Feature Selection
rfe_columns <- c("Stores", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Sales")
train_x_reg1 <- walmart_df_reg1[, rfe_columns]
train_y_reg1 <- walmart_df_reg1$Weekly_Sales
# Ensure numeric columns
train_x_reg1 <- train_x_reg1 %>%
mutate(across(everything(), ~ as.numeric(as.character(.))))
# Feature Selection for Linear Regression
rfe_ctrl_reg <- rfeControl(functions = lmFuncs, method = "cv", number = 5)
# Find best features
set.seed(2025)
rfe_results_reg <- rfe(
x = train_x_reg1,
y = train_y_reg1,
sizes = 1:ncol(train_x_reg1),
rfeControl = rfe_ctrl_reg
)
cat("\n=== RFE RESULTS (Regression - Target Encoding) ===\n")
##
## === RFE RESULTS (Regression - Target Encoding) ===
print(rfe_results_reg)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 561058 0.01131 468405 10956 0.001519 5653
## 2 561194 0.01084 468478 10993 0.001509 5624
## 3 559714 0.01631 467748 10688 0.004151 5654
## 4 557739 0.02362 467609 10303 0.008967 5497
## 5 555735 0.03072 467280 10844 0.008195 6129 *
## 6 555735 0.03072 467280 10844 0.008195 6129
## 7 555735 0.03072 467280 10844 0.008195 6129
##
## The top 5 variables (out of 5):
## Unemployment, Fuel_Price, Week, Temperature, CPI
walmart_df_reg1$Week <- week(walmart_df_reg1$Date)
lm_model <- lm(Weekly_Sales ~ Fuel_Price + Unemployment + Week + Stores + Temperature + CPI,
data = walmart_df_reg1)
par(mfrow = c(2, 2))
plot(lm_model)
summary_model <- summary(lm_model)
rmse_lm <- sqrt(mean(summary_model$residuals^2))
# Check Data Structure
str(walmart_df_reg1)
## tibble [6,435 × 12] (S3: tbl_df/tbl/data.frame)
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Date : Date[1:6435], format: "2011-01-07" "2012-01-06" ...
## $ Weekly_Sales : num [1:6435] 1444732 1550370 1758051 1799520 378241 ...
## $ Holiday_Flag : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Temperature : num [1:6435] 48.3 49 44.7 46.8 53.4 ...
## $ Fuel_Price : num [1:6435] 2.98 3.16 2.98 3.16 2.98 ...
## $ CPI : num [1:6435] 211 220 211 219 215 ...
## $ Unemployment : num [1:6435] 7.74 7.35 8.03 7.06 7.55 ...
## $ Week : num [1:6435] 1 1 1 1 1 1 1 1 1 1 ...
## $ Date_numeric : num [1:6435] 14981 15345 14981 15345 14981 ...
## $ Stores : num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
## $ Holiday_Sales: num [1:6435] 1046965 1046965 1046965 1046965 1046965 ...
set.seed(2025)
walmart_df_reg1 <- dplyr::select(walmart_df_reg1, -Date_numeric, -Holiday_Flag, -Store,-Holiday_Sales, -Date)
trainIndex1 <- createDataPartition(walmart_df_reg1$Weekly_Sales, p = 0.8, list = FALSE)
trainData1 <- walmart_df_reg1[trainIndex1, ]
testData1 <- walmart_df_reg1[-trainIndex1, ]
cat("\nTraining Data Size:", nrow(trainData1), "\n")
##
## Training Data Size: 5151
cat("Testing Data Size:", nrow(testData1), "\n")
## Testing Data Size: 1284
# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)
# Linear Regression (method="lm")
lm_cv <- train(
Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week + Temperature + CPI,
data = trainData1,
method = "lm",
trControl = cv_ctrl_reg
)
predictions <- predict(lm_cv, newdata = testData1)
evaluation_metrics <- postResample(predictions, testData1$Weekly_Sales)
print(evaluation_metrics)
## RMSE Rsquared MAE
## 5.679807e+05 3.500834e-02 4.732756e+05
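# For reference, postResample() computes RMSE = sqrt(mean((pred - obs)^2)),
# MAE = mean(abs(pred - obs)), and Rsquared as the squared Pearson correlation
# cor(pred, obs)^2 (not 1 - SSE/SST). A quick manual check using the objects above:
rmse_manual <- sqrt(mean((predictions - testData1$Weekly_Sales)^2))
r2_manual <- cor(predictions, testData1$Weekly_Sales)^2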
cat("\nCoefficients of final linear model:\n")
##
## Coefficients of final linear model:
print(summary(lm_cv$finalModel)$coefficients)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1577489.941 90597.7840 17.4120146 5.063833e-66
## Fuel_Price 2443.011 17538.4724 0.1392944 8.892229e-01
## Unemployment -39107.274 4430.9319 -8.8259705 1.463885e-18
## Week 3597.800 563.4975 6.3847661 1.867449e-10
## Temperature -1403.960 452.7820 -3.1007424 1.940802e-03
## CPI -1372.998 216.9979 -6.3272446 2.707189e-10
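# Note: Stores is absent from the coefficient table above because it is constant
# in this run (see the encoding note earlier), so lm() aliases it to NA and
# summary() drops the row.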
# Ridge Regression (Use same variables as LR)
ridge_cv <- train(
Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week + Temperature + CPI,
data = trainData1,
method = "glmnet",
trControl = cv_ctrl_reg,
tuneGrid = expand.grid(alpha = 0, lambda = seq(0.001, 0.1, by = 0.01)),
preProcess = c("center", "scale")
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
cat("\n2) Ridge Regression:\n")
##
## 2) Ridge Regression:
print(ridge_cv)
## glmnet
##
## 5151 samples
## 6 predictor
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4122, 4121, 4121, 4120, 4120
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.001 553011.4 0.02864283 465624.9
## 0.011 553011.4 0.02864283 465624.9
## 0.021 553011.4 0.02864283 465624.9
## 0.031 553011.4 0.02864283 465624.9
## 0.041 553011.4 0.02864283 465624.9
## 0.051 553011.4 0.02864283 465624.9
## 0.061 553011.4 0.02864283 465624.9
## 0.071 553011.4 0.02864283 465624.9
## 0.081 553011.4 0.02864283 465624.9
## 0.091 553011.4 0.02864283 465624.9
##
## Tuning parameter 'alpha' was held constant at a value of 0
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0 and lambda = 0.091.
cat("\nBest Tuning Parameters for Ridge (alpha = 0):\n")
##
## Best Tuning Parameters for Ridge (alpha = 0):
print(ridge_cv$bestTune)
## alpha lambda
## 10 0 0.091
# Prediction
ridge_predictions <- predict(ridge_cv, newdata = testData1)
# Evaluation
ridge_eval <- postResample(ridge_predictions, testData1$Weekly_Sales)
cat("\nRidge Model Evaluation:\n")
##
## Ridge Model Evaluation:
print(ridge_eval)
## RMSE Rsquared MAE
## 5.680049e+05 3.500146e-02 4.732875e+05
# Lasso Regression (Use same variables as LR)
lasso_cv <- train(
Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week + Temperature + CPI,
data = trainData1,
method = "glmnet",
trControl = cv_ctrl_reg,
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.01)),
preProcess = c("center", "scale")
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: Stores
cat("\n3) Lasso Regression:\n")
##
## 3) Lasso Regression:
print(lasso_cv)
## glmnet
##
## 5151 samples
## 6 predictor
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4121, 4123, 4120, 4120, 4120
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.001 552731.1 0.02904947 465469.7
## 0.011 552731.1 0.02904947 465469.7
## 0.021 552731.1 0.02904947 465469.7
## 0.031 552731.1 0.02904947 465469.7
## 0.041 552731.1 0.02904947 465469.7
## 0.051 552731.1 0.02904947 465469.7
## 0.061 552731.1 0.02904947 465469.7
## 0.071 552731.1 0.02904947 465469.7
## 0.081 552731.1 0.02904947 465469.7
## 0.091 552731.1 0.02904947 465469.7
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.091.
cat("\nBest Tuning Parameters for Lasso (alpha = 1):\n")
##
## Best Tuning Parameters for Lasso (alpha = 1):
print(lasso_cv$bestTune)
## alpha lambda
## 10 1 0.091
# Prediction
lasso_predictions <- predict(lasso_cv, newdata = testData1)
# Evaluation
lasso_eval <- postResample(lasso_predictions, testData1$Weekly_Sales)
cat("\nLasso Model Evaluation:\n")
##
## Lasso Model Evaluation:
print(lasso_eval)
## RMSE Rsquared MAE
## 5.679927e+05 3.499983e-02 4.732677e+05
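# With Weekly_Sales on the order of 10^5-10^6, lambda values in [0.001, 0.1] are
# effectively zero penalty, which is why RMSE is identical across the entire grid
# for both ridge and lasso above. A sketch of a wider, log-spaced grid
# (hypothetical object lasso_cv_wide; grid values chosen for illustration):
lasso_cv_wide <- train(
  Weekly_Sales ~ Stores + Fuel_Price + Unemployment + Week + Temperature + CPI,
  data = trainData1,
  method = "glmnet",
  trControl = cv_ctrl_reg,
  tuneGrid = expand.grid(alpha = 1, lambda = 10^seq(0, 5, length.out = 20)),
  preProcess = c("center", "scale")
)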
# FEATURE SELECTION for REGRESSION via RFE
candidate_features_reg <- c("Store","Temperature","CPI","Unemployment","Week","Fuel_Price","Holiday_Flag")
walmart_df_reg2 <- walmart_df
# Split Train Test
train_x_reg2 <- walmart_df_reg2[, candidate_features_reg]
train_y_reg2 <- walmart_df_reg2$Weekly_Sales
# RFE control
rfe_ctrl_reg_tree <- rfeControl(functions = treebagFuncs, method = "cv", number = 5)
# Run RFE
set.seed(2025)
rfe_results_reg_tree <- rfe(
x = train_x_reg2,
y = train_y_reg2,
sizes = 1:length(candidate_features_reg),
rfeControl = rfe_ctrl_reg_tree
)
cat("\n=== RFE RESULTS (Decision Tree Regression) ===\n")
##
## === RFE RESULTS (Decision Tree Regression) ===
print(rfe_results_reg_tree)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 184500 0.8930 119062 11597 0.011085 3812
## 2 175669 0.9029 116596 10934 0.011358 4764
## 3 172510 0.9066 115819 7697 0.007349 3644
## 4 172333 0.9067 115523 8211 0.007577 4727
## 5 171927 0.9072 115734 7880 0.007959 4632 *
## 6 172417 0.9067 115491 7889 0.007380 4170
## 7 173305 0.9057 116135 7815 0.007314 4432
##
## The top 5 variables (out of 5):
## Store, Week, CPI, Unemployment, Temperature
# Evaluate
best_features_reg_tree <- predictors(rfe_results_reg_tree)
cat("\nChosen subset for Decision Tree Regression:\n")
##
## Chosen subset for Decision Tree Regression:
print(best_features_reg_tree)
## [1] "Store" "Week" "CPI" "Unemployment" "Temperature"
set.seed(2025)
walmart_df_reg2 <- dplyr::select(walmart_df_reg2, -Date_numeric, -Fuel_Price, -Holiday_Flag)
trainIndex2 <- createDataPartition(walmart_df_reg2$Weekly_Sales, p = 0.8, list = FALSE)
trainData2 <- walmart_df_reg2[trainIndex2, ]
testData2 <- walmart_df_reg2[-trainIndex2, ]
cat("\nTraining Data Size:", nrow(trainData2), "\n")
##
## Training Data Size: 5151
cat("Testing Data Size:", nrow(testData2), "\n")
## Testing Data Size: 1284
# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)
# Decision Tree (method="rpart")
dt_grid <- expand.grid(cp=seq(0.0001, 0.01, by=0.002))
dt_cv <- train(
Weekly_Sales ~ Store + Week + CPI + Unemployment + Temperature,
data = walmart_df_reg2,
method = "rpart",
trControl = cv_ctrl_reg,
tuneGrid = dt_grid
)
cat("\n4) Decision Tree:\n")
##
## 4) Decision Tree:
print(dt_cv)
## CART
##
## 6435 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 5148, 5148, 5148, 5149, 5147
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.0001 150733.0 0.9286377 84998.36
## 0.0021 179021.6 0.8992878 109588.06
## 0.0041 189146.2 0.8874999 119394.29
## 0.0061 205798.2 0.8667599 137033.07
## 0.0081 220335.6 0.8475742 153190.28
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 1e-04.
cat("\nBest Tuning (cp):\n")
##
## Best Tuning (cp):
print(dt_cv$bestTune)
## cp
## 1 1e-04
# Prediction
dt_predictions <- predict(dt_cv, newdata = testData2)
# Evaluation
dt_eval <- postResample(dt_predictions, testData2$Weekly_Sales)
cat("\nDecision Tree Model Evaluation:\n")
##
## Decision Tree Model Evaluation:
print(dt_eval)
## RMSE Rsquared MAE
## 1.207413e+05 9.524806e-01 7.070176e+04
# Visualizing Decision Tree
rpart.plot(dt_cv$finalModel, main="Final Decision Tree (Regression)", cex = 0.3, cex.main = 1.5)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
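# CAUTION: dt_cv above was trained with data = walmart_df_reg2 (all 6,435 rows),
# so the testData2 rows were also seen during training and the test RMSE above is
# optimistic. A sketch of the leakage-free fit on the training split only
# (hypothetical object dt_cv_train; results will differ from those recorded above):
dt_cv_train <- train(
  Weekly_Sales ~ Store + Week + CPI + Unemployment + Temperature,
  data = trainData2,
  method = "rpart",
  trControl = cv_ctrl_reg,
  tuneGrid = dt_grid
)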
# FEATURE SELECTION for REGRESSION via RFE
candidate_features_reg <- c("Store","Temperature","CPI","Unemployment","Week","Fuel_Price","Holiday_Flag")
walmart_df_reg3 <- walmart_df
train_x_reg3 <- walmart_df_reg3[, candidate_features_reg]
train_y_reg3 <- walmart_df_reg3$Weekly_Sales
# RFE control for Random Forest Regression
rfe_ctrl_reg_rf <- rfeControl(
functions = rfFuncs,
method = "cv",
number = 5
)
# Run RFE
set.seed(2025)
rfe_results_reg_rf <- rfe(
x = train_x_reg3,
y = train_y_reg3,
sizes = 1:length(candidate_features_reg),
rfeControl = rfe_ctrl_reg_rf
)
cat("\n=== RFE RESULTS (Random Forest Regression) ===\n")
##
## === RFE RESULTS (Random Forest Regression) ===
print(rfe_results_reg_rf)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 162861 0.9167 91052 12066 0.010473 3066
## 2 128814 0.9485 73960 7502 0.005411 2539
## 3 119729 0.9569 67184 9935 0.006769 1450
## 4 126218 0.9523 70091 10928 0.007253 2095
## 5 131999 0.9516 77531 10658 0.006881 2696
## 6 119756 0.9553 64427 10726 0.007576 2125
## 7 111063 0.9627 63449 10119 0.006257 2091 *
##
## The top 5 variables (out of 7):
## Store, Week, CPI, Unemployment, Temperature
set.seed(2025)
walmart_df_reg3 <- dplyr::select(walmart_df_reg3, -Date_numeric, -Fuel_Price, -Holiday_Flag)
trainIndex3 <- createDataPartition(walmart_df_reg3$Weekly_Sales, p = 0.8, list = FALSE)
trainData3 <- walmart_df_reg3[trainIndex3, ]
testData3 <- walmart_df_reg3[-trainIndex3, ]
cat("\nTraining Data Size:", nrow(trainData3), "\n")
##
## Training Data Size: 5151
cat("Testing Data Size:", nrow(testData3), "\n")
## Testing Data Size: 1284
# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5)
# Random Forest (method="rf")
rf_grid <- expand.grid(mtry=c(2,3,4))
rf_cv <- train(
Weekly_Sales ~ Store + Week + CPI + Unemployment + Temperature,
data = trainData3,
method = "rf",
trControl = cv_ctrl_reg,
tuneGrid = rf_grid,
ntree = 500
)
cat("\n5) Random Forest:\n")
##
## 5) Random Forest:
print(rf_cv)
## Random Forest
##
## 5151 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4120, 4121, 4121, 4121, 4121
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 357477.0 0.8766710 292696.2
## 3 277247.7 0.8798755 219199.6
## 4 228989.8 0.8897462 169283.5
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 4.
cat("\nBest Tuning (mtry):\n")
##
## Best Tuning (mtry):
print(rf_cv$bestTune)
## mtry
## 3 4
cat("\nRandom Forest Variable Importance:\n")
##
## Random Forest Variable Importance:
print(varImp(rf_cv))
## rf variable importance
##
## only 20 most important variables shown (out of 48)
##
## Overall
## Store20 100.00
## Store14 89.49
## Store4 75.47
## Store13 74.67
## Store10 60.06
## Store2 56.48
## Store27 40.23
## Store44 38.02
## Unemployment 37.53
## Store5 33.65
## CPI 32.18
## Store36 28.58
## Store3 21.74
## Store6 20.28
## Store30 19.94
## Store38 18.06
## Store1 17.53
## Store16 14.88
## Store37 14.39
## Week 13.12
# Prediction
rf_predictions <- predict(rf_cv, newdata = testData3)
rf_eval <- postResample(rf_predictions, testData3$Weekly_Sales)
cat("\nRandom Forest Model Evaluation:\n")
##
## Random Forest Model Evaluation:
print(rf_eval)
## RMSE Rsquared MAE
## 2.219946e+05 8.895877e-01 1.663702e+05
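# Note: caret's formula interface dummy-codes the 45-level Store factor, so the
# forest actually sees 48 predictors (the varImp output above says "out of 48").
# mtry = 2-4 is therefore well below the regression default of p/3 = 16; a sketch
# with a wider grid (hypothetical object rf_cv_wide; slower to run):
rf_cv_wide <- train(
  Weekly_Sales ~ Store + Week + CPI + Unemployment + Temperature,
  data = trainData3,
  method = "rf",
  trControl = cv_ctrl_reg,
  tuneGrid = expand.grid(mtry = c(4, 8, 16, 32)),
  ntree = 500
)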
# Collect the fitted models and their evaluation results
models <- list(
Linear = list(eval = evaluation_metrics),
Ridge = list(eval = ridge_eval),
Lasso = list(eval = lasso_eval),
DecisionTree = list(eval = dt_eval),
RandomForest = list(eval = rf_eval)
)
results_testing <- data.frame(Model = character(),
RMSE = numeric(),
R2 = numeric(),
stringsAsFactors = FALSE)
# Loop over each model and extract performance metrics
for (model_name in names(models)) {
metrics <- models[[model_name]]$eval
# Append the results to the data frame
results_testing <- rbind(results_testing,
data.frame(Model = model_name,
RMSE = metrics["RMSE"],
R2 = metrics["Rsquared"]))
}
rownames(results_testing) <- NULL
cat("\n\n=== FINAL REGRESSION PERFORMANCE ON TESTING DATA (5 Models) ===\n")
##
##
## === FINAL REGRESSION PERFORMANCE ON TESTING DATA (5 Models) ===
print(results_testing, row.names = FALSE)
## Model RMSE R2
## Linear 567980.7 0.03500834
## Ridge 568004.9 0.03500146
## Lasso 567992.7 0.03499983
## DecisionTree 120741.3 0.95248058
## RandomForest 221994.6 0.88958767
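# A quick visual comparison of the five test-set RMSEs (uses results_testing from
# above; recall that the DecisionTree figure benefits from the full-data fit noted
# earlier):
ggplot(results_testing, aes(x = reorder(Model, RMSE), y = RMSE)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  scale_y_continuous(labels = comma) +
  labs(title = "Test RMSE by Regression Model", x = "Model", y = "RMSE") +
  theme_minimal()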
# Classification
# Reload Walmart dataset
walmart_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Walmart.csv")
# Structure and missing data check
str(walmart_df)
## 'data.frame': 6435 obs. of 8 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : chr "05-02-2010" "12-02-2010" "19-02-2010" "26-02-2010" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: int 0 1 0 0 0 0 0 0 0 0 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
cat("Total missing:", sum(is.na(walmart_df)), "\n")
## Total missing: 0
cat("Duplicates:", sum(duplicated(walmart_df)), "\n")
## Duplicates: 0
# Convert date
walmart_df$Date <- as.Date(walmart_df$Date, format="%d-%m-%Y")
# Create 'Week' from Date for modeling
walmart_df$Week <- lubridate::week(walmart_df$Date)
# Create numeric date for correlation
walmart_df$Date_numeric <- as.numeric(walmart_df$Date)
# Convert 'Holiday_Flag' to factor
walmart_df$Holiday_Flag <- as.factor(walmart_df$Holiday_Flag)
# Convert 'Store' to factor
walmart_df$Store <- as.factor(walmart_df$Store)
str(walmart_df)
## 'data.frame': 6435 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Date_numeric: num 14645 14652 14659 14666 14673 ...
cat("Total missing:", sum(is.na(walmart_df)), "\n")
## Total missing: 0
cat("Duplicates:", sum(duplicated(walmart_df)), "\n")
## Duplicates: 0
# Create a binary target: "High" if Weekly_Sales is above the mean, else "Low"
walmart_df_cl <- walmart_df
walmart_df_cl$Sales_Class <- ifelse(walmart_df_cl$Weekly_Sales > mean(walmart_df_cl$Weekly_Sales), "High", "Low")
walmart_df_cl$Sales_Class <- as.factor(walmart_df_cl$Sales_Class)
cat("\nClass Distribution:\n")
##
## Class Distribution:
print(table(walmart_df_cl$Sales_Class))
##
## High Low
## 2876 3559
print(prop.table(table(walmart_df_cl$Sales_Class)))
##
## High Low
## 0.4469308 0.5530692
str(walmart_df_cl)
## 'data.frame': 6435 obs. of 11 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Date_numeric: num 14645 14652 14659 14666 14673 ...
## $ Sales_Class : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
# Drop unnecessary variables
walmart_df_cl <- dplyr::select(walmart_df_cl, -Date_numeric, -Date)
str(walmart_df_cl)
## 'data.frame': 6435 obs. of 9 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Sales_Class : Factor w/ 2 levels "High","Low": 1 1 1 1 1 1 1 1 1 1 ...
library(dplyr)
# Copy the data
walmart_df_cls <- walmart_df_cl
# Ensure Sales_Class is a factor with the correct level order
walmart_df_cls$Sales_Class <- factor(walmart_df_cls$Sales_Class, levels = c("Low", "High"))
# Prepare empty columns
walmart_df_cls$Store_WOE <- NA
walmart_df_cls$Holiday_WOE <- NA
# Function to compute WoE from prior (earlier-week) data only
calculate_woe_timeaware <- function(data, cat_var, target_var, positive_class = "High", negative_class = "Low", smoothing = 1e-6) {
total_positive <- sum(data[[target_var]] == positive_class, na.rm = TRUE)
total_negative <- sum(data[[target_var]] == negative_class, na.rm = TRUE)
woe_table <- data %>%
group_by(across(all_of(cat_var))) %>%
summarise(
count_positive = sum(!!sym(target_var) == positive_class, na.rm = TRUE),
count_negative = sum(!!sym(target_var) == negative_class, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
perc_positive = count_positive / total_positive,
perc_negative = count_negative / total_negative,
WOE = log((perc_positive + smoothing) / (perc_negative + smoothing))
) %>%
dplyr::select(all_of(cat_var), WOE)
return(woe_table)
}
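# Quick sanity check of the helper on the full data (illustrative only; the
# time-aware loop below restricts it to prior weeks). woe_demo has one row per
# Holiday_Flag level with WOE = log(share of "High" / share of "Low"):
woe_demo <- calculate_woe_timeaware(walmart_df_cls, "Holiday_Flag", "Sales_Class")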
# Get the unique weeks in sorted order (to keep the encoding time-aware)
weeks <- sort(unique(walmart_df_cls$Week))
# Loop over each week: compute WoE from data up to the previous week (Week < t)
for (w in weeks) {
# Prior data only
prior_data <- walmart_df_cls %>% filter(Week < w)
current_rows <- which(walmart_df_cls$Week == w)
if (nrow(prior_data) > 0) {
# WoE Store
woe_store <- calculate_woe_timeaware(prior_data, "Store", "Sales_Class")
# Map WoE values onto Store_WOE for week w
walmart_df_cls$Store_WOE[current_rows] <- walmart_df_cls$Store[current_rows] %>%
as.character() %>%
sapply(function(x) {
value <- woe_store$WOE[woe_store$Store == x]
if (length(value) == 0) return(0) else return(value)
})
# WoE Holiday_Flag
woe_holiday <- calculate_woe_timeaware(prior_data, "Holiday_Flag", "Sales_Class")
walmart_df_cls$Holiday_WOE[current_rows] <- walmart_df_cls$Holiday_Flag[current_rows] %>%
as.character() %>%
sapply(function(x) {
value <- woe_holiday$WOE[woe_holiday$Holiday_Flag == x]
if (length(value) == 0) return(0) else return(value)
})
} else {
# For the first week, default the WoE to 0
walmart_df_cls$Store_WOE[current_rows] <- 0
walmart_df_cls$Holiday_WOE[current_rows] <- 0
}
}
# Optional: drop Holiday_Flag now that Holiday_WOE replaces it
walmart_df_cls <- walmart_df_cls %>% dplyr::select(-Holiday_Flag)
str(walmart_df_cls)
## 'data.frame': 6435 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
## $ Store_WOE : num 10.9 10.9 10.9 10.9 10.9 ...
## $ Holiday_WOE : num 0 0.2223 -0.0395 -0.0315 -0.0255 ...
# TRAIN/TEST SPLIT (80/20)
set.seed(2025)
trainIndex <- createDataPartition(walmart_df_cls$Sales_Class, p=0.8, list=FALSE)
trainData_cls <- walmart_df_cls[trainIndex, ]
testData_cls <- walmart_df_cls[-trainIndex, ]
cat("\nTraining Size:", nrow(trainData_cls), "\n")
##
## Training Size: 5149
cat("Testing Size:", nrow(testData_cls), "\n")
## Testing Size: 1286
str(testData_cls)
## 'data.frame': 1286 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekly_Sales: num 1545419 1391256 1494252 1422712 1449143 ...
## $ Temperature : num 65.9 64.8 74.8 84.3 85.2 ...
## $ Fuel_Price : num 2.77 2.79 2.85 2.65 2.62 ...
## $ CPI : num 211 210 210 211 212 ...
## $ Unemployment: num 7.81 7.81 7.81 7.81 7.79 ...
## $ Week : num 15 17 20 26 35 50 2 13 15 31 ...
## $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
## $ Store_WOE : num 10.9 10.9 10.9 10.8 10.8 ...
## $ Holiday_WOE : num -0.01453 -0.01205 -0.00962 -0.00579 -0.00371 ...
# RFE on TRAIN SET for Classification
candidate_features_cls <- c("Store_WOE", "Holiday_WOE", "Temperature",
"CPI", "Unemployment", "Week", "Fuel_Price")
# Prepare X and Y from the training set
train_x_rfe <- trainData_cls[, candidate_features_cls]
train_y_rfe <- trainData_cls$Sales_Class
# RFE control
rfe_ctrl_cls_logreg <- rfeControl(
functions=lrFuncs,
method="cv",
number=5
)
# RFE
set.seed(2025)
rfe_results_cls_logreg <- rfe(
x=train_x_rfe,
y=train_y_rfe,
sizes=1:length(candidate_features_cls),
rfeControl=rfe_ctrl_cls_logreg
)
cat("\n=== RFE RESULTS (Logistic - WOE) ===\n")
##
## === RFE RESULTS (Logistic - WOE) ===
print(rfe_results_cls_logreg)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.9446 0.8875 0.005617 0.01148
## 2 0.9441 0.8864 0.008018 0.01621
## 3 0.9448 0.8880 0.007644 0.01553 *
## 4 0.9431 0.8844 0.007402 0.01497
## 5 0.9435 0.8852 0.006804 0.01376
## 6 0.9437 0.8856 0.006926 0.01399
## 7 0.9439 0.8859 0.007803 0.01576
##
## The top 3 variables (out of 3):
## Store_WOE, Temperature, Week
best_features_cls <- predictors(rfe_results_cls_logreg)
cat("\nChosen subset for classification:\n")
##
## Chosen subset for classification:
print(best_features_cls)
## [1] "Store_WOE" "Temperature" "Week"
# CROSS-VALIDATION
cls_formula <- as.formula(
paste("Sales_Class ~", paste(best_features_cls, collapse=" + "))
)
# 5-fold CV with SMOTE
cv_ctrl_cls <- trainControl(
method="cv",
number=5,
sampling="smote",
savePredictions=TRUE
)
# FIT LOGISTIC MODEL ON TEST DATA (note: trained on the test split; see the caution after the coefficient table below)
set.seed(2025)
cls_glm <- train(
cls_formula,
data=testData_cls,
method="glm",
family=binomial,
trControl=cv_ctrl_cls
)
cat("\n[LOGISTIC] Results:\n")
##
## [LOGISTIC] Results:
print(cls_glm)
## Generalized Linear Model
##
## 1286 samples
## 3 predictor
## 2 classes: 'Low', 'High'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1029, 1028, 1029, 1029, 1029
## Addtional sampling using SMOTE
##
## Resampling results:
##
## Accuracy Kappa
## 0.9408922 0.88035
cat("\nLogistic Coefficients:\n")
##
## Logistic Coefficients:
print(summary(cls_glm$finalModel)$coefficients)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.80978517 0.476087228 1.700918 8.895845e-02
## Store_WOE 0.72951792 0.068294288 10.681976 1.236125e-26
## Temperature -0.02013515 0.007659153 -2.628900 8.566147e-03
## Week 0.02439882 0.008936037 2.730385 6.326032e-03
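# CAUTION: cls_glm above was fitted on testData_cls, so the "Logistic" row in the
# final test-set comparison partly reuses its own training rows. A sketch of the
# conventional fit on the training split (hypothetical object cls_glm_train):
set.seed(2025)
cls_glm_train <- train(
  cls_formula,
  data = trainData_cls,
  method = "glm",
  family = binomial,
  trControl = cv_ctrl_cls
)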
str(testData_cls)
## 'data.frame': 1286 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Weekly_Sales: num 1545419 1391256 1494252 1422712 1449143 ...
## $ Temperature : num 65.9 64.8 74.8 84.3 85.2 ...
## $ Fuel_Price : num 2.77 2.79 2.85 2.65 2.62 ...
## $ CPI : num 211 210 210 211 212 ...
## $ Unemployment: num 7.81 7.81 7.81 7.81 7.79 ...
## $ Week : num 15 17 20 26 35 50 2 13 15 31 ...
## $ Sales_Class : Factor w/ 2 levels "Low","High": 2 2 2 2 2 2 2 2 2 2 ...
## $ Store_WOE : num 10.9 10.9 10.9 10.8 10.8 ...
## $ Holiday_WOE : num -0.01453 -0.01205 -0.00962 -0.00579 -0.00371 ...
# KNN (Use same variables as Logistic Regression)
cls_knn <- train(
cls_formula,
data = trainData_cls,
method = "knn",
trControl = cv_ctrl_cls,
tuneLength = 10,
preProcess = c("center", "scale")
)
print(cls_knn)
## k-Nearest Neighbors
##
## 5149 samples
## 3 predictor
## 2 classes: 'Low', 'High'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4119, 4120, 4119, 4120, 4118
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9405725 0.8795553
## 7 0.9430973 0.8847135
## 9 0.9444566 0.8873687
## 11 0.9434840 0.8855164
## 13 0.9454286 0.8894411
## 15 0.9440686 0.8866666
## 17 0.9434866 0.8855424
## 19 0.9444575 0.8875184
## 21 0.9450395 0.8886844
## 23 0.9467885 0.8922857
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
# SVM (Radial)
cls_svm <- train(
cls_formula,
data = trainData_cls,
method = "svmRadial",
trControl = cv_ctrl_cls,
tuneLength = 5,
preProcess = c("center", "scale")
)
print(cls_svm)
## Support Vector Machines with Radial Basis Function Kernel
##
## 5149 samples
## 3 predictor
## 2 classes: 'Low', 'High'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4119, 4119, 4119, 4120, 4119
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9411549 0.8806528
## 0.50 0.9423195 0.8829626
## 1.00 0.9442620 0.8868530
## 2.00 0.9446498 0.8876714
## 4.00 0.9427079 0.8836876
##
## Tuning parameter 'sigma' was held constant at a value of 0.556677
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.556677 and C = 2.
library(caret)
# Candidate features
candidate_features_cls <- c("Store", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Flag")
# Use bagged-tree (treebag) RFE functions
rfe_ctrl_cls <- rfeControl(
functions = treebagFuncs,
method = "cv",
number = 5
)
# Define predictor matrix and target vector
train_x_cls <- walmart_df[, candidate_features_cls]
train_y_cls <- walmart_df_cl$Sales_Class
# Run RFE
set.seed(2025)
rfe_results_cls <- rfe(
x = train_x_cls,
y = train_y_cls,
sizes = 1:length(candidate_features_cls),
rfeControl = rfe_ctrl_cls
)
cat("\n=== RFE RESULTS (Classification) ===\n")
##
## === RFE RESULTS (Classification) ===
print(rfe_results_cls)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.9523 0.9030 0.007699 0.015738
## 2 0.9520 0.9028 0.005638 0.011337
## 3 0.9562 0.9113 0.005082 0.010241
## 4 0.9605 0.9201 0.004540 0.009173
## 5 0.9635 0.9261 0.002627 0.005308
## 6 0.9627 0.9245 0.006380 0.012862
## 7 0.9658 0.9307 0.004133 0.008356 *
##
## The top 5 variables (out of 7):
## Store, Week, CPI, Temperature, Unemployment
# Train/test split
set.seed(2025)
walmart_df_cl1 <- dplyr::select(walmart_df_cl, -Fuel_Price, -Holiday_Flag, -Weekly_Sales)
trainIndex <- createDataPartition(walmart_df_cl1$Sales_Class, p = 0.8, list = FALSE)
trainData_cl <- walmart_df_cl1[trainIndex, ]
testData_cl <- walmart_df_cl1[-trainIndex, ]
cat("\nTraining Data Size:", nrow(trainData_cl), "\n")
##
## Training Data Size: 5149
cat("Testing Data Size:", nrow(testData_cl), "\n")
## Testing Data Size: 1286
# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5, sampling ='smote')
# Setup Cross-Validation with SMOTE
cv_ctrl_cls <- trainControl(
method = "cv",
number = 5,
sampling = "smote",
savePredictions = TRUE
)
str(walmart_df)
## 'data.frame': 6435 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Date_numeric: num 14645 14652 14659 14666 14673 ...
# Create a formula for classification
cls_formula <- as.formula("Sales_Class ~ Store + Week + CPI + Temperature + Unemployment")
# Decision Tree
cls_dt_grid <- expand.grid(cp = seq(0.001, 0.01, by = 0.002))
cls_dt <- train(
cls_formula,
data = trainData_cl,
method = "rpart",
trControl = cv_ctrl_cls,
tuneGrid = cls_dt_grid
)
print(cls_dt)
## CART
##
## 5149 samples
## 5 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4120, 4119, 4119, 4119, 4119
## Addtional sampling using SMOTE
##
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001 0.9463970 0.8911338
## 0.003 0.9296929 0.8562443
## 0.005 0.9300816 0.8571123
## 0.007 0.9300816 0.8571123
## 0.009 0.9256141 0.8475072
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001.
# Visualizing Decision Tree
rpart.plot(cls_dt$finalModel, main="Final Decision Tree (Classification)", cex = 0.5, cex.main = 1.5)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Candidate features
walmart_df_cl2 <- walmart_df_cl
candidate_features_cl2 <- c("Store", "Temperature", "CPI", "Unemployment", "Week", "Fuel_Price", "Holiday_Flag")
# Use random forest RFE functions
rfe_ctrl_cls <- rfeControl(
functions = rfFuncs,
method = "cv",
number = 5
)
# Define predictor matrix and target vector
train_x_cl2 <- walmart_df_cl2[, candidate_features_cl2]
train_x_cl2 <- train_x_cl2 %>% mutate_if(is.factor, as.numeric)
train_y_cl2 <- walmart_df_cl2$Sales_Class
# Run RFE
set.seed(2025)
rfe_results_cls <- rfe(
x = train_x_cl2,
y = train_y_cl2,
sizes = 1:length(candidate_features_cl2),
rfeControl = rfe_ctrl_cls
)
cat("\n=== RFE RESULTS (Classification) ===\n")
##
## === RFE RESULTS (Classification) ===
print(rfe_results_cls)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (5 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.9523 0.9030 0.007699 0.015738
## 2 0.9579 0.9144 0.007662 0.015612
## 3 0.9613 0.9213 0.007075 0.014422
## 4 0.9587 0.9162 0.005737 0.011640
## 5 0.9622 0.9235 0.004232 0.008579 *
## 6 0.9616 0.9223 0.004440 0.008934
## 7 0.9616 0.9223 0.005194 0.010407
##
## The top 5 variables (out of 5):
## Store, CPI, Unemployment, Fuel_Price, Temperature
### Train/test split
set.seed(2025)
walmart_df_cl2 <- dplyr::select(walmart_df_cl2, -Weekly_Sales, -Week, -Holiday_Flag)
trainIndex <- createDataPartition(walmart_df_cl2$Sales_Class, p = 0.8, list = FALSE)
trainData_cls2 <- walmart_df_cl2[trainIndex, ]
testData_cls2 <- walmart_df_cl2[-trainIndex, ]
cat("\nTraining Data Size:", nrow(trainData_cls2), "\n")
##
## Training Data Size: 5149
cat("Testing Data Size:", nrow(testData_cls2), "\n")
## Testing Data Size: 1286
# Cross-validation control
cv_ctrl_reg <- trainControl(method="cv", number=5, sampling ='smote')
# Setup Cross-Validation with SMOTE
cv_ctrl_cls <- trainControl(
method = "cv",
number = 5,
sampling = "smote",
savePredictions = TRUE
)
str(walmart_df)
## 'data.frame': 6435 obs. of 10 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-12" ...
## $ Weekly_Sales: num 1643691 1641957 1611968 1409728 1554807 ...
## $ Holiday_Flag: Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 6 7 8 9 10 11 12 13 14 15 ...
## $ Date_numeric: num 14645 14652 14659 14666 14673 ...
# Create a formula for classification
cls_formula <- as.formula("Sales_Class ~ Store + CPI + Unemployment + Fuel_Price + Temperature")
# Random Forest
cls_rf <- train(
cls_formula,
data = trainData_cls2,
method = "rf",
trControl = cv_ctrl_cls,
tuneGrid = expand.grid(mtry = c(2, 3, 4)),
ntree = 500
)
print(cls_rf)
## Random Forest
##
## 5149 samples
## 5 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4120, 4119, 4119, 4119, 4119
## Addtional sampling using SMOTE
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9263913 0.8490232
## 3 0.9256141 0.8472439
## 4 0.9283333 0.8530607
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
cat("\nRandom Forest Variable Importance:\n")
##
## Random Forest Variable Importance:
print(varImp(cls_rf))
## rf variable importance
##
## only 20 most important variables shown (out of 48)
##
## Overall
## Unemployment 100.00
## Store13 83.67
## Store6 79.46
## Store20 75.39
## Store27 74.79
## Store2 74.34
## Store24 74.05
## Store4 73.61
## Store31 70.75
## Store14 66.16
## Store39 65.60
## Store19 65.16
## Store10 64.57
## Store11 64.55
## Store23 64.51
## Store41 61.32
## Store32 60.61
## Store28 60.10
## CPI 56.11
## Store3 25.14
# EVALUATE ON TEST DATA
models_class <- list(
Logistic=cls_glm,
KNN=cls_knn,
SVM=cls_svm,
DecisionTree=cls_dt,
RandomForest=cls_rf
)
final_cls_results <- data.frame(
Model=character(),
Accuracy=numeric(),
Sensitivity=numeric(),
Specificity=numeric(),
Precision=numeric(),
stringsAsFactors=FALSE
)
for(model_name in names(models_class)) {
model_obj <- models_class[[model_name]]
preds <- predict(model_obj, newdata=testData_cls)
cm <- confusionMatrix(preds, testData_cls$Sales_Class)
final_cls_results <- rbind(
final_cls_results,
data.frame(
Model = model_name,
Accuracy = cm$overall["Accuracy"],
Sensitivity = cm$byClass["Sensitivity"],
Specificity = cm$byClass["Specificity"],
Precision = cm$byClass["Precision"]
)
)
}
## Warning in confusionMatrix.default(preds, testData_cls$Sales_Class): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(preds, testData_cls$Sales_Class): Levels are
## not in the same order for reference and data. Refactoring data to match.
cat("\n=== FINAL CLASSIFICATION RESULTS (TEST) ===\n")
##
## === FINAL CLASSIFICATION RESULTS (TEST) ===
print(final_cls_results, row.names=FALSE)
## Model Accuracy Sensitivity Specificity Precision
## Logistic 0.9424572 0.9507736 0.9321739 0.9454545
## KNN 0.9401244 0.9493671 0.9286957 0.9427374
## SVM 0.9455677 0.9690577 0.9165217 0.9348711
## DecisionTree 0.9611198 0.9845288 0.9321739 0.9472260
## RandomForest 0.9307932 0.9943741 0.8521739 0.8926768
# Load & Basic Checks + Transaction by Country
online_df <- read.csv("C:/Users/Santasila Bryan Kusn/OneDrive/Documents/SIM GE Courses/LSE Year 3/ST3189 - Machine Learning/Assignments + Coursework/Cursework Project/Online Retail.csv")
cat("Dimension:", dim(online_df), "\n")
## Dimension: 541909 8
cat("Total Missing:", sum(is.na(online_df)), "\n")
## Total Missing: 135080
cat("Duplicates:", sum(duplicated(online_df)), "\n")
## Duplicates: 5268
online_df <- online_df[online_df$Quantity>0 & online_df$UnitPrice>0, ]
online_df <- na.omit(online_df)
cat("Final dimension after cleaning:", dim(online_df), "\n")
## Final dimension after cleaning: 397884 8
cat("Remaining Missing:", sum(is.na(online_df)), "\n")
## Remaining Missing: 0
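# The 5,268 duplicated rows counted above survive this cleaning step; if they are
# data-entry artifacts rather than legitimate repeated line items, they could be
# dropped as well (kept in a separate demo object so the recorded dimensions are
# unchanged):
online_df_dedup <- online_df[!duplicated(online_df), ]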
transaction_data <- online_df %>%
count(Country, name = "TransactionCount") %>%
arrange(desc(TransactionCount))
head(transaction_data, 10) # top 10 countries
p <- ggplot(transaction_data, aes(x=reorder(Country, -TransactionCount), y=TransactionCount)) +
geom_bar(stat='identity', fill='steelblue') +
theme_minimal() +
labs(x='', y='Transaction Count', title='Transaction Count by Country') +
scale_y_continuous(
breaks = seq(50000, 400000, by=50000),
labels = function(x) paste0(x/1000, "K")
) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 40, vjust = 0.7))
ggplotly(p)
# Revenue calculations
online_retail <- online_df %>%
mutate(Revenue = Quantity * UnitPrice,
InvoiceDate = dmy_hm(InvoiceDate))
monthly_revenue <- online_retail %>%
mutate(YearMonth = floor_date(InvoiceDate, "month")) %>%
group_by(YearMonth) %>%
summarise(Monthly_Revenue = sum(Revenue))
head(monthly_revenue)
d_revenue <- online_retail %>%
  mutate(Day = floor_date(InvoiceDate, "day")) %>%
  group_by(Day) %>%
  summarise(Daily_Revenue = sum(Revenue))
weekday_revenue <- online_retail %>%
mutate(Weekday = wday(InvoiceDate)) %>%
group_by(Weekday) %>%
summarise(Weekday_Revenue = sum(Revenue))
# Plot daily revenue
p2 <- ggplot(d_revenue, aes(x=Day, y=Daily_Revenue)) +
geom_line(color='lightblue') +
labs(title='Revenue by Date', x='Date', y='Total Sales') +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
ggplotly(p2)
# Columns are renamed to explicit RFM names after aggregation (done below)
data0 <- online_df
data1 <- data0 %>%
filter(!is.na(CustomerID), !is.na(Quantity), Quantity>0) %>%
mutate(InvoiceDate = dmy_hm(InvoiceDate),
Revenue = Quantity*UnitPrice)
snapshot_date <- as.Date("2011-12-09")
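# The snapshot is hard-coded to the dataset's final invoice date; a common
# alternative derives it from the data itself, one day past the last invoice
# (illustrative only, not used below):
snapshot_alt <- max(as.Date(data1$InvoiceDate)) + 1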
rfm_data <- data1 %>%
group_by(CustomerID) %>%
summarise(
Recency = round(as.numeric(difftime(snapshot_date, max(InvoiceDate), units="days")),2),
Frequency = n_distinct(InvoiceNo),
Monetary = sum(Revenue)
) %>%
ungroup()
# Rename columns to explicitly say RFM
colnames(rfm_data) <- c("CustomerID","Recency","Frequency","Monetary")
print(rfm_data)
## # A tibble: 4,338 × 4
## CustomerID Recency Frequency Monetary
## <int> <dbl> <int> <dbl>
## 1 12346 325. 1 77184.
## 2 12347 1.34 7 4310
## 3 12348 74.4 4 1797.
## 4 12349 17.6 1 1758.
## 5 12350 309. 1 334.
## 6 12352 35.4 8 2506.
## 7 12353 203. 1 89
## 8 12354 231. 1 1079.
## 9 12355 213. 1 459.
## 10 12356 21.6 3 2811.
## # ℹ 4,328 more rows
# Boxplots combined into one figure
p_recency <- ggplot(rfm_data, aes(x=factor(1), y=Recency)) +
geom_boxplot(fill="#5874DC") +
coord_cartesian(ylim=c(0,300)) +
labs(title="Recency", y="Recency", x="") +
theme_minimal() +
theme(plot.title=element_text(hjust=0.5),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p_frequency <- ggplot(rfm_data, aes(x=factor(1), y=Frequency)) +
geom_boxplot(fill="#E06C78") +
coord_cartesian(ylim=c(0,20)) +
labs(title="Frequency", y="Frequency", x="") +
theme_minimal() +
theme(plot.title=element_text(hjust=0.5),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p_monetary <- ggplot(rfm_data, aes(x=factor(1), y=Monetary)) +
geom_boxplot(fill="#6AAB9C") +
coord_cartesian(ylim=c(0,3000)) +
labs(title="Monetary", y="Monetary", x="") +
theme_minimal() +
theme(plot.title=element_text(hjust=0.5),
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p_combined <- p_recency + p_frequency + p_monetary +
plot_layout(guides = 'collect') &
theme(legend.position = 'top', legend.justification="right")
print(p_combined)
cat("Mean Recency:", mean(rfm_data$Recency, na.rm=TRUE), "\n")
## Mean Recency: 91.51462
cat("Mean Frequency:", mean(rfm_data$Frequency, na.rm=TRUE), "\n")
## Mean Frequency: 4.272015
cat("Mean Monetary:", mean(rfm_data$Monetary, na.rm=TRUE), "\n")
## Mean Monetary: 2054.266
cat("Median Recency:", median(rfm_data$Recency, na.rm=TRUE), "\n")
## Median Recency: 49.555
cat("Median Frequency:", median(rfm_data$Frequency, na.rm=TRUE), "\n")
## Median Frequency: 2
cat("Median Monetary:", median(rfm_data$Monetary, na.rm=TRUE), "\n")
## Median Monetary: 674.485
rfm_data$Recency <- as.numeric(rfm_data$Recency)
rfm_data$Frequency <- as.numeric(rfm_data$Frequency)
rfm_data$Monetary <- as.numeric(rfm_data$Monetary)
rfm_scaled <- scale(rfm_data[, c("Recency","Frequency","Monetary")])
# PCA
pca_res <- prcomp(rfm_scaled, scale.=TRUE)
pca_summary <- summary(pca_res)
cat("Proportion of Variance (PC1,PC2):\n")
## Proportion of Variance (PC1,PC2):
print(pca_summary$importance[2,1:2])
## PC1 PC2
## 0.55507 0.30245
num_pcs <- which(cumsum(pca_summary$importance[2,])>=0.8)[1]
cat("Number of PCs used:", num_pcs, "\n")
## Number of PCs used: 2
# K-Means (k=4 as example)
set.seed(2025)
kmeans_result <- kmeans(pca_res$x[,1:2], centers=4)
# Elbow & Silhouette
fviz_nbclust(rfm_scaled, kmeans, method="wss")
fviz_nbclust(rfm_scaled, kmeans, method="silhouette")
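# Note: the k = 4 fit above was chosen before these diagnostics, and the
# diagnostics run on rfm_scaled while the clustering uses the first two PCs. A
# sketch that picks k by average silhouette width directly on the PCA scores
# (hypothetical objects d_pca, sil_width, k_best, km_best):
set.seed(2025)
d_pca <- dist(pca_res$x[, 1:2])
sil_width <- sapply(2:10, function(k) {
  km <- kmeans(pca_res$x[, 1:2], centers = k, nstart = 25)
  mean(silhouette(km$cluster, d_pca)[, 3])
})
k_best <- (2:10)[which.max(sil_width)]
km_best <- kmeans(pca_res$x[, 1:2], centers = k_best, nstart = 25)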
# Visualization
cluster_data <- data.frame(pca_res$x[,1:2], cluster=factor(kmeans_result$cluster))
ggplot(cluster_data, aes(x=PC1, y=PC2, color=cluster)) +
geom_point(size=3, alpha=0.6) +
labs(title="K-Means Clustering on PCA (k=4)", color="Cluster") +
theme_minimal()
# Summaries
rfm_data$Cluster <- kmeans_result$cluster
kmeans_summary <- rfm_data %>%
group_by(Cluster) %>%
summarise(
Size = n(),
Avg_Recency = mean(Recency),
Avg_Frequency = mean(Frequency),
Avg_Monetary = mean(Monetary)
)
print(kmeans_summary)
## # A tibble: 4 × 5
## Cluster Size Avg_Recency Avg_Frequency Avg_Monetary
## <int> <int> <dbl> <dbl> <dbl>
## 1 1 171 14.6 23.7 14871.
## 2 2 12 6.50 86.8 130361.
## 3 3 3086 42.3 3.82 1391.
## 4 4 1069 247. 1.54 479.
dist_matrix <- dist(rfm_data[, c("Recency","Frequency","Monetary")])
hc_ward <- hclust(dist_matrix, method="ward.D2")
hc_complete <- hclust(dist_matrix, method="complete")
hc_single <- hclust(dist_matrix, method="single")
hc_average <- hclust(dist_matrix, method="average")
clusters_ward <- cutree(hc_ward, k=4)
clusters_complete <- cutree(hc_complete, k=4)
clusters_single <- cutree(hc_single, k=4)
clusters_average <- cutree(hc_average, k=4)
sil_ward <- silhouette(clusters_ward, dist_matrix)
sil_complete <- silhouette(clusters_complete, dist_matrix)
sil_single <- silhouette(clusters_single, dist_matrix)
sil_average <- silhouette(clusters_average, dist_matrix)
cat("\nAverage Silhouette Scores by method:\n")
##
## Average Silhouette Scores by method:
cat("Ward's:", mean(sil_ward[,3]), "\n")
## Ward's: 0.9543151
cat("Complete:", mean(sil_complete[,3]), "\n")
## Complete: 0.9671201
cat("Single:", mean(sil_single[,3]), "\n")
## Single: 0.9800182
cat("Average:", mean(sil_average[,3]), "\n")
## Average: 0.963419
dist_euclidean <- dist(rfm_data[,c("Recency","Frequency","Monetary")], method="euclidean")
dist_manhattan <- dist(rfm_data[,c("Recency","Frequency","Monetary")], method="manhattan")
hc_euclidean <- hclust(dist_euclidean, method="ward.D2")
hc_manhattan <- hclust(dist_manhattan, method="ward.D2")
clusters_euclidean <- cutree(hc_euclidean, k=4)
clusters_manhattan <- cutree(hc_manhattan, k=4)
sil_euclidean <- silhouette(clusters_euclidean, dist_euclidean)
sil_manhattan <- silhouette(clusters_manhattan, dist_manhattan)
cat("Silhouette Score (Euclidean):", mean(sil_euclidean[,3]), "\n")
## Silhouette Score (Euclidean): 0.9543151
cat("Silhouette Score (Manhattan):", mean(sil_manhattan[,3]), "\n")
## Silhouette Score (Manhattan): 0.891857
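# The silhouette comparisons above use unscaled RFM values, where Monetary's range
# dominates the distances; a sketch repeating the Ward/Euclidean comparison on the
# standardized features (rfm_scaled from earlier; hypothetical objects below):
dist_scaled <- dist(rfm_scaled)
sil_scaled <- silhouette(cutree(hclust(dist_scaled, method = "ward.D2"), k = 4), dist_scaled)
mean_sil_scaled <- mean(sil_scaled[, 3])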
# Plot final dendrogram using scaled data + ward
rfm_scaled <- scale(rfm_data[, c('Recency', 'Frequency', 'Monetary')])
hc <- hclust(dist(rfm_scaled, method = "euclidean"), method="ward.D2")
dend <- as.dendrogram(hc)
dend <- color_branches(dend, k=4)
dend <- hang.dendrogram(dend, hang=-1)
labels(dend) <- rep("", nleaves(dend))  # blank out the leaf labels for readability
plot(dend, main="Dendrogram using Euclidean Distance & Ward Linkage, k=4")
clusters <- cutree(hc, k=4)
rfm_data$Cluster <- clusters
cluster_summary <- rfm_data %>%
group_by(Cluster) %>%
summarise(
Size = n(),
Average_Recency = mean(Recency),
Average_Frequency = mean(Frequency),
Average_Monetary = mean(Monetary)
) %>%
arrange(desc(Average_Monetary))
print(cluster_summary)
## # A tibble: 4 × 5
## Cluster Size Average_Recency Average_Frequency Average_Monetary
## <int> <int> <dbl> <dbl> <dbl>
## 1 4 15 5.48 83.5 111916.
## 2 1 146 11.1 24.4 14514.
## 3 2 2909 36.8 4.03 1535.
## 4 3 1268 227. 1.58 511.
All requested code has been consolidated into one script.