Setting Up The Dataset

Install and Load Libraries

This section installs and loads the required packages for data manipulation, visualization, and modelling.

# install.packages(c("readxl", "dplyr", "ggplot2", "lubridate", "caret", "randomForest", "xgboost", "corrplot", "stringr", "skimr", "tidyr"))

library(readxl)
library(dplyr)
library(ggplot2)
library(lubridate)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)
library(stringr)
library(skimr)
library(tidyr)
library(scales)
library(broom)
library(gridExtra)
library(factoextra)

Import Dataset

The dataset is imported from an Excel file and briefly inspected to understand its structure.

data <- read_excel("Advanced Topics Individual Dataset.xlsx")

knitr::kable(head(data))

Customer ID	Age	Gender	Loyalty Member	Product Type	SKU	Rating	Order Status	Payment Method	Total Price	Unit Price	Quantity	Purchase Date	Shipping Type	Add-ons Purchased	Add-on Total
1000	53	Male	No	Smartphone	SKU1004	2	Cancelled	Credit Card	5538.33	791.19	7	2024-03-20	Standard	Accessory,Accessory,Accessory	40.21
1000	53	Male	No	Tablet	SKU1002	3	Completed	Paypal	741.09	247.03	3	2024-04-20	Overnight	Impulse Item	26.09
1002	41	Male	No	Laptop	SKU1005	3	Completed	Credit Card	1855.84	463.96	4	2023-10-17	Express	NA	0.00
1002	41	Male	Yes	Smartphone	SKU1004	2	Completed	Cash	3164.76	791.19	4	2024-08-09	Overnight	Impulse Item,Impulse Item	60.16
1003	75	Male	Yes	Smartphone	SKU1001	5	Completed	Cash	41.50	20.75	2	2024-05-21	Express	Accessory	35.56
1004	41	Female	No	Smartphone	SKU1001	5	Completed	Credit Card	83.00	20.75	4	2024-05-26	Standard	Impulse Item,Accessory	65.78

Data Cleaning

Standardizing Column Names

Column names are cleaned to ensure consistency and avoid issues in later analysis.

colnames(data) <- colnames(data) %>%
  str_replace_all(" ", "_") %>%
  str_replace_all("[^[:alnum:]_]", "")

knitr::kable(colnames(data))

x
Customer_ID
Age
Gender
Loyalty_Member
Product_Type
SKU
Rating
Order_Status
Payment_Method
Total_Price
Unit_Price
Quantity
Purchase_Date
Shipping_Type
Addons_Purchased
Addon_Total

Checking Missing Values

Missing values are checked across all variables.

na_summary <- sapply(data, function(x) sum(is.na(x)))
knitr::kable(na_summary)

	x
Customer_ID	0
Age	0
Gender	1
Loyalty_Member	0
Product_Type	0
SKU	0
Rating	0
Order_Status	0
Payment_Method	0
Total_Price	0
Unit_Price	0
Quantity	0
Purchase_Date	0
Shipping_Type	0
Addons_Purchased	4868
Addon_Total	0

Handling Missing Values

Missing values in add-on related variables are handled.

data$Addons_Purchased[is.na(data$Addons_Purchased)] <- "None"
data$Addon_Total[is.na(data$Addon_Total)] <- 0

Fixing Inconsistencies

Categorical values are standardized for consistency.

data$Payment_Method <- str_to_title(data$Payment_Method)
data$Payment_Method[data$Payment_Method == "Paypal"] <- "PayPal"

Converting Data Types

Variables are converted into appropriate formats for analysis.

data$Purchase_Date <- as.Date(data$Purchase_Date)

data$Customer_ID <- as.character(data$Customer_ID)
data$Gender <- as.factor(data$Gender)
data$Loyalty_Member <- as.factor(data$Loyalty_Member)
data$Product_Type <- as.factor(data$Product_Type)
data$Payment_Method <- as.factor(data$Payment_Method)
data$Shipping_Type <- as.factor(data$Shipping_Type)
data$Order_Status <- as.factor(data$Order_Status)

str(data)

## tibble [20,000 × 16] (S3: tbl_df/tbl/data.frame)
##  $ Customer_ID     : chr [1:20000] "1000" "1000" "1002" "1002" ...
##  $ Age             : num [1:20000] 53 53 41 41 75 41 25 25 24 24 ...
##  $ Gender          : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 1 1 1 2 2 ...
##  $ Loyalty_Member  : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 1 1 2 ...
##  $ Product_Type    : Factor w/ 5 levels "Headphones","Laptop",..: 3 5 2 3 3 3 4 2 3 2 ...
##  $ SKU             : chr [1:20000] "SKU1004" "SKU1002" "SKU1005" "SKU1004" ...
##  $ Rating          : num [1:20000] 2 3 3 2 5 5 3 3 2 3 ...
##  $ Order_Status    : Factor w/ 2 levels "Cancelled","Completed": 1 2 2 2 2 2 2 2 1 2 ...
##  $ Payment_Method  : Factor w/ 5 levels "Bank Transfer",..: 3 5 3 2 2 3 5 4 4 2 ...
##  $ Total_Price     : num [1:20000] 5538.3 741.1 1855.8 3164.8 41.5 ...
##  $ Unit_Price      : num [1:20000] 791.2 247 464 791.2 20.8 ...
##  $ Quantity        : num [1:20000] 7 3 4 4 2 4 9 9 7 9 ...
##  $ Purchase_Date   : Date[1:20000], format: "2024-03-20" "2024-04-20" ...
##  $ Shipping_Type   : Factor w/ 5 levels "Expedited","Express",..: 5 3 2 3 2 5 3 3 5 2 ...
##  $ Addons_Purchased: chr [1:20000] "Accessory,Accessory,Accessory" "Impulse Item" "None" "Impulse Item,Impulse Item" ...
##  $ Addon_Total     : num [1:20000] 40.2 26.1 0 60.2 35.6 ...

Summary of Clean Datasets

A summary of the cleaned dataset is displayed.

data_full <- data

data_clean <- data %>%
  filter(Order_Status == "Completed")

skim(data_clean)

Data summary
Name	data_clean
Number of rows	13432
Number of columns	16
_______________________
Column type frequency:
character	3
Date	1
factor	6
numeric	6
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Customer_ID	1	4	5	9466
SKU	1	6	7	10
Addons_Purchased	1	4	55	76

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
Purchase_Date	0	1	2023-09-24	2024-09-23	2024-04-24	366

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Gender	1	1	FALSE	2	Mal: 6780, Fem: 6651
Loyalty_Member	0	1	FALSE	2	No: 10512, Yes: 2920
Product_Type	0	1	FALSE	5	Sma: 4004, Tab: 2745, Lap: 2686, Sma: 2636
Order_Status	0	1	FALSE	1	Com: 13432, Can: 0
Payment_Method	0	1	FALSE	5	Cre: 3899, Pay: 3863, Ban: 2259, Cas: 1727
Shipping_Type	0	1	FALSE	5	Sta: 4561, Ove: 2247, Exp: 2227, Exp: 2210

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Age	1	48.97	18.06	18.00	33.00	49.00	65.00	80.00	▇▇▇▇▇
Rating	1	3.10	1.23	1.00	2.00	3.00	4.00	5.00	▂▃▇▂▅
Total_Price	1	3173.74	2541.48	20.75	1083.54	2534.49	4639.60	11396.80	▇▅▃▂▁
Unit_Price	1	577.88	312.83	20.75	361.18	463.96	791.19	1139.68	▂▇▂▆▂
Quantity	1	5.48	2.87	1.00	3.00	5.00	8.00	10.00	▇▇▇▇▇
Addon_Total	1	62.21	57.96	0.00	7.19	52.11	93.56	292.77	▇▅▂▁▁

Feature Engineering

Creating Transaction-Level Features

New variables are created to capture additional transaction characteristics.

# Create Transaction ID
data_clean <- data_clean %>%
  mutate(Transaction_ID = row_number())

# Extract Month from Purchase Date
data_clean$Month <- month(data_clean$Purchase_Date)

# Installment Payment (1 = Credit Card / PayPal, else 0)
data_clean$Installment_Payment <- ifelse(
  data_clean$Payment_Method %in% c("Credit Card", "PayPal"), 1, 0
)

# Handling Multi-Value Categorical Variable (Add-ons Purchased)
data_clean <- data_clean %>%
  mutate(Has_Addon = ifelse(
    is.na(Addons_Purchased) |
    Addons_Purchased == "" |
    tolower(trimws(Addons_Purchased)) == "none",
    0,
    str_count(Addons_Purchased, ",") + 1
  ))

addons_df <- data_clean %>%
  separate_rows(Addons_Purchased, sep = ",") %>%
  mutate(Addons_Purchased = str_trim(Addons_Purchased)) %>%
  filter(
    !is.na(Addons_Purchased),
    Addons_Purchased != "",
    tolower(Addons_Purchased) != "none"
  ) %>%
  group_by(Transaction_ID, Addons_Purchased) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = Addons_Purchased,
    values_from = count,
    values_fill = 0
  )

addon_cols <- setdiff(colnames(addons_df), "Transaction_ID")
data_clean <- data_clean %>%
  left_join(addons_df, by = "Transaction_ID") %>%
  mutate(across(all_of(addon_cols), ~replace_na(., 0)))

data_clean <- data_clean %>%
  select(
    Transaction_ID,
    Customer_ID,
    Age,
    Gender,
    Loyalty_Member,
    Purchase_Date,
    Month,
    Product_Type,
    SKU,
    Order_Status,
    Shipping_Type,
    Payment_Method,
    Installment_Payment,
    Quantity,
    Unit_Price,
    Total_Price,
    Addon_Total,
    Rating,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`
  )

# Check new variables
knitr::kable(head(data_clean))

Transaction_ID	Customer_ID	Age	Gender	Loyalty_Member	Purchase_Date	Month	Product_Type	SKU	Order_Status	Shipping_Type	Payment_Method	Installment_Payment	Quantity	Unit_Price	Total_Price	Addon_Total	Rating	Has_Addon	Impulse Item	Accessory
1	1000	53	Male	No	2024-04-20	4	Tablet	SKU1002	Completed	Overnight	PayPal	1	3	247.03	741.09	26.09	3	1	1	0
2	1002	41	Male	No	2023-10-17	10	Laptop	SKU1005	Completed	Express	Credit Card	1	4	463.96	1855.84	0.00	3	0	0	0
3	1002	41	Male	Yes	2024-08-09	8	Smartphone	SKU1004	Completed	Overnight	Cash	0	4	791.19	3164.76	60.16	2	2	2	0
4	1003	75	Male	Yes	2024-05-21	5	Smartphone	SKU1001	Completed	Express	Cash	0	2	20.75	41.50	35.56	5	1	0	1
5	1004	41	Female	No	2024-05-26	5	Smartphone	SKU1001	Completed	Standard	Credit Card	1	4	20.75	83.00	65.78	5	2	1	1
6	1005	25	Female	No	2024-01-30	1	Smartwatch	SKU1003	Completed	Overnight	PayPal	1	9	844.83	7603.47	0.00	3	0	0	0

Creating Customer-Level Dataset

Transaction data is aggregated to the customer level to capture purchasing behaviour.

customer_data <- data_clean %>%
  group_by(Customer_ID) %>%
  summarise(
    Age = first(Age),
    Gender = first(Gender),
    Ever_Loyal = max(ifelse(Loyalty_Member == "Yes", 1, 0)),
    Total_Transactions = n(),
    Total_Spend = sum(Total_Price, na.rm = TRUE),
    Avg_Spend = mean(Total_Price, na.rm = TRUE),
    Total_Quantity = sum(Quantity, na.rm = TRUE),
    Has_Addon = sum(Has_Addon, na.rm = TRUE),
    `Impulse Item` = sum(`Impulse Item`, na.rm = TRUE),
    Accessory = sum(Accessory, na.rm = TRUE),
    `Extended Warranty` = sum(`Extended Warranty`, na.rm = TRUE),
    
    .groups = "drop"
  )

# View dataset
knitr::kable(head(customer_data))

Customer_ID	Age	Gender	Ever_Loyal	Total_Transactions	Total_Spend	Avg_Spend	Total_Quantity	Has_Addon	Impulse Item	Accessory	Extended Warranty
1000	53	Male	0	1	741.09	741.090	3	1	1	0	0
1002	41	Male	1	2	5020.60	2510.300	8	2	2	0	0
1003	75	Male	1	1	41.50	41.500	2	1	0	1	0
1004	41	Female	0	1	83.00	83.000	4	2	1	1	0
1005	25	Female	0	2	11779.11	5889.555	18	2	0	0	2
1006	24	Male	1	2	6645.94	3322.970	19	3	2	1	0

Exploratory Data Analysis (EDA)

Variables Distribution in Table

The distribution of variables is summarised to identify patterns and variability.

eda_table <- function(df, var) {
  df %>%
    group_by({{ var }}) %>%
    summarise(
      Count = n(),
      Percentage = n() / nrow(df),
      Avg_Value = mean(Total_Price, na.rm = TRUE),
      Total_Value = sum(Total_Price, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(desc(Count))
}
knitr::kable(head(eda_table(data_clean, Age)))

Age	Count	Percentage	Avg_Value	Total_Value
61	259	0.0192823	3183.235	824458.0
40	253	0.0188356	2903.359	734549.8
28	246	0.0183145	2996.149	737052.8
67	244	0.0181656	3006.334	733545.5
50	238	0.0177189	3253.173	774255.1
37	236	0.0175700	3043.013	718151.0

knitr::kable(eda_table(data_clean, Gender))

Gender	Count	Percentage	Avg_Value	Total_Value
Male	6780	0.5047647	3152.216	21372025.22
Female	6651	0.4951608	3196.048	21256916.03
NA	1	0.0000744	674.320	674.32

knitr::kable(eda_table(data_clean, Month))

Month	Count	Percentage	Avg_Value	Total_Value
1	1399	0.1041543	3228.219	4516278
5	1371	0.1020697	3255.226	4462915
7	1356	0.1009529	3289.784	4460946
6	1349	0.1004318	3310.597	4465995
3	1323	0.0984961	3194.351	4226127
8	1323	0.0984961	3309.473	4378433
4	1284	0.0955926	3349.162	4300324
2	1227	0.0913490	3176.243	3897250
9	1125	0.0837552	3258.562	3665883
10	613	0.0456373	2537.236	1555326
11	537	0.0399792	2588.671	1390116
12	525	0.0390858	2495.282	1310023

knitr::kable(eda_table(data_clean, Product_Type))

Product_Type	Count	Percentage	Avg_Value	Total_Value
Smartphone	4004	0.2980941	3598.361	14407836
Tablet	2745	0.2043627	2813.345	7722632
Laptop	2686	0.1999702	3114.633	8365905
Smartwatch	2636	0.1962478	3565.475	9398591
Headphones	1361	0.1013252	2009.295	2734651

knitr::kable(eda_table(data_clean, SKU))

SKU	Count	Percentage	Avg_Value	Total_Value
SKU1002	1382	0.1028886	1360.6312	1880392.4
TBL345	1363	0.1014741	4286.3095	5842239.9
HDP456	1360	0.1012507	2007.7359	2730520.8
SKU1005	1352	0.1006552	2442.1142	3301738.4
SMP234	1346	0.1002085	6254.6925	8418816.2
LTP123	1334	0.0993151	3796.7148	5064817.5
SKU1001	1329	0.0989428	112.0875	148964.2
SKU1004	1329	0.0989428	4393.8335	5839404.7
SWT567	1325	0.0986450	2546.4970	3374108.5
SKU1003	1312	0.0976772	4594.9794	6028612.9

knitr::kable(eda_table(data_clean, Shipping_Type))

Shipping_Type	Count	Percentage	Avg_Value	Total_Value
Standard	4561	0.3395622	3154.360	14387038
Overnight	2247	0.1672871	2620.259	5887723
Express	2227	0.1657981	2529.005	5632093
Expedited	2210	0.1645325	3814.654	8430386
Same Day	2187	0.1628201	3791.667	8292376

knitr::kable(head(eda_table(data_clean, Quantity)))

Quantity	Count	Percentage	Avg_Value	Total_Value
2	1369	0.1019208	1150.322	1574791
3	1367	0.1017719	1719.858	2351047
8	1363	0.1014741	4631.195	6312319
9	1344	0.1000596	5236.902	7038397
4	1343	0.0999851	2308.096	3099773
6	1343	0.0999851	3517.713	4724289

knitr::kable(eda_table(data_clean, Rating))

Rating	Count	Percentage	Avg_Value	Total_Value
3	5339	0.3974836	3034.852	16203077
5	2677	0.1993002	1981.182	5303625
2	2642	0.1966945	4077.199	10771960
1	1392	0.1036331	3751.574	5222191
4	1382	0.1028886	3711.116	5128763

knitr::kable(eda_table(data_clean, Has_Addon))

Has_Addon	Count	Percentage	Avg_Value	Total_Value
2	3419	0.2545414	3110.484	10634744
3	3375	0.2512656	3185.780	10752008
1	3351	0.2494789	3201.507	10728250
0	3287	0.2447141	3198.848	10514615

Variables Distribution in Bar Charts

The distribution of variables is visualized to identify patterns and variability.

# Function to create clean bar chart
plot_bar <- function(df, var, title_name) {
  df %>%
    filter(!is.na({{ var }})) %>%
    ggplot(aes(x = as.factor({{ var }}))) +
    geom_bar(fill = "navyblue") +
    labs(
      title = title_name,
      x = NULL,
      y = NULL
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 12, face = "bold"),
      axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
      axis.text.y = element_text(size = 8)
    )
}

# Create plots
p1 <- ggplot(data_clean, aes(x = Age)) +
  geom_histogram(fill = "navyblue", bins = 20, color = "white", linedwidth = 0.2) +
  scale_x_continuous(breaks = seq(0, 100, by = 10), limits = c(15, 85)) +
  labs(title = "Age", x = NULL, y = NULL) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 8)
  )
p2 <- plot_bar(data_clean, Gender, "Gender")
p3 <- plot_bar(data_clean, Month, "Month")
p4 <- plot_bar(data_clean, Product_Type, "Product Type")
p5 <- plot_bar(data_clean, SKU, "SKU")
p6 <- plot_bar(data_clean, Shipping_Type, "Shipping Type")
p7 <- plot_bar(data_clean, Quantity, "Quantity")
p8 <- plot_bar(data_clean, Rating, "Rating")
p9 <- plot_bar(data_clean, Has_Addon, "Has Addon")

# Arrange in 3x3 grid
grid.arrange(p1, p2, p3,
             p4, p5, p6,
             p7, p8, p9,
             ncol = 3)

Product Type and SKU Allocation

Displays the relationship between product types and SKUs

sku_table <- data_clean %>%
  select(Product_Type, SKU) %>%
  distinct() %>%
  arrange(Product_Type, SKU)

knitr::kable(sku_table)

Product_Type	SKU
Headphones	HDP456
Headphones	SKU1003
Laptop	LTP123
Laptop	SKU1004
Laptop	SKU1005
Smartphone	SKU1001
Smartphone	SKU1004
Smartphone	SKU1005
Smartphone	SMP234
Smartwatch	SKU1003
Smartwatch	SWT567
Tablet	SKU1002
Tablet	TBL345

Time Series Analysis of Monthly Spending

Shows how spending changes across months, highlighting overall fluctuations over time

# Prepare monthly data
monthly_df <- data_clean %>%
  group_by(Month) %>%
  summarise(
    Total_Spend = sum(Total_Price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(Month)

# Convert Month (numeric) to Jan–Dec
monthly_df$Month <- factor(monthly_df$Month,
                           levels = 1:12,
                           labels = c("Jan","Feb","Mar","Apr","May","Jun",
                                      "Jul","Aug","Sep","Oct","Nov","Dec"))

# Plot
p <- ggplot(monthly_df, aes(x = Month, y = Total_Spend, group = 1)) +
  geom_line(color = "navyblue", linewidth = 0.9) +
  geom_point(color = "navyblue", size = 2.5) +
  
  geom_smooth(method = "lm", se = FALSE,
              color = "red",
              linetype = "dashed",
              linewidth = 0.8) +
  
  labs(
    title = "Monthly Total Spend",
    x = "Month",
    y = "Total Spend"
  ) +
  
  scale_y_continuous(labels = comma) +
  
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )

print(p)

Research Question 1: What factors drive revenue in electronics retail transactions?

Data Preparation for Modelling (Transaction-Level)

A transaction-level dataset is prepared for modelling.

model_df <- data_clean %>%
  select(
    Total_Price,
    Product_Type,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`,
    Installment_Payment,
    Payment_Method,
    Loyalty_Member,
    Age,
    Gender,
    Shipping_Type
  ) %>%
  na.omit()

model_df <- model_df %>%
  rename(
    Impulse_Item = `Impulse Item`,
    Extended_Warranty = `Extended Warranty`
  ) %>%
  mutate(
    across(c(Product_Type, Has_Addon, Installment_Payment,
             Payment_Method, Loyalty_Member, Gender,
             Shipping_Type),
           as.factor)
  )

Linear Regression

A linear regression model is fitted to identify significant predictors of total price.

model_lm <- lm(Total_Price ~ ., data = model_df)
knitr::kable(tidy(model_lm))

term	estimate	std.error	statistic	p.value
(Intercept)	2532.9483495	110.547949	22.9126671	0.0000000
Product_TypeLaptop	1741.8436603	82.088398	21.2191211	0.0000000
Product_TypeSmartphone	2407.8120918	79.228278	30.3908169	0.0000000
Product_TypeSmartwatch	2182.1868738	82.267429	26.5255264	0.0000000
Product_TypeTablet	1432.5368643	81.744659	17.5245317	0.0000000
Has_Addon1	-23.9619200	63.405396	-0.3779161	0.7054989
Has_Addon2	-80.4262948	75.909889	-1.0594969	0.2893926
Has_Addon3	-5.6159336	92.603866	-0.0606447	0.9516431
Impulse_Item	-10.1411420	41.208594	-0.2460929	0.8056141
Accessory	30.4777352	41.053373	0.7423930	0.4578623
Extended_Warranty	NA	NA	NA	NA
Installment_Payment1	-288.8079095	65.834027	-4.3869094	0.0000116
Payment_MethodCash	-760.0698786	86.799571	-8.7566087	0.0000000
Payment_MethodCredit Card	-67.9405732	54.442797	-1.2479258	0.2120800
Payment_MethodDebit Card	-582.1749068	87.284872	-6.6698259	0.0000000
Payment_MethodPayPal	NA	NA	NA	NA
Loyalty_MemberYes	-50.7459147	50.180250	-1.0112726	0.3119042
Age	0.4062262	1.145615	0.3545924	0.7229006
GenderMale	-45.1755820	41.398827	-1.0912285	0.2751919
Shipping_TypeExpress	-1509.1345681	79.742130	-18.9251850	0.0000000
Shipping_TypeOvernight	-1400.7894690	79.955822	-17.5195430	0.0000000
Shipping_TypeSame Day	-52.6481736	72.337531	-0.7278127	0.4667410
Shipping_TypeStandard	-776.9077880	64.460805	-12.0524059	0.0000000

Random Forest

A random forest model is fitted to identify important features driving total price.

set.seed(123)
model_rf <- randomForest(
  Total_Price ~ .,
  data = model_df,
  importance = TRUE
)

importance_df <- as.data.frame(importance(model_rf))
importance_df$Variable <- rownames(importance_df)

ggplot(importance_df, aes(x = reorder(Variable, `%IncMSE`), y = `%IncMSE`)) +
  geom_col(aes(fill = `%IncMSE` < 25)) +
  scale_fill_manual(
    values = c("TRUE" = "darkred", "FALSE" = "navyblue"),
    guide = "none"
  ) +
  coord_flip() +
  labs(
    title = "Feature Importance (Random Forest)",
    subtitle = "Low-importance variables highlighted in red",
    x = NULL,
    y = "% Increase in MSE"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11),
    axis.text = element_text(size = 10)
  )

Research Question 2: Can customers be segmented into meaningful groups?

Data Preparation for Modelling (Customer-Level)

A Customer-level dataset is prepared for modelling.

clustering_df <- customer_data %>%
  select(
    Age,
    Ever_Loyal,
    Total_Transactions,
    Total_Spend,
    Avg_Spend,
    Total_Quantity,
    Has_Addon,
    `Impulse Item`,
    Accessory,
    `Extended Warranty`
  )

clustering_scaled <- scale(clustering_df)

Find Optimal Clusters with Elbow Method

The elbow method is used to determine the optimal number of clusters.

fviz_nbclust(clustering_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal Clusters")

K-Means Clustering

K-means clustering is performed to segment customers into groups based on their purchasing behaviour.

set.seed(123)

kmeans_model <- kmeans(clustering_scaled, centers = 3, nstart = 25)

fviz_cluster(kmeans_model, data = clustering_scaled)

clustering_df$Cluster <- as.factor(kmeans_model$cluster)

cluster_profiling <- clustering_df %>%
  group_by(Cluster) %>%
  summarise(
    Avg_Age = mean(Age),
    Avg_Loyalty = mean(Ever_Loyal),
    Avg_Spend = mean(Total_Spend),
    Avg_Transactions = mean(Total_Transactions),
    Avg_Quantity = mean(Total_Quantity),
    Avg_Addon = mean(Has_Addon)
  )

knitr::kable(cluster_profiling)

Cluster	Avg_Age	Avg_Loyalty	Avg_Spend	Avg_Transactions	Avg_Quantity	Avg_Addon
1	48.38653	0.4834197	8218.609	2.435751	14.007772	4.435233
2	49.61802	0.2372881	7054.542	1.248863	9.212071	1.489872
3	49.26129	0.2282587	1896.182	1.115888	4.754935	1.592730

Research Question 3: How do payment methods relate to purchasing behaviour?

Data Preparation

Payment method and installment payment summaries are created.

payment_summary <- data_clean %>%
  mutate(
    Payment_Group = ifelse(
      Payment_Method %in% c("PayPal", "Credit Card"),
      Payment_Method,
      "Other"
    )
  ) %>%
  group_by(Payment_Method, Payment_Group) %>%
  summarise(
    Transaction_Count = n(),
    .groups = "drop"
  )

installment_summary <- data_clean %>%
  mutate(Installment_Label = ifelse(Installment_Payment == 1, "Yes", "No")) %>%
  group_by(Installment_Label) %>%
  summarise(
    Transaction_Count = n(),
    .groups = "drop"
  )

Transaction by Payment Method Plot

This plot compares the transaction counts for different payment methods.

ggplot(payment_summary, aes(x = Payment_Method, y = Transaction_Count)) +
  geom_col(aes(fill = Payment_Method %in% c("PayPal", "Credit Card"))) +
  scale_fill_manual(
    values = c("TRUE" = "navyblue", "FALSE" = "darkred"),
    guide = "none"
  ) +
  labs(
    title = "Transaction by Payment Method",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
    
  )

Transaction by Instalment Payment Plot

This plot compares the transaction counts for customers who used installment payment versus those who did not.

ggplot(installment_summary, aes(x = Installment_Label, y = Transaction_Count)) +
  geom_col(aes(fill = Installment_Label)) +
  scale_fill_manual(
    values = c("Yes" = "navyblue", "No" = "darkred"),
    guide = "none"
  ) +
  labs(
    title = "Transaction by Instalment Payment",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    axis.text = element_text(size = 13, angle = 0, hjust = 0.5)
  )

Payment Behavior Analysis

This table summarizes the average spend and quantity for customers based on whether they used installment payment or not

payment_behavior <- data_clean %>%
  group_by(Installment_Payment) %>%
  summarise(
    Avg_Spend = round(mean(Total_Price), 2),
    Avg_Quantity = round(mean(Quantity), 2),
    .groups = "drop"
  )

knitr::kable(payment_behavior)

Installment_Payment	Avg_Spend	Avg_Quantity
0	3066.85	5.50
1	3251.81	5.47

Customer Segmentation and Transaction Behaviour in Electronics Retail Data